In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory, then print
# one full path per line (directories without files produce no output).
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e2/sample_submission.csv
/kaggle/input/playground-series-s5e2/train.csv
/kaggle/input/playground-series-s5e2/test.csv
/kaggle/input/playground-series-s5e2/training_extra.csv


In [2]:
#Data loading
train_path = '/kaggle/input/playground-series-s5e2/train.csv'
test_path = '/kaggle/input/playground-series-s5e2/test.csv'

# Rows without a target cannot be used for supervised training. They are
# filtered with a plain reassignment (no inplace mutation), so re-running
# this cell always starts again from the file on disk.
train_full = pd.read_csv(train_path).dropna(axis=0, subset=['Price'])
test_full = pd.read_csv(test_path)

#separating feature and target
y = train_full['Price']
X = train_full.drop(columns='Price')

#separating the right columns
# Numeric columns are selected by dtype family rather than by comparing
# against specific dtype names, so other integer/float widths are kept too.
numerical_cols = X.select_dtypes(include='number').columns.tolist()

# Text columns are one-hot encoded later, so only low-cardinality ones are
# kept. The dtype test accepts both object and dedicated string dtypes and
# runs first, which avoids counting unique values on non-text columns.
max_categories = 10
categorical_cols = [col for col in X.columns
                    if pd.api.types.is_string_dtype(X[col].dtype)
                    and X[col].nunique() < max_categories]

# Train and test are restricted to the same columns, in the same order.
right_cols = numerical_cols + categorical_cols
X = X[right_cols].copy()
test = test_full[right_cols].copy()


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

#splitting training and validation data
# 80/20 hold-out split with a fixed seed; nothing further down in this cell
# consumes it.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1)

# NOTE(review): 'id' is treated here as a pure row identifier (it is the key
# written to the submission file) - confirm it carries no predictive meaning.
# It is left out of the model inputs only; the frames themselves keep the
# column, and ColumnTransformer drops columns it is not told to transform.
feature_numerical_cols = [col for col in numerical_cols if col != 'id']

#Creating the pipeline
# Numeric features: fill missing values with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')
# Categorical features: fill missing values with the most frequent level,
# then one-hot encode; categories unseen during fit are ignored at predict.
categorical_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])
transformer = ColumnTransformer(transformers=[
    ('num', numerical_transformer, feature_numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# Preprocessing and the regressor are chained so that fit/predict apply the
# same transformations to whatever frame is passed in.
model = XGBRegressor(n_estimators=50, learning_rate=0.01)
my_pipeline = Pipeline(steps=[
    ('preprocess', transformer),
    ('model', model)
])

In [4]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Fit the whole pipeline on the full training data, then predict prices for
# the test rows.
preds = my_pipeline.fit(X, y).predict(test)

# Submission frame: the test ids alongside their predicted Price, written
# without the index column.
output = test[['id']].assign(Price=preds)
output.to_csv('submission.csv', index=False)
