## Import Libraries

In [1]:
import pandas as pd
import numpy as np

# Make and Compose Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# Import SimpleImputer
from sklearn.impute import SimpleImputer

# Import Scaling and transformation Libraries
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Import Decision tree
from sklearn.tree import DecisionTreeRegressor

## Import Data

In [2]:
# Read the dataset used for both training and validation into a DataFrame.
train = pd.read_csv(filepath_or_buffer='input/dm.csv')

In [4]:
# Preview the first five rows to check the columns and value types.
train.head(n=5)

Unnamed: 0,Age,Gender,OwnHome,Married,Location,Salary,Children,History,Catalogs,AmountSpent,Cust_Id
0,Old,Female,Own,Single,Far,47500,0,High,6,755,247
1,Middle,Male,Rent,Single,Close,63600,0,High,6,1318,127
2,Young,Female,Rent,Single,Close,13500,0,Low,18,296,479
3,Middle,Male,Own,Married,Close,85600,1,High,18,2436,475
4,Middle,Female,Own,Single,Close,68400,0,High,12,1304,151


In [28]:
# Columns routed through the numeric branch of the preprocessor
# (mean imputation followed by standard scaling).
numerical_features = [
    'Salary',
    'Catalogs',
    'Children',
]

# Columns routed through the categorical branch of the preprocessor
# (most-frequent imputation followed by ordinal encoding).
categorical_features = [
    'Gender',
    'Age',
    'OwnHome',
    'Married',
    'Location',
    'History',
]

## Train Validation Split

In [29]:
# Feature frame: every column except the target and the Cust_Id column.
# NOTE(review): the 'Unnamed: 0' column visible in train.head() is still part of X.
X = train.drop(columns=['Cust_Id', 'AmountSpent'])

# Regression target.
y = train['AmountSpent']

In [30]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

## Build Pipeline

In [31]:
# Numeric branch: fill missing values with the column mean, then standardise.
numeric_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
)

# Categorical branch: fill missing values with the most frequent category,
# then map each category to an integer code.
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OrdinalEncoder(categories='auto'),
)

# Apply each branch to its own list of columns.
preprocessor = make_column_transformer(
    (numeric_pipeline, numerical_features),
    (categorical_pipeline, categorical_features),
)

In [32]:
# Chain the preprocessing and the regressor so both are fitted together.
# make_pipeline names the final step 'decisiontreeregressor', which is the
# prefix used when addressing its hyper-parameters.
# A fixed random_state makes the fitted tree (and therefore the grid-search
# result) repeatable: without it, ties between equally good splits are
# broken randomly on every run.
dt = make_pipeline(preprocessor, DecisionTreeRegressor(random_state=42))

## Build Model

In [33]:
from sklearn.model_selection import GridSearchCV

In [34]:
# Hyper-parameter grid for the 'decisiontreeregressor' pipeline step
# (4 criteria x 5 depths x 4 split sizes = 80 candidates).
# 'squared_error' and 'absolute_error' are the current names of the criteria
# formerly spelled 'mse' and 'mae'; the old spellings were deprecated in
# scikit-learn 1.0 and removed in 1.2, where they make the fit fail.
parameters = {
    'decisiontreeregressor__criterion': (
        'squared_error',
        'friedman_mse',
        'absolute_error',
        'poisson',
    ),
    'decisiontreeregressor__max_depth': [5, 10, 15, 20, 30],
    'decisiontreeregressor__min_samples_split': [2, 4, 6, 8],
}

In [35]:
# Exhaustive search over `parameters` for the `dt` pipeline, running on all
# available cores (n_jobs=-1) with per-fit progress messages (verbose=2).
model = GridSearchCV(
    estimator=dt,
    param_grid=parameters,
    n_jobs=-1,
    verbose=2,
)

In [36]:
# Run the grid search on the training split only; the validation split stays unseen.
model.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


GridSearchCV(estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('pipeline-1',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer()),
                                                                                         ('standardscaler',
                                                                                          StandardScaler())]),
                                                                         ['Salary',
                                                                          'Catalogs',
                                                                          'Children']),
                                                                        ('pipeline-2',
                                                                         

## Check Accuracy

In [38]:
from sklearn.metrics import mean_squared_error

In [41]:
# Predictions of the fitted grid search for the training and validation splits,
# computed in that order.
y_train_pred, y_val_pred = (
    model.predict(features) for features in (X_train, X_val)
)

In [42]:
# Mean squared error on the training split.
train_mse = mean_squared_error(y_train, y_train_pred)
print(f"Train MSE : {train_mse}")

# Mean squared error on the held-out validation split.
val_mse = mean_squared_error(y_val, y_val_pred)
print(f"Test MSE : {val_mse}")

Train MSE : 225290.5138162758
Test MSE : 159121.00462986075
