# Build Model

In [1]:
import numpy as np
import pandas as pd

In [2]:
#Load dataset
df = pd.read_csv("train.csv")

In [3]:
prediction = ["Lemon?"]
predictor = list(df.drop(prediction, axis=1))

# Feature Extraction

In [4]:
#Remove uncessary Features
predictor.remove("WheelTypeID")
predictor.remove("RefId")
predictor.remove("Model")
predictor.remove("Trim")
predictor.remove("SubModel")
predictor.remove("Color")
predictor.remove("PRIMEUNIT")
#predictor.remove("AUCGUART")
predictor.remove("VNZIP1")
predictor.remove("Date Purchased")
predictor.remove("BYRNO")
predictor

['Auction',
 'Vehicle Year',
 'Vehicle Age',
 'Make',
 'Transmission',
 'WheelType',
 'VehOdo',
 'Nationality',
 'Size',
 'TopThreeAmericanName',
 'MMRAcquisitionAuctionAveragePrice',
 'MMRAcquisitionAuctionCleanPrice',
 'MMRAcquisitionRetailAveragePrice',
 'MMRAcquisitonRetailCleanPrice',
 'MMRCurrentAuctionAveragePrice',
 'MMRCurrentAuctionCleanPrice',
 'MMRCurrentRetailAveragePrice',
 'MMRCurrentRetailCleanPrice',
 'AUCGUART',
 'VNST',
 'VehBCost',
 'IsOnlineSale',
 'WarrantyCost']

In [5]:
#Convert columns to boolean
#df['Lemon?'] = df['Lemon?'].astype('bool')
#df['IsOnlineSale'] = df['IsOnlineSale'].astype('bool')


In [6]:
#fill in missing value


#Fill in missing string values
X = df[predictor]
values = {'Transmission': 'AUTO', 'WheelType': 'Alloy', 'Nationality': 'AMERICAN', 'Size': 'MEDIUM', 'TopThreeAmericanName':'GM','AUCGUART':'None'}
X = X.fillna(value=values)

#Transmission seems to have 'Manual and 'MANUAL', replace 'Manual' with all caps
X = X.replace('Manual', 'MANUAL')
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43789 entries, 0 to 43788
Data columns (total 23 columns):
Auction                              43789 non-null object
Vehicle Year                         43789 non-null int64
Vehicle Age                          43789 non-null int64
Make                                 43789 non-null object
Transmission                         43789 non-null object
WheelType                            43789 non-null object
VehOdo                               43789 non-null int64
Nationality                          43789 non-null object
Size                                 43789 non-null object
TopThreeAmericanName                 43789 non-null object
MMRAcquisitionAuctionAveragePrice    43775 non-null float64
MMRAcquisitionAuctionCleanPrice      43775 non-null float64
MMRAcquisitionRetailAveragePrice     43775 non-null float64
MMRAcquisitonRetailCleanPrice        43775 non-null float64
MMRCurrentAuctionAveragePrice        43598 non-null float64
MMRC

In [7]:
#fill missing value with mean
X = X.fillna(X.mean())
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43789 entries, 0 to 43788
Data columns (total 23 columns):
Auction                              43789 non-null object
Vehicle Year                         43789 non-null int64
Vehicle Age                          43789 non-null int64
Make                                 43789 non-null object
Transmission                         43789 non-null object
WheelType                            43789 non-null object
VehOdo                               43789 non-null int64
Nationality                          43789 non-null object
Size                                 43789 non-null object
TopThreeAmericanName                 43789 non-null object
MMRAcquisitionAuctionAveragePrice    43789 non-null float64
MMRAcquisitionAuctionCleanPrice      43789 non-null float64
MMRAcquisitionRetailAveragePrice     43789 non-null float64
MMRAcquisitonRetailCleanPrice        43789 non-null float64
MMRCurrentAuctionAveragePrice        43789 non-null float64
MMRC

In [8]:
#One hot encode
string_variables =['Auction', 'Make', 'Transmission', 'WheelType', 'Nationality', 'Size', 'TopThreeAmericanName','VNST','AUCGUART']
X = pd.get_dummies(X, prefix=string_variables)

In [9]:
#Split validation set and training set
from sklearn.model_selection import train_test_split

y = np.ravel(df[prediction])

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=123,
                                                    stratify=y)

# Training

In [10]:
#Data processing pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error, r2_score

rfc_pipeline = make_pipeline(RobustScaler(), RandomForestClassifier())
#svc_pipeline = make_pipeline(RobustScaler(), SVC())
gbc_pipeline = make_pipeline(RobustScaler(), GradientBoostingClassifier())
adb_pipeline = make_pipeline(RobustScaler(), AdaBoostClassifier())
xgb_pipeline = make_pipeline(RobustScaler(), XGBClassifier())



In [11]:
#Train
rfc_pipeline.fit(X_train, y_train)
#svc_pipeline.fit(X_train, y_train)
gbc_pipeline.fit(X_train, y_train)
adb_pipeline.fit(X_train, y_train)
xgb_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('xgbclassifier', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])

In [12]:
#Predict
pred = rfc_pipeline.predict(X_test)
print("RFC:")
print("MSE: " + str(mean_squared_error(y_test,pred)))
print("R2: " + str(r2_score(y_test,pred)))

#pred2 = svc_pipeline.predict(X_test)
#print("SVC:")
#print("MSE: " + str(mean_squared_error(y_test,pred2)))
#print("R2:" + str(r2_score(y_test,pred2)))

pred3 = gbc_pipeline.predict(X_test)
print("GBC:")
print("MSE: " + str(mean_squared_error(y_test,pred3)))
print("R2: " + str(r2_score(y_test,pred3)))

pred4 = adb_pipeline.predict(X_test)
print("ADB:")
print("MSE: " + str(mean_squared_error(y_test,pred4)))
print("R2: " + str(r2_score(y_test,pred4)))

pred5 = xgb_pipeline.predict(X_test)
print("XGB:")
print("MSE: " + str(mean_squared_error(y_test,pred5)))
print("R2: " + str(r2_score(y_test,pred5)))

RFC:
MSE: 0.127654715689
R2: -0.183622673705
GBC:
MSE: 0.122288193652
R2: -0.133863938764
ADB:
MSE: 0.122288193652
R2: -0.133863938764
XGB:
MSE: 0.122516556291
R2: -0.135981331741


# Predict 

In [13]:
predict = pd.read_csv("predictthis.csv")

In [14]:
#Handle Missing values in the dataframe for predictthis
X_pred = predict[predictor]
X_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29194 entries, 0 to 29193
Data columns (total 23 columns):
Auction                              29194 non-null object
Vehicle Year                         29194 non-null int64
Vehicle Age                          29194 non-null int64
Make                                 29194 non-null object
Transmission                         29191 non-null object
WheelType                            27905 non-null object
VehOdo                               29194 non-null int64
Nationality                          29194 non-null object
Size                                 29194 non-null object
TopThreeAmericanName                 29194 non-null object
MMRAcquisitionAuctionAveragePrice    29190 non-null float64
MMRAcquisitionAuctionCleanPrice      29190 non-null float64
MMRAcquisitionRetailAveragePrice     29190 non-null float64
MMRAcquisitonRetailCleanPrice        29190 non-null float64
MMRCurrentAuctionAveragePrice        29070 non-null float64
MMRC

In [15]:
values = {'Transmission': 'AUTO', 'WheelType': 'Alloy', 'Nationality': 'AMERICAN', 'Size': 'MEDIUM', 'TopThreeAmericanName':'GM', 'AUCGUART': 'None'}
X_pred = X_pred.fillna(value=values)
X_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29194 entries, 0 to 29193
Data columns (total 23 columns):
Auction                              29194 non-null object
Vehicle Year                         29194 non-null int64
Vehicle Age                          29194 non-null int64
Make                                 29194 non-null object
Transmission                         29194 non-null object
WheelType                            29194 non-null object
VehOdo                               29194 non-null int64
Nationality                          29194 non-null object
Size                                 29194 non-null object
TopThreeAmericanName                 29194 non-null object
MMRAcquisitionAuctionAveragePrice    29190 non-null float64
MMRAcquisitionAuctionCleanPrice      29190 non-null float64
MMRAcquisitionRetailAveragePrice     29190 non-null float64
MMRAcquisitonRetailCleanPrice        29190 non-null float64
MMRCurrentAuctionAveragePrice        29070 non-null float64
MMRC

In [16]:
X_pred = X_pred.fillna(X.mean())
X_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29194 entries, 0 to 29193
Data columns (total 23 columns):
Auction                              29194 non-null object
Vehicle Year                         29194 non-null int64
Vehicle Age                          29194 non-null int64
Make                                 29194 non-null object
Transmission                         29194 non-null object
WheelType                            29194 non-null object
VehOdo                               29194 non-null int64
Nationality                          29194 non-null object
Size                                 29194 non-null object
TopThreeAmericanName                 29194 non-null object
MMRAcquisitionAuctionAveragePrice    29194 non-null float64
MMRAcquisitionAuctionCleanPrice      29194 non-null float64
MMRAcquisitionRetailAveragePrice     29194 non-null float64
MMRAcquisitonRetailCleanPrice        29194 non-null float64
MMRCurrentAuctionAveragePrice        29194 non-null float64
MMRC

In [17]:
string_variables =['Auction', 'Make', 'Transmission', 'WheelType', 'Nationality', 'Size', 'TopThreeAmericanName','VNST', 'AUCGUART']
X_pred = pd.get_dummies(X_pred, prefix=string_variables)
X_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29194 entries, 0 to 29193
Columns: 114 entries, Vehicle Year to AUCGUART_WV
dtypes: float64(9), int64(5), uint8(100)
memory usage: 5.9 MB


In [18]:
#Predict!
final_pred = xgb_pipeline.predict(X_pred)

In [19]:
final_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [20]:
#Submission CSV
my_submission = pd.DataFrame({'RefId': predict.RefId , 'Lemon?': final_pred})
my_submission.to_csv('submission_final.csv', index=False)