# Build Model

In [1]:
import numpy as np
import pandas as pd

In [2]:
#Read the training data from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer="train.csv")

In [3]:
#Target column, and every remaining column of df as a candidate feature
prediction = ["Lemon?"]
predictor = df.drop(columns=prediction).columns.tolist()

# Feature Extraction

In [4]:
#Drop unnecessary features from the candidate list
#(AUCGUART is not removed, so it stays in the feature list)
dropped_features = (
    "WheelTypeID",
    "RefId",
    "Model",
    "Trim",
    "SubModel",
    "Color",
    "PRIMEUNIT",
    "VNZIP1",
    "Date Purchased",
    "BYRNO",
)
for feature_name in dropped_features:
    #list.remove raises ValueError if a name is not present
    predictor.remove(feature_name)
predictor

['Auction',
 'Vehicle Year',
 'Vehicle Age',
 'Make',
 'Transmission',
 'WheelType',
 'VehOdo',
 'Nationality',
 'Size',
 'TopThreeAmericanName',
 'MMRAcquisitionAuctionAveragePrice',
 'MMRAcquisitionAuctionCleanPrice',
 'MMRAcquisitionRetailAveragePrice',
 'MMRAcquisitonRetailCleanPrice',
 'MMRCurrentAuctionAveragePrice',
 'MMRCurrentAuctionCleanPrice',
 'MMRCurrentRetailAveragePrice',
 'MMRCurrentRetailCleanPrice',
 'AUCGUART',
 'VNST',
 'VehBCost',
 'IsOnlineSale',
 'WarrantyCost']

In [5]:
#Convert columns to boolean
#df['Lemon?'] = df['Lemon?'].astype('bool')
#df['IsOnlineSale'] = df['IsOnlineSale'].astype('bool')


In [None]:
#Impute missing values in the string columns

#Per-column default used wherever a string feature is missing
values = {
    'Transmission': 'AUTO',
    'WheelType': 'Alloy',
    'Nationality': 'AMERICAN',
    'Size': 'MEDIUM',
    'TopThreeAmericanName': 'GM',
    'AUCGUART': 'None',
}

X = (
    df[predictor]
    .fillna(value=values)
    #Transmission contains both 'Manual' and 'MANUAL'; keep only the all-caps spelling
    .replace('Manual', 'MANUAL')
)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43789 entries, 0 to 43788
Data columns (total 23 columns):
Auction                              43789 non-null object
Vehicle Year                         43789 non-null int64
Vehicle Age                          43789 non-null int64
Make                                 43789 non-null object
Transmission                         43789 non-null object
WheelType                            43789 non-null object
VehOdo                               43789 non-null int64
Nationality                          43789 non-null object
Size                                 43789 non-null object
TopThreeAmericanName                 43789 non-null object
MMRAcquisitionAuctionAveragePrice    43775 non-null float64
MMRAcquisitionAuctionCleanPrice      43775 non-null float64
MMRAcquisitionRetailAveragePrice     43775 non-null float64
MMRAcquisitonRetailCleanPrice        43775 non-null float64
MMRCurrentAuctionAveragePrice        43598 non-null float64
MMRC

In [None]:
#Fill the remaining missing values with each column's mean.
#X still contains string columns at this point, so the mean is restricted to
#numeric columns; without numeric_only=True pandas >= 2.0 raises a TypeError.
X = X.fillna(X.mean(numeric_only=True))
X.info()

In [None]:
#One hot encode the string features.
#AUCGUART is a string column too (its gaps were filled with 'None'), so it must be
#listed here; otherwise it is either encoded without a matching prefix (length
#mismatch error) or left as text that the scaler cannot handle.
string_variables = ['Auction', 'Make', 'Transmission', 'WheelType', 'Nationality', 'Size', 'TopThreeAmericanName', 'AUCGUART', 'VNST']
#Passing columns= encodes exactly these columns; the default prefix is the column name
X = pd.get_dummies(X, columns=string_variables)

In [None]:
#Hold out 20% of the rows as a validation set, stratified on the target
from sklearn.model_selection import train_test_split

#Flatten the single-column target frame into a 1-D array
y = df[prediction].to_numpy().ravel()

split_options = dict(test_size=0.2, random_state=123, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Training

In [None]:
#Data processing pipeline: robust scaling followed by a classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error, r2_score

#All four classifiers are stochastic; seed them so that a fresh
#"Restart & Run All" reproduces the same models and scores.
MODEL_SEED = 123

rfc_pipeline = make_pipeline(RobustScaler(), RandomForestClassifier(random_state=MODEL_SEED))
#svc_pipeline = make_pipeline(RobustScaler(), SVC())
gbc_pipeline = make_pipeline(RobustScaler(), GradientBoostingClassifier(random_state=MODEL_SEED))
adb_pipeline = make_pipeline(RobustScaler(), AdaBoostClassifier(random_state=MODEL_SEED))
xgb_pipeline = make_pipeline(RobustScaler(), XGBClassifier(random_state=MODEL_SEED))

In [None]:
#Fit every pipeline on the training split (the SVC pipeline stays disabled)
for candidate_pipeline in (rfc_pipeline, gbc_pipeline, adb_pipeline):
    candidate_pipeline.fit(X_train, y_train)
#Kept as the last expression so the cell still displays the fitted XGBoost pipeline
xgb_pipeline.fit(X_train, y_train)

In [None]:
#Predict on the validation split and report scores for each fitted pipeline
from sklearn.metrics import accuracy_score, f1_score


def _report_scores(label, y_true, y_hat):
    """Print validation scores for one model.

    Parameters
    ----------
    label : str
        Short model name printed as the section header.
    y_true : array-like
        Ground-truth labels of the validation split.
    y_hat : array-like
        Labels predicted by the model for the same rows.

    MSE and R2 are kept from the original report; accuracy and F1 are added
    because the models are classifiers and regression metrics alone do not
    describe classification quality.
    """
    print(label + ":")
    print("MSE: " + str(mean_squared_error(y_true, y_hat)))
    print("R2: " + str(r2_score(y_true, y_hat)))
    print("Accuracy: " + str(accuracy_score(y_true, y_hat)))
    #NOTE(review): f1_score defaults to pos_label=1; assumes 'Lemon?' is coded 0/1 - confirm
    print("F1: " + str(f1_score(y_true, y_hat)))


pred = rfc_pipeline.predict(X_test)
_report_scores("RFC", y_test, pred)

#SVC evaluation is disabled together with its pipeline
#pred2 = svc_pipeline.predict(X_test)
#_report_scores("SVC", y_test, pred2)

pred3 = gbc_pipeline.predict(X_test)
_report_scores("GBC", y_test, pred3)

pred4 = adb_pipeline.predict(X_test)
_report_scores("ADB", y_test, pred4)

pred5 = xgb_pipeline.predict(X_test)
_report_scores("XGB", y_test, pred5)

# Predict 

In [None]:
#Read the rows that need a prediction
predict = pd.read_csv(filepath_or_buffer="predictthis.csv")

In [None]:
#Take the model's feature columns from the prediction data and inspect missing values
X_pred = predict.loc[:, predictor]
X_pred.info()

In [None]:
#Fill missing string values with the same defaults that were applied to the
#training features (including AUCGUART -> 'None'), so both frames encode alike
values = {'Transmission': 'AUTO', 'WheelType': 'Alloy', 'Nationality': 'AMERICAN', 'Size': 'MEDIUM', 'TopThreeAmericanName':'GM', 'AUCGUART': 'None'}
X_pred = X_pred.fillna(value=values)

#Apply the same 'Manual' -> 'MANUAL' normalisation used on the training features
X_pred = X_pred.replace('Manual', 'MANUAL')
X_pred.info()

In [None]:
#Fill the remaining gaps with the column means of the training features X
#(not of the prediction data). numeric_only=True keeps this from failing
#should any non-numeric column still be present in X.
X_pred = X_pred.fillna(X.mean(numeric_only=True))
X_pred.info()

In [None]:
#One hot encode the same string features as for training (AUCGUART included)
string_variables = ['Auction', 'Make', 'Transmission', 'WheelType', 'Nationality', 'Size', 'TopThreeAmericanName', 'AUCGUART', 'VNST']
X_pred = pd.get_dummies(X_pred, columns=string_variables)

#Align with the training feature matrix X: categories absent from this file become
#all-zero columns, categories never seen in training are dropped, and the column
#order matches what the fitted pipelines expect.
X_pred = X_pred.reindex(columns=X.columns, fill_value=0)
X_pred.info()

In [None]:
#Score the prepared rows with the fitted XGBoost pipeline
final_pred = xgb_pipeline.predict(X=X_pred)

In [None]:
#Display the predictions returned by the XGBoost pipeline
final_pred

In [None]:
#Write the submission CSV: the RefId of each row next to its predicted label
submission_columns = {'RefId': predict['RefId'], 'Lemon?': final_pred}
my_submission = pd.DataFrame(submission_columns)
my_submission.to_csv('submission_final.csv', index=False)