In [None]:
# libraries
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# Load the case-study CSV and discard, in a single pass, the five columns
# that are not used as model inputs.
df = pd.read_csv('CaseStudyData.csv').drop(
    ['PRIMEUNIT', 'AUCGUART', 'PurchaseDate', 'WheelTypeID', 'PurchaseID'],
    axis=1
)

#Data Pre-processing
# Every raw column is converted to a numeric dtype.  Missing entries (NaN, the
# '?' placeholder, or a label that is absent from a mapping) are replaced by a
# per-column default.  Results are always assigned back to `df[...]`; calling
# `fillna(..., inplace=True)` on a selected column is chained assignment and
# has no effect on `df` when pandas Copy-on-Write is active.


def _encode_category(series, mapping, default, dtype=int):
    """Translate labels through `mapping`; unmapped or missing values become `default`."""
    return series.map(mapping).fillna(default).astype(dtype)


def _as_float(series, missing_tokens=('?',)):
    """Return `series` as float64 with every placeholder token turned into NaN."""
    return series.replace(list(missing_tokens), np.nan).astype(float)


Auction_map = {'ADESA': 0, 'MANHEIM': 1, 'OTHER': 2}
df['Auction'] = _encode_category(df['Auction'], Auction_map, 0)

# MMR price columns: cast to float *before* taking the median so the median is
# computed on numbers rather than on an object column of numeric strings.
price_columns = [
    'MMRAcquisitionAuctionCleanPrice',
    'MMRAcquisitionRetailAveragePrice',
    'MMRAcquisitonRetailCleanPrice',
    'MMRCurrentAuctionCleanPrice',
    'MMRCurrentRetailAveragePrice',
    'MMRCurrentRetailCleanPrice',
    'MMRAcquisitionAuctionAveragePrice',
    'MMRCurrentAuctionAveragePrice',
]
for price_column in price_columns:
    prices = _as_float(df[price_column])
    df[price_column] = prices.fillna(prices.median(skipna=True))

# The ratio column additionally carries the spreadsheet error marker '#VALUE!'.
retail_ratio = _as_float(df['MMRCurrentRetailRatio'], missing_tokens=('?', '#VALUE!'))
df['MMRCurrentRetailRatio'] = retail_ratio.fillna(retail_ratio.median(skipna=True))

WheelType_map = {'Alloy': 0, 'Covers': 1, 'Special': 2, '?': 0}
df['WheelType'] = _encode_category(df['WheelType'], WheelType_map, 0, dtype=float)

# Each make needs its own code: SUZUKI and TOYOTA previously both mapped to 27,
# which made the two makes indistinguishable to the model.
Make_map = {
    'ACURA': 0, 'BUICK': 1, 'CADILLAC': 3, 'CHEVROLET': 4, 'CHRYSLER': 5,
    'DODGE': 6, 'FORD': 7, 'GMC': 8, 'HONDA': 9, 'HYUNDAI': 10,
    'INFINITI': 11, 'ISUZU': 12, 'JEEP': 13, 'KIA': 14, 'LEXUS': 15,
    'LINCOLN': 16, 'MAZDA': 17, 'MERCURY': 18, 'MINI': 19, 'MITSUBISHI': 20,
    'NISSAN': 21, 'OLDSMOBILE': 22, 'PONTIAC': 23, 'SATURN': 24, 'SCION': 25,
    'SUBARU': 26, 'SUZUKI': 27, 'TOYOTA': 28, 'VOLKSWAGEN': 29, 'VOLVO': 30,
    '?': 4,
}
df['Make'] = _encode_category(df['Make'], Make_map, 4, dtype=float)

Color_map = {
    'BEIGE': 0, 'BLACK': 1, 'BLUE': 2, 'BROWN': 3, 'GOLD': 4, 'GREEN': 5,
    'GREY': 6, 'MAROON': 7, 'NOT AVAIL': 8, 'ORANGE': 9, 'OTHER': 10,
    'PURPLE': 11, 'RED': 12, 'SILVER': 13, 'WHITE': 14, 'YELLOW': 15,
    '?': 13,
}
df['Color'] = _encode_category(df['Color'], Color_map, 13)

Transmission_map = {'AUTO': 0, 'MANUAL': 1, '?': 0, 'Manual': 1}
df['Transmission'] = _encode_category(df['Transmission'], Transmission_map, 0)

# VehOdo: mean-impute, then truncate to whole numbers.
df['VehOdo'] = df['VehOdo'].fillna(df['VehOdo'].mean()).astype(int)

Nationality_map = {'AMERICAN': 0, 'OTHER': 1, 'OTHER ASIAN': 2, 'TOP LINE ASIAN': 3, 'USA': 4, '?': 0}
df['Nationality'] = _encode_category(df['Nationality'], Nationality_map, 0)

Size_map = {
    'COMPACT': 0, 'CROSSOVER': 1, 'LARGE': 2, 'LARGE SUV': 3, 'LARGE TRUCK': 4,
    'MEDIUM': 5, 'MEDIUM SUV': 6, 'SMALL SUV': 7, 'SMALL TRUCK': 8,
    'SPECIALTY': 9, 'SPORTS': 10, 'VAN': 11, '?': 5,
}
df['Size'] = _encode_category(df['Size'], Size_map, 5)

df['VehYear'] = df['VehYear'].fillna(2006.0).astype(int)

american_name_map = {'CHRYSLER': 0, 'FORD': 1, 'GM': 2, 'OTHER': 3, '?': 2}
df['TopThreeAmericanName'] = _encode_category(df['TopThreeAmericanName'], american_name_map, 2)

vnst_map = {
    'TX': 0, 'FL': 1, 'CO': 2, 'NC': 3, 'AZ': 4, 'CA': 5, 'OK': 6, 'SC': 7,
    'TN': 8, 'GA': 9, 'VA': 10, 'MO': 11, 'PA': 12, 'NV': 13, 'IN': 14,
    'MS': 15, 'LA': 16, 'NJ': 17, 'NM': 18, 'KY': 19, 'AL': 20, 'IL': 21,
    'UT': 22, 'WV': 23, 'WA': 24, 'OR': 25, 'NH': 26, 'NE': 27, 'OH': 28,
    'ID': 29, 'NY': 30, '?': 0,
}
df['VNST'] = _encode_category(df['VNST'], vnst_map, 0)

# Go through float first so values such as '7100.0' do not break the int cast.
df['VehBCost'] = _as_float(df['VehBCost']).fillna(0).astype(int)

# IsOnlineSale: 1, -1, 4 and 2 all count as "online" (1.0); everything else,
# including '?' and unparseable entries, counts as 0.0.  Parsing numerically
# keeps rows that pandas read as numbers (rather than as the strings '0'/'1')
# from being silently forced to 0, which a string-keyed mapping would do.
online_sale = pd.to_numeric(df['IsOnlineSale'], errors='coerce')
df['IsOnlineSale'] = online_sale.isin([1, -1, 4, 2]).astype(float)

ForSale_map = {'Yes': 0, 'YES': 0, 'yes': 0, 'No': 1, '?': 0}
df['ForSale'] = _encode_category(df['ForSale'], ForSale_map, 0, dtype=float)

# WarrantyCost: mean-impute, then truncate to whole numbers.
df['WarrantyCost'] = df['WarrantyCost'].fillna(df['WarrantyCost'].mean()).astype(int)


# random state
# One seed shared by every split, resampler and estimator in this script.
rs = 12

# train test split
y = df['IsBadBuy']
X = df.drop(['IsBadBuy'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, random_state=rs)

# Oversample the minority class of the training partition to a 1:1 ratio.
# `sampling_strategy` / `fit_resample` are the current imbalanced-learn names
# for what used to be `ratio` / `fit_sample`.
sm = SMOTE(random_state=rs, sampling_strategy=1.0)
x_res, y_res = sm.fit_resample(X_train, y_train)

# NOTE(review): the validation part is carved out of the *resampled* data, so
# it contains synthetic SMOTE samples and a balanced class ratio, while
# X_test / y_test are never evaluated.  Scores reported on x_val_res are
# therefore likely optimistic -- consider scoring on the untouched X_test.
x_train_res, x_val_res, y_train_res, y_val_res = train_test_split(
    x_res, y_res, test_size=.1, random_state=rs
)

# Fit the scaler on the training part only and re-use it for validation.
scaler = StandardScaler()
x_train_res = scaler.fit_transform(x_train_res)
x_val_res = scaler.transform(x_val_res)


# Baseline: an MLP with default hyper-parameters, fitted on the standardised
# training data and scored on both the training and the validation part.
model = MLPClassifier(random_state=rs)
model.fit(x_train_res, y_train_res)

baseline_train_acc = model.score(x_train_res, y_train_res)
print("Train accuracy:", baseline_train_acc)
baseline_val_acc = model.score(x_val_res, y_val_res)
print("Test accuracy:", baseline_val_acc)

# Per-class precision / recall / F1 on the validation part.
y_pred = model.predict(x_val_res)
baseline_report = classification_report(y_val_res, y_pred)
print(baseline_report)

# Show the full estimator configuration that was used.
print(model)

# Size of the training partition as it was before resampling.
print(X_train.shape)

# Grid 1: one hidden layer, width 5 to 85 in steps of 20.
params = dict(hidden_layer_sizes=[(width,) for width in range(5, 86, 20)])

# 10-fold cross-validated search, using every available core.
cv = GridSearchCV(
    estimator=MLPClassifier(random_state=rs),
    param_grid=params,
    cv=10,
    n_jobs=-1
)
cv.fit(x_train_res, y_train_res)

width_train_acc = cv.score(x_train_res, y_train_res)
print("Train accuracy:", width_train_acc)
width_val_acc = cv.score(x_val_res, y_val_res)
print("Test accuracy:", width_val_acc)

y_pred = cv.predict(x_val_res)
width_report = classification_report(y_val_res, y_pred)
print(width_report)

# Winning hyper-parameter combination.
print(cv.best_params_)

# Grid 2: small single hidden layers combined with four values of the
# `alpha` hyper-parameter.
params = {
    'hidden_layer_sizes': [(3,), (5,), (7,), (9,)],
    'alpha': [0.01, 0.001, 0.0001, 0.00001],
}

cv = GridSearchCV(MLPClassifier(random_state=rs), params, cv=10, n_jobs=-1)
cv.fit(x_train_res, y_train_res)

# Score the refitted best estimator on the training part, then on validation.
for label, features, target in (
    ("Train accuracy:", x_train_res, y_train_res),
    ("Test accuracy:", x_val_res, y_val_res),
):
    print(label, cv.score(features, target))

y_pred = cv.predict(x_val_res)
print(classification_report(y_val_res, y_pred))

print(cv.best_params_)

# list columns to be transformed
# NOTE(review): Make, Size, Auction, Color, WheelType and Nationality hold
# integer category codes at this point; confirm that log-scaling those codes
# is intended (only VehBCost and VehOdo are measured quantities).
columns_to_transform = ['Make', 'Size', 'Auction', 'Color','VehBCost', 'VehOdo', 'WheelType', 'Nationality']

# copy the dataframe
df_log = df.copy()

# transform the columns with log(x + 1), vectorised over all listed columns
# instead of calling a Python lambda once per cell
df_log[columns_to_transform] = np.log(df_log[columns_to_transform] + 1)

# create X, y and train test data partitions
y_log = df_log['IsBadBuy']
X_log = df_log.drop(['IsBadBuy'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X_log, y_log, test_size=.1, random_state=rs)

# Oversample the minority class to a 1:1 ratio.  `sampling_strategy` /
# `fit_resample` are the current imbalanced-learn names for what used to be
# `ratio` / `fit_sample`.
sm = SMOTE(random_state=rs, sampling_strategy=1.0)
x_res, y_res = sm.fit_resample(X_train, y_train)
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(
    x_res, y_res, test_size=.1, random_state=rs
)

# standardise them again (scaler fitted on the training part only)
scaler_log = StandardScaler()
X_train_log = scaler_log.fit_transform(X_train_log)
X_test_log = scaler_log.transform(X_test_log)

# Same hidden-layer / alpha grid as the previous search, now on log-scaled data.
params = {'hidden_layer_sizes': [(3,), (5,), (7,), (9,)], 'alpha': [0.01,0.001, 0.0001, 0.00001]}

cv = GridSearchCV(param_grid=params, estimator=MLPClassifier(random_state=rs), cv=10, n_jobs=-1)
cv.fit(X_train_log, y_train_log)

print("Train accuracy:", cv.score(X_train_log, y_train_log))
print("Test accuracy:", cv.score(X_test_log, y_test_log))

y_pred = cv.predict(X_test_log)
print(classification_report(y_test_log, y_pred))

print(cv.best_params_)

#RFE
# Recursive feature elimination with 10-fold cross-validation, ranking the
# features with a logistic regression fitted on the standardised training data.
rfe = RFECV(estimator = LogisticRegression(random_state=rs), cv=10)
rfe.fit(x_train_res, y_train_res) # run the RFECV

# comparing how many variables before and after
print("Original feature set", x_train_res.shape[1])
print("Number of features after elimination", rfe.n_features_)

# Keep only the columns RFECV selected, for both training and validation.
X_train_sel = rfe.transform(x_train_res)
X_test_sel = rfe.transform(x_val_res)

# Tune the MLP on the *selected* features, re-using the current `params` grid.
# Fitting, scoring and predicting must all use the same reduced matrices;
# mixing in the full or the log-transformed data would evaluate a model on
# features it was not trained on.
cv1 = GridSearchCV(param_grid=params, estimator=MLPClassifier(random_state=rs), cv=10, n_jobs=-1)
cv1.fit(X_train_sel, y_train_res)

print("Train accuracy:", cv1.score(X_train_sel, y_train_res))
print("Test accuracy:", cv1.score(X_test_sel, y_val_res))

y_pred = cv1.predict(X_test_sel)
print(classification_report(y_val_res, y_pred))

print(cv1.best_params_)

  interactivity=interactivity, compiler=compiler, result=result)


('Train accuracy:', 0.8651825741761062)
('Test accuracy:', 0.8430859254696643)
             precision    recall  f1-score   support

          0       0.82      0.88      0.85      3231
          1       0.87      0.81      0.84      3263

avg / total       0.84      0.84      0.84      6494

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=12, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)
(37328, 25)
('Train accuracy:', 0.8581328496629137)
('Test accuracy:', 0.834154604250077)
             precision    recall  f1-score   support

          0       0.81      0.87      0.84      3231
          1       0.87      0.79      0.83      3263

avg / total       0.84 