In [1]:
import pandas as pd
import numpy as np
import pydot
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from io import StringIO
from sklearn.tree import export_graphviz
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFECV

#reading csv file
df=pd.read_csv('CaseStudyData.csv')
df=df.drop(['PRIMEUNIT','AUCGUART'],axis=1)
df=df.drop(['PurchaseDate','WheelTypeID','PurchaseID'],axis=1)

#Data Pre-processing
Auction_map={'ADESA':0,'MANHEIM':1,'OTHER':2}
df['Auction']=df['Auction'].map(Auction_map)
df['Auction'].fillna(0,inplace=True)
df['Auction'] = df['Auction'].astype(int)

df['MMRAcquisitionAuctionCleanPrice']=df['MMRAcquisitionAuctionCleanPrice'].replace('?',np.NaN)
df['MMRAcquisitionAuctionCleanPrice'].fillna(df['MMRAcquisitionAuctionCleanPrice'].median(skipna=True),inplace=True)
df['MMRAcquisitionAuctionCleanPrice'] = df['MMRAcquisitionAuctionCleanPrice'].astype(float)

df['MMRAcquisitionRetailAveragePrice']=df['MMRAcquisitionRetailAveragePrice'].replace('?',np.NaN)
df['MMRAcquisitionRetailAveragePrice'].fillna(df['MMRAcquisitionRetailAveragePrice'].median(skipna=True),inplace=True)
df['MMRAcquisitionRetailAveragePrice'] = df['MMRAcquisitionRetailAveragePrice'].astype(float)

df['MMRAcquisitonRetailCleanPrice']=df['MMRAcquisitonRetailCleanPrice'].replace('?',np.NaN)
df['MMRAcquisitonRetailCleanPrice'].fillna(df['MMRAcquisitonRetailCleanPrice'].median(skipna=True),inplace=True)
df['MMRAcquisitonRetailCleanPrice'] = df['MMRAcquisitonRetailCleanPrice'].astype(float)

df['MMRCurrentAuctionCleanPrice']=df['MMRCurrentAuctionCleanPrice'].replace('?',np.NaN)
df['MMRCurrentAuctionCleanPrice'].fillna(df['MMRCurrentAuctionCleanPrice'].median(skipna=True),inplace=True)
df['MMRCurrentAuctionCleanPrice'] = df['MMRCurrentAuctionCleanPrice'].astype(float)

df['MMRCurrentRetailAveragePrice']=df['MMRCurrentRetailAveragePrice'].replace('?',np.NaN)
df['MMRCurrentRetailAveragePrice'].fillna(df['MMRCurrentRetailAveragePrice'].median(skipna=True),inplace=True)
df['MMRCurrentRetailAveragePrice'] = df['MMRCurrentRetailAveragePrice'].astype(float)

df['MMRCurrentRetailCleanPrice']=df['MMRCurrentRetailCleanPrice'].replace('?',np.NaN)
df['MMRCurrentRetailCleanPrice'].fillna(df['MMRCurrentRetailCleanPrice'].median(skipna=True),inplace=True)
df['MMRCurrentRetailCleanPrice'] = df['MMRCurrentRetailCleanPrice'].astype(float)

WheelType_map={'Alloy':0,'Covers':1,'Special':2,'?':0}
df['WheelType']=df['WheelType'].map(WheelType_map)
df['WheelType'].fillna(0,inplace=True)
df['WheelType'] = df['WheelType'].astype(float)

Make_map={'ACURA':0,'BUICK':1,'CADILLAC':3,'CHEVROLET':4,'CHRYSLER':5,'DODGE':6,'FORD':7,'GMC':8,'HONDA':9,'HYUNDAI':10,'INFINITII':11,'ISUZU':12,'JEEP':13,'KIA':14,'LEXUS':15,'LINCOLN':16,'MAZDA':17,'MERCURY':18,'MINI':19,'MITSUBISHI':20,'NISSAN':21,'OLDSMOBILE':22,'PONTIAC':23,'SATURN':24,'SCION':25,'SUBARU':26,'SUZUKI':27,'TOYOTA':27,'VOLKSWAGEN':28,'VOLVO':29,'?':4}
df['Make']=df['Make'].map(Make_map)
df['Make'].fillna(4,inplace=True)
df['Make'] = df['Make'].astype(float)

Color_map={'BEIGE':0,'BLACK':1,'BLUE':2,'BROWN':3,'GOLD':4,'GREEN':5,'GREY':6,'MAROON':7,'NOT AVAIL':8,'ORANGE':9,'OTHER':10,'PURPLE':11,'RED':12,'SILVER':13,'WHITE':14,'YELLOW':15,'?':13}
df['Color']=df['Color'].map(Color_map)
df['Color'].fillna(13,inplace=True)
df['Color'] = df['Color'].astype(int)

Transmission_map={'AUTO':0,'MANUAL':1,'?':0,'Manual':1}
df['Transmission']=df['Transmission'].map(Transmission_map)
df['Transmission'].fillna(0,inplace=True)
df['Transmission'] = df['Transmission'].astype(int)

vehodo_map={'?':df['VehOdo'].mean()}
df['VehOdo'].fillna(df['VehOdo'].mean(),inplace=True)
df['VehOdo'] = df['VehOdo'].astype(int)

Nationality_map={'AMERICAN':0,'OTHER':1,'OTHER ASIAN':2,'TOP LINE ASIAN':3,'USA':4,'?':0}
df['Nationality']=df['Nationality'].map(Nationality_map)
df['Nationality'].fillna(0,inplace=True)
df['Nationality'] = df['Nationality'].astype(int)

Size_map={'COMPACT':0,'CROSSOVER':1,'LARGE':2,'LARGE SUV':3,'LARGE TRUCK':4,'MEDIUM':5,'MEDIUM SUV':6,'SMALL SUV':7,'SMALL TRUCK':8,'SPECIALTY':9,'SPORTS':10,'VAN':11,'?':5}    
df['Size']=df['Size'].map(Size_map)
df['Size'].fillna(5,inplace=True)
df['Size'] = df['Size'].astype(int)

df['VehYear'].fillna(2006.0,inplace=True)
df['VehYear'] = df['VehYear'].astype(int)

american_name_map={'CHRYSLER':0,'FORD':1,'GM':2,'OTHER':3,'?':2}
df['TopThreeAmericanName']=df['TopThreeAmericanName'].map(american_name_map)
df['TopThreeAmericanName'].fillna(2,inplace=True)
df['TopThreeAmericanName'] = df['TopThreeAmericanName'].astype(int)

df['MMRAcquisitionAuctionAveragePrice']=df['MMRAcquisitionAuctionAveragePrice'].replace('?',np.NaN)
df['MMRAcquisitionAuctionAveragePrice'].fillna(df['MMRAcquisitionAuctionAveragePrice'].median(skipna=True),inplace=True)
df['MMRAcquisitionAuctionAveragePrice'] = df['MMRAcquisitionAuctionAveragePrice'].astype(float)

df['MMRCurrentAuctionAveragePrice']=df['MMRCurrentAuctionAveragePrice'].replace('?',np.NaN)
df['MMRCurrentAuctionAveragePrice'].fillna(df['MMRCurrentAuctionAveragePrice'].median(skipna=True),inplace=True)
df['MMRCurrentAuctionAveragePrice'] = df['MMRCurrentAuctionAveragePrice'].astype(float)

df['MMRCurrentRetailRatio']=df['MMRCurrentRetailRatio'].replace(['?','#VALUE!'],np.NaN)
df['MMRCurrentRetailRatio'].fillna(df['MMRCurrentRetailRatio'].median(skipna=True),inplace=True)
df['MMRCurrentRetailRatio'] = df['MMRCurrentRetailRatio'].astype(float)

vnst_map = {'TX':0, 'FL':1,'CO':2,'NC':3,'AZ':4,'CA':5,'OK':6,'SC':7,'TN':8,'GA':9,'VA':10,'MO':11,'PA':12,'NV':13,'IN':14,'MS':15,'LA':16,'NJ':17,'NM':18,'KY':19,'AL':20,'IL':21,'UT':22,'WV':23,'WA':24,'OR':25,'NH':26,'NE':27,'OH':28,'ID':29,'NY':30,'?':0}
df['VNST'] = df['VNST'].map(vnst_map)
df['VNST'].fillna(0,inplace=True)
df['VNST'] = df['VNST'].astype(int)

df['VehBCost']=df['VehBCost'].replace(['?'],np.NaN)
df['VehBCost'].fillna(0,inplace=True)
df['VehBCost'] = df['VehBCost'].astype(int)

mmr4_map={'0':0,'1':1,'-1':1,'4':1,'2':1,'?':0}
df['IsOnlineSale']=df['IsOnlineSale'].map(mmr4_map)
df['IsOnlineSale'].fillna(0,inplace=True)
df['IsOnlineSale'] = df['IsOnlineSale'].astype(float)

ForSale_map={'Yes':0,'YES':0,'yes':0,'No':1,'?':0}
df['ForSale']=df['ForSale'].map(ForSale_map)
df['ForSale'].fillna(0,inplace=True)
df['ForSale'] = df['ForSale'].astype(float)

df['WarrantyCost'].fillna(df['WarrantyCost'].mean(),inplace=True)
df['WarrantyCost'] = df['WarrantyCost'].astype(int)

# set the random seed - consistent
rs = 12

# train test split
y = df['IsBadBuy']
X = df.drop(['IsBadBuy'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .1,random_state=12)
sm = SMOTE(random_state=12, ratio = 1.0)
x_res, y_res = sm.fit_sample(X_train, y_train)
x_train_res, x_val_res, y_train_res, y_val_res = train_test_split(x_res,y_res,test_size = .2,random_state=12)

#******************Regression*******************#
# initialise a standard scaler object
# Standardisation
scaler = StandardScaler()

# visualise min, max, mean and standard dev of data before scaling
print("Before scaling\n-------------")
for i in range(5):
    col = x_train_res[:,i]
    print("Variable #{}: min {}, max {}, mean {:.2f} and std dev {:.2f}".
          format(i, min(col), max(col), np.mean(col), np.std(col)))

# learn the mean and std.dev of variables from training data
# then use the learned values to transform training data
x_train_res = scaler.fit_transform(x_train_res, y_train_res)

print("After scaling\n-------------")
for i in range(5):
    col = x_train_res[:,i]
    print("Variable #{}: min {}, max {}, mean {:.2f} and std dev {:.2f}".
          format(i, min(col), max(col), np.mean(col), np.std(col)))

# use the statistic that you learned from training to transform test data
# NEVER learn from test data, this is supposed to be a set of dataset
# that the model has never seen before
x_val_res = scaler.transform(x_val_res)


#Training logistic regression

model = LogisticRegression(random_state=rs)

# fit it to training data
model.fit(x_train_res, y_train_res)
# training and test accuracy
print("Train accuracy:", model.score(x_train_res, y_train_res))
print("Test accuracy:", model.score(x_val_res, y_val_res))

# classification report on test data
y_pred = model.predict(x_val_res)
print(classification_report(y_val_res, y_pred))

#printing coefficients
print(model.coef_)

#Coefficient with feature names
feature_names = X.columns
coef = model.coef_[0]

# limit to 20 features, you can comment the following line to print out everything
coef = coef[:20]

for i in range(len(coef)):
    print(feature_names[i], ':', coef[i])
    
# grab feature importances from the model and feature name from the original X
coef = model.coef_[0]
feature_names = X.columns

# sort them out in descending order
indices = np.argsort(np.absolute(coef))
indices = np.flip(indices, axis=0)

# limit to 20 features, you can leave this out to print out everything
indices = indices[:20]

print("Printing feature importance:")

for i in indices:
    print(feature_names[i], ':', coef[i])
    
# grid search CV
params = {'C': [pow(10, x) for x in range(-6, 4)]}

# use all cores to tune logistic regression with C parameter
cv = GridSearchCV(param_grid=params, estimator=LogisticRegression(random_state=rs), cv=10, n_jobs=-1)
cv.fit(x_train_res, y_train_res)

# test the best model
print("Train accuracy:", cv.score(x_train_res, y_train_res))
print("Test accuracy:", cv.score(x_val_res, y_val_res))

y_pred = cv.predict(x_val_res)
print(classification_report(y_val_res, y_pred))

# print parameters of the best model
print(cv.best_params_)


def plot_skewed_columns(df):
    # setting up subplots for easier visualisation
    f, axes = plt.subplots(2,4, figsize=(10,10), sharex=False)

    # gift avg plots
    sns.distplot(df['Make'].dropna(), ax=axes[0,0])
    sns.distplot(df['Size'].dropna(), ax=axes[0,1])
    sns.distplot(df['Auction'].dropna(), ax=axes[1,0])
    sns.distplot(df['Color'].dropna(), ax=axes[1,1])

    # gift cnt plots
    sns.distplot(df['VehBCost'].dropna(), ax=axes[0,2])
    sns.distplot(df['VehOdo'].dropna(), ax=axes[0,3])
    sns.distplot(df['WheelType'].dropna(), ax=axes[1,2])
    sns.distplot(df['Nationality'].dropna(), ax=axes[1,3])

    plt.show()
    
plot_skewed_columns(df)

# list columns to be transformed
columns_to_transform = ['Make', 'Size', 'Auction', 'Color','VehBCost', 'VehOdo', 'WheelType', 'Nationality']

# copy the dataframe
df_log = df.copy()

# transform the columns with np.log
for col in columns_to_transform:
    df_log[col] = df_log[col].apply(lambda x: x+1)
    df_log[col] = df_log[col].apply(np.log)

# plot them again to show the distribution
plot_skewed_columns(df_log)

y = df_log['IsBadBuy']
X = df_log.drop(['IsBadBuy'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .1,random_state=12)
sm = SMOTE(random_state=12, ratio = 1.0)
x_res, y_res = sm.fit_sample(X_train, y_train)
x_train_res, x_val_res, y_train_res, y_val_res = train_test_split(x_res,y_res,test_size = .1,random_state=12)

# standardise them again
scaler_log = StandardScaler()
X_train_log = scaler_log.fit_transform(x_train_res, y_train_res)
X_test_log = scaler_log.transform(x_val_res)

# grid search CV
params = {'C': [pow(10, x) for x in range(-6, 4)]}

cv = GridSearchCV(param_grid=params, estimator=LogisticRegression(random_state=rs), cv=10, n_jobs=-1)
cv.fit(X_train_log, y_train_res)

# test the best model
print("Train accuracy:", cv.score(X_train_log, y_train_res))
print("Test accuracy:", cv.score(X_test_log, y_val_res))

y_pred = cv.predict(X_test_log)
print(classification_report(y_val_res, y_pred))

# print parameters of the best model
print(cv.best_params_)

#Recursive Feature Elimination
rfe = RFECV(estimator = LogisticRegression(random_state=rs), cv=10)
rfe.fit(x_train_res, y_train_res) # run the RFECV

# comparing how many variables before and after
print("Original feature set", x_train_res.shape[1])
print("Number of features after elimination", rfe.n_features_)
print("Remaining features after elimination: ", rfe.ranking_)

X_train_sel = rfe.transform(x_train_res)
X_test_sel = rfe.transform(x_val_res)

# grid search CV
params = {'C': [pow(10, x) for x in range(-6, 4)]}

cv = GridSearchCV(param_grid=params, estimator=LogisticRegression(random_state=rs), cv=10, n_jobs=-1)
cv.fit(X_train_sel, y_train_res)

# test the best model
print("Train accuracy:", cv.score(X_train_sel, y_train_res))
print("Test accuracy:", cv.score(X_test_sel, y_val_res))

y_pred = cv.predict(X_test_sel)
print(classification_report(y_val_res, y_pred))



  interactivity=interactivity, compiler=compiler, result=result)


Before scaling
-------------
Variable #0: min 1231113600.0, max 1293667200.0, mean 1262364467.67 and std dev 18060331.24
Variable #1: min 0.0, max 2.0, mean 0.89 and std dev 0.64
Variable #2: min 2001.0, max 2009.0, mean 2005.05 and std dev 1.67
Variable #3: min 0.0, max 29.0, mean 9.59 and std dev 6.64
Variable #4: min 0.0, max 15.0, mean 8.02 and std dev 4.76
After scaling
-------------
Variable #0: min -1.73035960682, max 1.73323135165, mean -0.00 and std dev 1.00
Variable #1: min -1.39172402645, max 1.72151741763, mean -0.00 and std dev 1.00
Variable #2: min -2.41707947038, max 2.35939041043, mean 0.00 and std dev 1.00
Variable #3: min -1.44361224562, max 2.92111193886, mean 0.00 and std dev 1.00
Variable #4: min -1.68430364605, max 1.46524274582, mean 0.00 and std dev 1.00
('Train accuracy:', 0.6481639916498408)
('Test accuracy:', 0.6492146596858639)
             precision    recall  f1-score   support

          0       0.65      0.64      0.65      3231
          1       0.65   

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


<Figure size 1000x1000 with 8 Axes>

<Figure size 1000x1000 with 8 Axes>



('Train accuracy:', 0.6437493583381815)
('Test accuracy:', 0.6496766245765322)
             precision    recall  f1-score   support

          0       0.65      0.64      0.65      3231
          1       0.65      0.66      0.65      3263

avg / total       0.65      0.65      0.65      6494

{'C': 1}
('Original feature set', 25L)
('Number of features after elimination', 20)
('Remaining features after elimination: ', array([5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 3, 1, 1, 1,
       1, 2, 6]))
('Train accuracy:', 0.6401902741179288)
('Test accuracy:', 0.6408992916538343)
             precision    recall  f1-score   support

          0       0.64      0.62      0.63      3231
          1       0.64      0.66      0.65      3263

avg / total       0.64      0.64      0.64      6494

