# Classification Analysis on Car Seats Data

We will divide the car seat sales into two categories and then try to predict them. The categories represent whether the sales target that the car seat company is trying to reach has been met.

Author: Julia Hammerer, Vanessa Mai 
Last Changes: 10.12.2018

In [None]:
import sys
sys.path.insert(0, '../helper/')

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, KFold
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from matplotlib import pyplot as plt
from sklearn.metrics import roc_curve, auc
import itertools
from scipy import interp
from sklearn import svm

from helper import plot_confusion_matrix

In [None]:
import warnings
# Silence FutureWarning messages.
warnings.simplefilter(action='ignore', category=FutureWarning)
# NOTE: this second filter ignores every warning category, so it subsumes the
# FutureWarning filter above and also hides any other warning raised later on.
warnings.filterwarnings("ignore")


## Data Prep

In [None]:
# Lift pandas' display limits so DataFrames are shown with all rows and columns.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
# Load the car seats data set (path is relative to this notebook) and preview it.
df=pd.read_csv("../data/Carseats.csv")
df.head()

In [None]:
# Remove the unnamed first column (presumably a row index written out with the CSV - TODO confirm).
df = df.drop(columns=["Unnamed: 0"])

In [None]:
def Sales_target(row, threshold=7.49):
    """Label a row by whether its sales exceed the sales target.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a numeric ``"Sales"`` entry.
    threshold : float, optional
        Value that ``"Sales"`` has to exceed to count as above target.
        Defaults to 7.49, the cut-off used in this analysis.

    Returns
    -------
    str
        ``"No"`` if ``row["Sales"] <= threshold``, otherwise ``"Yes"``.
    """
    return "No" if row["Sales"] <= threshold else "Yes"


# Derive the binary label row by row from the Sales column.
df["AboveTarget"] = df.apply(Sales_target, axis=1)

In [None]:
df.head()

In [None]:
# AboveTarget was derived from Sales, so Sales must not remain among the predictors.
df.drop(columns=["Sales"], inplace=True)

In [None]:
# Categorical (object-dtype) predictors need a numeric encoding. The label
# column is excluded from the selection so it is not one-hot encoded.
cols_to_transform = (
    df.drop(columns=["AboveTarget"])
    .select_dtypes(include='object')
    .columns
    .values
)
# drop_first=True leaves out the first dummy level of every encoded column.
df_hot1encoded = pd.get_dummies(
    df,
    columns=cols_to_transform,
    prefix=cols_to_transform,
    drop_first=True,
)

In [None]:
df_hot1encoded.head()

In [None]:
display(df_hot1encoded.head())
print(df_hot1encoded.shape)

In [None]:
# Every column of the encoded frame except the label is a predictor.
featureCols = [
    column for column in df_hot1encoded.columns.tolist() if column != "AboveTarget"
]

# The label column.
targetCol = df_hot1encoded["AboveTarget"]

In [None]:
# Plain arrays for scikit-learn: X holds the predictor columns (in featureCols
# order), y the "Yes"/"No" labels.
X= df_hot1encoded[featureCols].values
y= df_hot1encoded["AboveTarget"].values

In [None]:
scaler= StandardScaler()
# NOTE(review): the scaled array returned here is not assigned, so X itself stays
# unscaled. Scaling for the models happens inside the pipelines below, each of
# which starts with its own StandardScaler step.
scaler.fit_transform(X)

In [None]:
# Hold out 30% of the rows as test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)


## Model Comparison
in order to compare the models, we will save the models and the CV-results in a list:


In [None]:
opt_models = []

## Logistic Regression Iteration 1

In [None]:
# Baseline model: standardize the features, then fit a logistic regression.
pipe1 = Pipeline(steps=[
    ('stdSc', StandardScaler()),
    ('clf', LogisticRegression(C=0.1, random_state=1)),
])

In [None]:
pipe1.fit(X_train, y_train)

In [None]:
y_pred=pipe1.predict(X_test)


In [None]:
# Compare accuracy on the training and the test split to spot over-/underfitting.
print('Training Accuracy: %.3f' % pipe1.score(X_train, y_train)) 
print('Test Accuracy: %.3f' % pipe1.score(X_test, y_test)) 

In [None]:
# Display names for the two labels; confusion_matrix sorts the labels
# ("No" before "Yes"), which matches the order used here.
class_names = ('Below Target', 'Above Target')

confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
plot_confusion_matrix(confusion_mat, classes=class_names, normalize=False)

In [None]:
# Overall accuracy, followed by per-class scores (average=None yields one value per class).
print ("Accuracy:       ",accuracy_score(y_test, y_pred))
for caption, per_class_metric in (
    ("Precision:      ", precision_score),
    ("Recall:         ", recall_score),
    ("F1-Score:       ", f1_score),
):
    print (caption, per_class_metric(y_test, y_pred, average=None))

Results for the majority class are better than for the other class. This is expected, since there are more samples for that class. Overall, the results are reasonably good. Let's try optimizing the model a bit.

## Logistic Regression Iteration 2
Gridsearch plus CV

In [None]:
# List the tunable parameter names of the pipeline (needed for the search grid).
# get_params must be called; the bare attribute only shows the bound method.
pipe1.get_params().keys()

In [None]:
# newton-cg, sag and lbfgs only support the l2 penalty (saga would also accept
# l1/elasticnet), so the search is restricted to l2
penalties=["l2"]
# 40 candidate values for C, log-spaced between 1e-4 and 1
c=np.logspace(-4, 0, 40)
solvers = ["newton-cg", "sag", "saga", "lbfgs"]

# the 'clf__' prefix routes each parameter to the pipeline step named 'clf'
param_grid_lr={'clf__penalty' : penalties,
            'clf__C' :c,
              'clf__solver': solvers}

In [None]:
# Randomized search over the logistic-regression grid: 8 sampled configurations,
# each scored by accuracy with 10-fold cross-validation on the training split.
# random_state fixes which configurations are sampled, so the reported best
# model is reproducible between runs.
lr_randSearch = RandomizedSearchCV(
    estimator=pipe1,
    param_distributions=param_grid_lr,
    scoring='accuracy',
    cv=10,
    n_iter=8,
    random_state=1,
)
lr_randSearch.fit(X_train, y_train)


In [None]:
# Report the winner of the randomized search (lr_randSearch is a RandomizedSearchCV,
# so the headline names that search type).
print("Best Configuration found by Randomized Search")
print(lr_randSearch.best_score_)
print(lr_randSearch.best_params_)
print(lr_randSearch.best_estimator_)

In [None]:
# Take the best pipeline found by the search, fit it on the training split and
# score it on the held-out test split.
pipe2 = lr_randSearch.best_estimator_
pipe2.fit(X_train, y_train)
y_pred = pipe2.predict(X_test)
confusion_mat = confusion_matrix(y_test, y_pred)
print("Accuracy: ", accuracy_score(y_test, y_pred))
for caption, per_class_metric in (
    ("Recall: ", recall_score),
    ("Precision: ", precision_score),
    ("F1: ", f1_score),
):
    # average=None reports one score per class
    print(caption, per_class_metric(y_test, y_pred, average=None))
print(confusion_mat)


In [None]:
# add model to summary
# Store the whole pipeline (scaler + classifier) rather than only the classifier
# step: the model comparison cross-validates on the raw X, so the scaler has to
# be part of the estimator in order to be re-fit inside every fold.
opt_models.append(("Log Reg", pipe2))


Recall is not as good as the other scores. 

In [None]:
# 10-fold cross-validated accuracy of the tuned pipeline on the complete data
# (X, y also contain the rows that were held out as X_test / y_test).
score=cross_val_score(pipe2, X, y, scoring="accuracy", cv=10)

In [None]:
# Mean and standard deviation of the cross-validation accuracies.
# (The bare `score.mean()` expression that preceded this print had no effect and was dropped.)
print("Mean accuracy: %5.2f (+/-%5.2f) "%(score.mean(), score.std()))

In [None]:

# Confusion matrix of the most recent test-set predictions (y_pred)
cnf_matrix = confusion_matrix(y_test, y_pred)
# print numpy values with two decimals
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()

Same results as before.

### Feature Importance (LR)


In [None]:
# One coefficient per feature, taken from the classifier step of the tuned
# pipeline; the classifier sees the features after the StandardScaler step.
LRImp = pd.DataFrame(
    {"LR": pipe2.named_steps["clf"].coef_[0]},
    index=featureCols,
)
# Largest positive coefficient first.
feature_imp = LRImp.sort_values(by="LR", ascending=False)

In [None]:
feature_imp.sort_values(by="LR", ascending=True).plot(kind="barh")

## Logistic Regression Iteration 3
- used subset of feature based on previous Logistic Regression

In [None]:
# Names of the features whose absolute coefficient exceeds 0.05.
# NOTE(review): sel_features is not referenced by any of the following cells -
# the models below are still fitted on all columns of X_train. Confirm whether
# the subset was meant to be applied.
sel_features = feature_imp.loc[feature_imp["LR"].abs() > 0.05].index.tolist()

In [None]:
# Same set-up as the first iteration: scaler followed by logistic regression.
pipe3 = Pipeline(steps=[
    ('stdSc', StandardScaler()),
    ('clf', LogisticRegression(C=0.1, random_state=1)),
])

In [None]:
# newton-cg, sag and lbfgs only support the l2 penalty (saga would also accept
# l1/elasticnet), so the search is restricted to l2
penalties=["l2"]
# 40 candidate values for C, log-spaced between 1e-4 and 1
c=np.logspace(-4, 0, 40)
solvers = ["newton-cg", "sag", "saga", "lbfgs"]

# the 'clf__' prefix routes each parameter to the pipeline step named 'clf'
param_grid_lr={'clf__penalty' : penalties,
            'clf__C' :c,
              'clf__solver': solvers}

In [None]:
# Randomized search for the third iteration: 8 sampled configurations, scored by
# accuracy with 10-fold cross-validation on the training split. random_state
# fixes which configurations are sampled, making the result reproducible.
lr_randSearch2 = RandomizedSearchCV(
    estimator=pipe3,
    param_distributions=param_grid_lr,
    scoring='accuracy',
    cv=10,
    n_iter=8,
    random_state=1,
)
lr_randSearch2.fit(X_train, y_train)


In [None]:
# Report the winner of the randomized search (lr_randSearch2 is a RandomizedSearchCV,
# so the headline names that search type).
print("Best Configuration found by Randomized Search")
print(lr_randSearch2.best_score_)
print(lr_randSearch2.best_params_)
print(lr_randSearch2.best_estimator_)

In [None]:
# Evaluate the best pipeline of THIS iteration's search (lr_randSearch2).
# Taking it from lr_randSearch would merely re-evaluate the iteration-2 model.
pipe3 = lr_randSearch2.best_estimator_
pipe3.fit(X_train, y_train)
y_pred = pipe3.predict(X_test)
confusion_mat = confusion_matrix(y_test, y_pred)
print("Accuracy: ", accuracy_score(y_test, y_pred))
# average=None reports one score per class
print("Recall: ", recall_score(y_test, y_pred, average=None))
print("Precision: ", precision_score(y_test, y_pred, average=None))
print("F1: ", f1_score(y_test, y_pred, average=None))


In [None]:

# Confusion matrix of the most recent test-set predictions (y_pred)
cnf_matrix = confusion_matrix(y_test, y_pred)
# print numpy values with two decimals
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()

Model did not improve much. Model can differentiate between low and high very well. For distinguishing between middle and low or middle and upper it has a little more difficulty.

## SVM

In [None]:
# Baseline SVM: standardize the features, then fit an SVC with default settings.
pipe_svm1 = Pipeline(steps=[
    ('stdSc', StandardScaler()),
    ('clf', svm.SVC()),
])

In [None]:
pipe_svm1.fit(X_train, y_train)

In [None]:
y_pred=pipe_svm1.predict(X_test)


In [None]:
# Compare accuracy on the training and the test split to spot over-/underfitting.
print('Training Accuracy: %.3f' % pipe_svm1.score(X_train, y_train)) 
print('Test Accuracy: %.3f' % pipe_svm1.score(X_test, y_test)) 

In [None]:
# Confusion matrix of the baseline SVM on the test split.
cnf_mat = confusion_matrix(y_test, y_pred)
# Pass the class names by keyword, as every other call of the helper in this
# notebook does, so the call does not depend on the helper's parameter order.
plot_confusion_matrix(cnf_mat, classes=class_names)

In [None]:
# Overall accuracy, followed by per-class scores (average=None yields one value per class).
print ("Accuracy:       ",accuracy_score(y_test, y_pred))
for caption, per_class_metric in (
    ("Precision:      ", precision_score),
    ("Recall:         ", recall_score),
    ("F1-Score:       ", f1_score),
):
    print (caption, per_class_metric(y_test, y_pred, average=None))

## SVM iteration 2
Let's try optimizing this model


In [None]:
pipe_svm1.get_params().keys()

In [None]:
# Two sub-grids: a linear kernel (varying C only) and an RBF kernel (varying C and gamma).
# The 'clf__' prefix routes each parameter to the pipeline step named 'clf'.
param_grid_svm = [
 {'clf__C': [1, 10, 100, 1000], 'clf__kernel': ['linear']},
 {'clf__C': [1, 10, 100, 1000], 'clf__gamma': [0.001, 0.0001], 'clf__kernel': ['rbf']},
]

In [None]:
# Grid search over param_grid_svm, scored by accuracy with 10-fold
# cross-validation on the training split.
svm_GridSearch = GridSearchCV(estimator=pipe_svm1, param_grid=param_grid_svm,scoring='accuracy',cv=10)
svm_GridSearch.fit(X_train, y_train)


In [None]:
print("Best Configuration found by Grid Search")
print(svm_GridSearch.best_score_)
print(svm_GridSearch.best_params_)
print(svm_GridSearch.best_estimator_)

In [None]:
# Take the best SVM pipeline found by the grid search, fit it on the training
# split and score it on the held-out test split.
pipe_svm2 = svm_GridSearch.best_estimator_
pipe_svm2.fit(X_train, y_train)
y_pred = pipe_svm2.predict(X_test)
confusion_mat = confusion_matrix(y_test, y_pred)
print("Accuracy: ", accuracy_score(y_test, y_pred))
for caption, per_class_metric in (
    ("Recall: ", recall_score),
    ("Precision: ", precision_score),
    ("F1: ", f1_score),
):
    # average=None reports one score per class
    print(caption, per_class_metric(y_test, y_pred, average=None))


In [None]:

# Confusion matrix of the most recent test-set predictions (y_pred)
cnf_matrix = confusion_matrix(y_test, y_pred)
# print numpy values with two decimals
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()

In [None]:
# add model to summary
# Store the whole pipeline (scaler + classifier) rather than only the SVC step:
# the model comparison cross-validates on the raw X, so the scaler has to be
# part of the estimator in order to be re-fit inside every fold.
opt_models.append(("SVM", pipe_svm2))


## Comparison of Models

In [None]:
opt_models

In [None]:
seed = 7
scoring = 'accuracy'

# One shared splitter, so every model is scored on the same 10 folds.
# shuffle=True is needed for random_state to have any effect; scikit-learn
# >= 0.24 raises a ValueError when random_state is set while shuffle is False.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# evaluate each model in turn
opt_results = []
opt_names = []
for name, model in opt_models:
    cv_results = cross_val_score(model, X, y, cv=kfold, scoring=scoring)
    opt_results.append(cv_results)
    opt_names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# boxplot algorithm comparison: one box per model over its fold accuracies
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(opt_results)
ax.set_xticklabels(opt_names)
plt.show()

Logistic Regression performs much better