In [2]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from pandas.api.types import CategoricalDtype
import time

In [3]:
data = pd.read_csv('car_kick.csv')

# Columns that are not used as features.
data = data.drop(['PurchDate', 'Color', 'VNZIP1'], axis=1)

# A price of 0 in any of these columns is treated as a missing value:
# only rows where all four prices are non-zero are kept.
price_columns = [
    'MMRAcquisitionAuctionAveragePrice',
    'MMRAcquisitionAuctionCleanPrice',
    'MMRAcquisitionRetailAveragePrice',
    'MMRAcquisitonRetailCleanPrice',
]
data = data[(data[price_columns] != 0).all(axis=1)]

# Renumber the surviving rows 0..n-1. Without this the original row labels are
# kept, so a label-based lookup such as .loc[0, ...] fails when row 0 was
# one of the rows removed above.
data = data.reset_index(drop=True)

# Features are every column except the last; the target is the last column.
# .copy() gives X and Y their own data so that later column assignments on X
# do not operate on a slice of `data` (SettingWithCopyWarning).
X = data.iloc[:, :-1].copy()
Y = data.iloc[:, -1].copy()

In [4]:
# Names of the feature columns.
N = np.array(X.columns)

# Work on an independent frame: X may be a slice of the raw data, and assigning
# columns on a slice raises SettingWithCopyWarning.
X = X.copy()

encoder = LabelEncoder()

# Label-encode every text column. The decision is made from the column dtype
# rather than from the single value stored at row label 0: that label may not
# exist after rows were filtered out, and one value is not representative of
# the whole column.
for column in N:
    is_text = (
        pd.api.types.is_object_dtype(X[column])
        or pd.api.types.is_string_dtype(X[column])
    )
    if is_text:
        X[column] = encoder.fit_transform(X[column])

# Min-max normalise every column to the range [0, 1].
# NOTE: min/max are taken over the full data set, i.e. before the train/test
# split performed later in the notebook, so held-out rows influence the scaling.
for column in N:
    low = X[column].min()
    span = X[column].max() - low
    if span == 0:
        # Constant column: dividing by the zero range would turn every value
        # into NaN, so map the column to 0.0 instead.
        X[column] = 0.0
    else:
        X[column] = (X[column] - low) / span

In [5]:
# Mark the attributes below as nominal, i.e. categorical without an ordering.
nominal_columns = (
    'Auction',
    'Make',
    'Model',
    'Trim',
    'SubModel',
    'Transmission',
    'WheelTypeID',
    'WheelType',
    'Nationality',
    'TopThreeAmericanName',
    'BYRNO',
    'VNST',
)

for column in nominal_columns:
    # The categories are inferred from the values present in each column.
    X[column] = X[column].astype(CategoricalDtype(ordered=False))

In [6]:
# Split the data: 50% of the rows go to training and 14% to testing, so the
# remaining rows are used for neither. random_state makes the split repeatable.
x_time = time.time()
split_parts = train_test_split(
    X,
    Y,
    train_size=0.5,
    test_size=0.14,
    random_state=40,
)
X_train, X_test, Y_train, Y_test = split_parts

# SVC with default settings; it is not fitted in this cell.
clf = svm.SVC()
y_time = time.time()

elapsed_seconds = y_time - x_time
print("Time in (sec)", elapsed_seconds)

Time in (sec) 0.05092883110046387


In [7]:
# Candidate SVC kernels to compare.
param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

# Exhaustive search over the grid with 2-fold cross-validation on the training
# split, using 2 parallel jobs.
x_time = time.time()
gs = GridSearchCV(estimator=clf, param_grid=param_grid, cv=2, n_jobs=2)
gs.fit(X_train, Y_train)
y_time = time.time()

elapsed_minutes = (y_time - x_time) / 60
print("Time in (min)", elapsed_minutes)

# Winning parameter combination and its mean cross-validated score.
print('Best Hyperparameters : ', gs.best_params_)
print('Best score : ', gs.best_score_)

Time in (min) 23.307499563694
Best Hyperparameters :  {'kernel': 'poly'}
Best score :  0.9048737167625873


In [None]:
# The grid search was created with the default refit=True, so after the search
# it has already retrained the best parameter combination on all of
# X_train / Y_train. Reuse that fitted model instead of constructing an
# identical SVC and paying for the same training a second time.
clf_best = gs.best_estimator_
clf_best

In [10]:
# Predict on both splits and time the two predict calls together.
x_time = time.time()
Y_pred_train = clf_best.predict(X_train)
Y_pred_test = clf_best.predict(X_test)
y_time = time.time()
print("Time in (sec)", y_time - x_time)

print('Accuracy Score on train data: ', accuracy_score(y_true=Y_train, y_pred=Y_pred_train))
print('Accuracy Score on test data: ', accuracy_score(y_true=Y_test, y_pred=Y_pred_test))

# Reference point for the accuracies above: the score obtained by always
# predicting the most frequent class of the test labels.
# NOTE(review): if the target is imbalanced, an accuracy close to this value
# means the model is doing little more than predicting the majority class -
# confirm against the class distribution.
majority_share = pd.Series(Y_test).value_counts(normalize=True).max()
print('Majority-class baseline on test data: ', majority_share)

Time in (sec) 1.151637077331543
Accuracy Score on train data:  0.9042673289681644
Accuracy Score on test data:  0.900129004515158
