In [1]:
import pandas as pd
import numpy as np
cardata = pd.read_csv('ebay_car_scrapped_dataset_cleaned_initial.csv')
cardata['Year'] = cardata['Year'].apply(str)

In [2]:
cardata['price'] = cardata['Car Price Original']

In [3]:
bins = [0, 15000, 50000,999999]
labels = ['1', '2', '3']
cardata['price'] = pd.cut(x = cardata['price'], bins = bins, labels = labels, include_lowest = True)

In [11]:
cardata['price'].value_counts()

2    1416
3     974
1     556
Name: price, dtype: int64

In [4]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

traindata = pd.get_dummies(cardata.drop(['Car Price','Report Car Name','VIN','Make','price','Car Price Original'],axis=1))
#traindata = pd.get_dummies(cardata.drop(['price'],axis=1))
testdata = cardata['price']
xtrain,xtest,ytrain,ytest = train_test_split(traindata,testdata,test_size=0.2,random_state=101)

rd = RandomForestClassifier()
rd.fit(xtrain,ytrain)
pred = rd.predict(xtest)
print(classification_report(ytest,pred))

              precision    recall  f1-score   support

           1       0.83      0.63      0.71       113
           2       0.77      0.85      0.81       287
           3       0.84      0.82      0.83       190

    accuracy                           0.80       590
   macro avg       0.81      0.77      0.78       590
weighted avg       0.80      0.80      0.80       590



In [5]:
xtrain.shape

(2356, 70)

In [5]:
#FEATURE SELECTION BASED ON RD FEATURE IMPORTANCES

from sklearn.feature_selection import SelectFromModel
from numpy import sort

thresholds = sort(rd.feature_importances_)
max_accuracy = 0
max_threshold = 0
features = 0
report = ""
for thresh in thresholds:
    # select features using threshold
    selection = SelectFromModel(rd, threshold=thresh, prefit=True)
    select_X_train = selection.transform(xtrain)
    # train model
    selection_model = RandomForestClassifier()
    selection_model.fit(select_X_train, ytrain)
    # eval model
    select_X_test = selection.transform(xtest)
    y_pred = selection_model.predict(select_X_test)
    if(classification_report(ytest,y_pred,output_dict=True)['accuracy']>max_accuracy):
        max_accuracy = classification_report(ytest,y_pred,output_dict=True)['accuracy']
        max_threshold = thresh
        features = select_X_train.shape[1]
        report = classification_report(ytest,y_pred)
print(max_accuracy)
print(max_threshold)
print(features)
print(report)









0.8067796610169492
0.004608906868747044
51
              precision    recall  f1-score   support

           1       0.88      0.64      0.74       113
           2       0.77      0.87      0.82       287
           3       0.84      0.81      0.82       190

    accuracy                           0.81       590
   macro avg       0.83      0.77      0.79       590
weighted avg       0.81      0.81      0.80       590





In [14]:
#SVC BASE MODEL
from sklearn.svm import SVC
svc = SVC()
svc.fit(xtrain,ytrain)
print(classification_report(ytest,svc.predict(xtest)))

              precision    recall  f1-score   support

           1       0.61      0.30      0.40       113
           2       0.59      0.76      0.67       287
           3       0.68      0.59      0.64       190

    accuracy                           0.62       590
   macro avg       0.63      0.55      0.57       590
weighted avg       0.63      0.62      0.61       590



In [15]:
#OPTIMIZED SVC WITH PARAMETER TUNING
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)
grid.fit(xtrain,ytrain)
grid.best_params_

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.479 total time=   0.4s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.478 total time=   0.4s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.480 total time=   0.4s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.480 total time=   0.4s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.480 total time=   0.4s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.479 total time=   0.4s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.478 total time=   0.4s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.480 total time=   0.4s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.480 total time=   0.4s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.480 total time=   0.4s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.492 total time=   0.4s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

[CV 2/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.571 total time=   0.4s
[CV 3/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.586 total time=   0.4s
[CV 4/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.586 total time=   0.4s
[CV 5/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.590 total time=   0.4s
[CV 1/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.610 total time=   0.4s
[CV 2/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.597 total time=   0.4s
[CV 3/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.622 total time=   0.4s
[CV 4/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.628 total time=   0.3s
[CV 5/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.631 total time=   0.4s
[CV 1/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.617 total time=   0.3s
[CV 2/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.603 total time=   0.4s
[CV 3/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.622 total time=   0.4s
[CV 4/5] END ....C=1000, gam

{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}

In [85]:
#OPTIMIZED MODEL - TUNED SVC WITH FEAURE SELECTION 
from sklearn.feature_selection import RFE
rfe_selector = RFE(estimator=SVC(), n_features_to_select=850, step=5, verbose=5)
rfe_selector.fit(xtrain,ytrain)
rfe_support = rfe_selector.get_support()
rfe_feature = xtrain.loc[:,rfe_support].columns.tolist()
svc = SVC(C=100,gamma=0.01,kernel = 'rbf')
svc.fit(xtrain[rfe_feature],ytrain)
predictions = svc.predict(xtest[rfe_feature])
print(classification_report(ytest,predictions))

              precision    recall  f1-score   support

           1       0.58      0.32      0.41       113
           2       0.61      0.82      0.70       287
           3       0.75      0.57      0.65       190

    accuracy                           0.64       590
   macro avg       0.65      0.57      0.59       590
weighted avg       0.65      0.64      0.63       590



In [86]:
#BASE MODEL FOR NB 
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(xtrain,ytrain)
print(classification_report(ytest,nb.predict(xtest)))

              precision    recall  f1-score   support

           1       0.54      0.40      0.46       113
           2       0.60      0.51      0.55       287
           3       0.57      0.80      0.67       190

    accuracy                           0.58       590
   macro avg       0.57      0.57      0.56       590
weighted avg       0.58      0.58      0.57       590



In [89]:
#PARAMETER TUNING FOR NB
param_grid_nb = {
    'var_smoothing': np.logspace(0,-9, num=100)
}
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
nbModel_grid = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid_nb, verbose=1, cv=10, n_jobs=-1)
nbModel_grid.fit(xtrain, ytrain)
print(nbModel_grid.best_estimator_)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
GaussianNB(var_smoothing=0.8111308307896871)


In [90]:
#OPTIMIZED MODEL FOR TUNED NB WITH FEATURE SELECTION
from sklearn.feature_selection import RFE
rfe_selector = RFE(estimator=GaussianNB(var_smoothing=0.8111308307896871), n_features_to_select=850, step=5, verbose=5)
rfe_selector.fit(xtrain,ytrain)
rfe_support = rfe_selector.get_support()
rfe_feature = xtrain.loc[:,rfe_support].columns.tolist()
nb = GaussianNB(var_smoothing=0.8111308307896871)
nb.fit(xtrain[rfe_feature],ytrain)
predictions = nb.predict(xtest[rfe_feature])
print(classification_report(ytest,predictions))

              precision    recall  f1-score   support

           1       0.67      0.12      0.21       113
           2       0.58      0.79      0.67       287
           3       0.66      0.62      0.64       190

    accuracy                           0.61       590
   macro avg       0.63      0.51      0.50       590
weighted avg       0.62      0.61      0.57       590



In [None]:

import yellowbrick.classifier import ROCAUC

def plot_ROC_curve(model, xtrain, ytrain, xtest, ytest):

    # Creating visualization with the readable labels
    visualizer = ROCAUC(model, encoder={1: 'functional', 
                                        2: 'needs repair', 
                                        3: 'nonfunctional'})
                                        
    # Fitting to the training data first then scoring with the test data                                    
    visualizer.fit(xtrain, ytrain)
    visualizer.score(xtest, ytest)
    visualizer.show()
    
    return visualizer
