In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [2]:
# CSV locations, relative to the notebook's working directory.
# Only the cleaned file is read in the cells below.
clean_data_path = '../datasets/autoria_clean_data.csv'
data_path = '../datasets/autoria/autoria_data.csv'

In [3]:
# Load the cleaned listings table and show it as the cell output
# (the rendering is truncated to the first and last rows).
clean_df = pd.read_csv(filepath_or_buffer=clean_data_path)
clean_df

Unnamed: 0,id,brand,price_USD,mileage_kkm,fuel_type,transmission_type,pub_date,year_made,model,engine_size
0,0,Mercedes-Benz,8999,159,gas/petrol,automatic,2021-05-21,2007,E 280 4MATIC AT CLASSIC,3.0
1,1,Mercedes-Benz,79900,103,diesel,automatic,2021-05-15,2015,G 350 Designo,3.0
2,2,BMW,15999,145,petrol,automatic,2021-04-18,2010,550,4.4
3,3,MINI,10200,111,petrol,automatic,2021-05-21,2014,Countryman S,1.6
4,4,Nissan,13900,97,petrol,automatic,2021-05-19,2010,X-Trail,2.0
...,...,...,...,...,...,...,...,...,...,...
146809,146809,Renault,6100,260,diesel,manual,2021-10-05,2008,Kangoo пасс.,1.5
146810,146810,Audi,2100,352,petrol,manual,2021-10-05,1988,80 1.8 S,
146811,146811,BMW,18000,70,petrol,automatic,2021-10-05,2011,520 F10,2.0
146812,146812,ВАЗ,400,278,petrol,manual,2021-10-05,2006,1118,


In [4]:
# Replace the values of every column with integer category codes.
# Note that numeric columns are encoded too, not only the string ones, and
# that one encoder object is refitted column after column, so once this cell
# has run `le` only remembers the classes of the last column it processed.
le = preprocessing.LabelEncoder()
enumerate_df = clean_df.apply(lambda column: le.fit_transform(column))

In [5]:
# Selecting features and output
# 'Unnamed: 0' and 'id' mirror the row number in the preview of clean_df, so
# they describe the position of a listing in the file rather than the car
# itself; keep them out of the feature matrix together with the target.
# NOTE(review): presumably 'id' is a pure row identifier everywhere, not only
# in the rows shown in the preview -- confirm before relying on this.
target_column = 'price_USD'
excluded_columns = {target_column, 'Unnamed: 0', 'id'}
feature_columns = [
    column for column in enumerate_df.columns if column not in excluded_columns
]

X = enumerate_df.loc[:, feature_columns]
y = enumerate_df[target_column]

In [6]:
# Split features and target into training and test partitions: 33% of the
# rows are held out for testing, and the shuffle is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

In [None]:
# Applying model
# Seed the tree so a Restart & Run All reproduces the same model: the
# train/test split is already seeded, but an unseeded decision tree may still
# differ between runs because scikit-learn permutes the candidate features at
# each split.
clf = DecisionTreeClassifier(random_state=42)

# fit() returns the estimator itself, so `model` and `clf` are the same object.
model = clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

In [None]:
# Output plots of accuracy
def plot_roc_curve(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training split and plot its ROC curve on the test split.

    Parameters
    ----------
    clf : estimator
        Classifier exposing ``fit`` and ``predict``. When it also exposes
        ``predict_proba``, the probability of the second entry of
        ``clf.classes_`` is used as the ranking score; otherwise the hard
        predictions are used.
    X_train, y_train
        Training features and labels passed to ``clf.fit``.
    X_test, y_test
        Held-out features and labels the curve is computed on.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Raises
    ------
    ValueError
        If the fitted classifier reports a number of classes other than two;
        ``roc_curve`` only handles binary targets.

    Notes
    -----
    ``clf`` is refitted in place, so the estimator passed in is modified.
    """
    clf.fit(X_train, y_train)

    # A ROC curve needs exactly one positive and one negative class. Fail
    # early with an explanation instead of a terse error from roc_curve.
    classes = getattr(clf, 'classes_', None)
    if classes is not None and len(classes) != 2:
        raise ValueError(
            'plot_roc_curve needs a binary target, but the classifier was '
            'fitted on %d classes' % len(classes)
        )
    positive_label = classes[1] if classes is not None else None

    # Rank with continuous scores where possible: hard 0/1 predictions
    # collapse the curve to a single operating point.
    if hasattr(clf, 'predict_proba'):
        scores = clf.predict_proba(X_test)[:, 1]
    else:
        scores = clf.predict(X_test)

    fp, tp, _ = roc_curve(y_test, scores, pos_label=positive_label)
    roc_auc_mla = auc(fp, tp)

    # Label the curve so the legend has an entry and the AUC is reported.
    curve_label = '%s (AUC = %.3f)' % (type(clf).__name__, roc_auc_mla)
    plt.plot(fp, tp, lw=2, alpha=0.3, label=curve_label)
    plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal
    plt.title('ROC Curve comparison')
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

# NOTE(review): y is taken from the label-encoded 'price_USD' column, which
# presumably has far more than two distinct values; in that case this call
# raises ValueError. Confirm the intended binary target.
plot_roc_curve(clf, X_train, X_test, y_train, y_test)

In [None]:
# Output parameters of accuracy
# precision/recall/F1 default to average='binary' and raise on a target with
# more than two labels, so pick the averaging mode from the labels that
# actually occur in the test targets and the predictions.
observed_labels = set(y_test) | set(predictions)
is_binary = len(observed_labels) <= 2
averaging = 'binary' if is_binary else 'weighted'

if is_binary:
    roc_auc = roc_auc_score(y_test, predictions)
else:
    # roc_auc_score cannot score hard class labels on a multiclass target;
    # report the value as undefined instead of aborting the whole cell.
    roc_auc = np.nan

print("AUC score ", roc_auc)
print("precision score", precision_score(y_test, predictions, average=averaging))
print("recall score", recall_score(y_test, predictions, average=averaging))
print("f1 score", f1_score(y_test, predictions, average=averaging))
print("accuracy", accuracy_score(y_test, predictions))