In [1]:
# Imports

import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

import env

# Acquire and Prepare Sample Data

In [2]:
# Acquire data from SQL
# The connection string is assembled from the credentials exposed by the local env module
url = f'mysql+pymysql://{env.user}:{env.password}@{env.host}/used_cars'

# Pull every row of the cars table into a DataFrame
query = 'SELECT * FROM cars'
cars = pd.read_sql(query, url)

# Preview the first rows
cars.head()

Unnamed: 0,Id,Price,Year,Mileage,City,State,Vin,Make,Model
0,1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience
1,2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD
2,3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather
3,4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
4,5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience


In [3]:
# preparing data

# normalise column names to lowercase, then index the frame by id
cars.columns = cars.columns.str.lower()
cars.set_index('id', inplace=True)

# target: 1 when a listing's price is above the mean price of its
# year/make/model group, otherwise 0
group_mean_price = cars.groupby(['year', 'make', 'model'])['price'].transform('mean')
cars['gt_avg'] = (cars['price'] > group_mean_price).astype(int)

# remove the columns that are not used as model features
cars.drop(columns=['price', 'city', 'vin'], inplace=True)

# replace each listed column with integer codes from a LabelEncoder
for name in ('state', 'make', 'model', 'year'):
    cars[name] = LabelEncoder().fit_transform(cars[name])

cars.head()

Unnamed: 0_level_0,year,mileage,state,make,model,gt_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,18,18681,28,7,523,0
2,18,27592,19,7,525,0
3,18,13650,32,7,526,0
4,18,25195,22,7,525,0
5,18,22800,38,7,523,0


# Split Data into Train, Validate, and Test

In [4]:
# split into features and target
X, y = cars.drop(columns='gt_avg'), cars.gt_avg

# hold out 20% of the rows as the test set; the seed is fixed so that
# re-running the notebook reproduces the same partition (and the same scores)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=123)

# carve the validate set out of the remaining rows (about one third of them),
# again with a fixed seed for reproducibility
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train, y_train, test_size=.3333, random_state=123)

In [5]:
# (rows, columns) of the training features after the train/validate/test split
X_train.shape

(158887, 5)

In [6]:
# (rows, columns) of the validate features
X_validate.shape

(79432, 5)

In [7]:
# (rows, columns) of the held-out test features
X_test.shape

(59580, 5)

# Train Decision Tree Model and Get Accuracy Score

In [8]:
# depth-limited decision tree with a fixed seed
clf = DecisionTreeClassifier(max_depth=5, random_state=123)

# fit on the training split; the fitted model is kept under the name `tree`
tree = clf.fit(X_train, y_train)

# mean accuracy of the fitted tree on each split
train_accuracy = tree.score(X_train, y_train)
validate_accuracy = tree.score(X_validate, y_validate)
test_accuracy = tree.score(X_test, y_test)

# report the three scores
print(f"Accuracy of Decision Tree on train data is {train_accuracy}")
print(f"Accuracy of Decision Tree on validate data is {validate_accuracy}")
print(f"Accuracy of Decision Tree on test data is {test_accuracy}")

Accuracy of Decision Tree on train data is 0.6524133503685009
Accuracy of Decision Tree on validate data is 0.6485421492597442
Accuracy of Decision Tree on test data is 0.6473816717019134


# Split Data into Train and Test

In [9]:
# split into features and target
X, y = cars.drop(columns='gt_avg'), cars.gt_avg

# hold out 20% of the rows as the test set; the seed is fixed so the
# cross-validation and grid-search results below are reproducible on re-run
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=123)

In [10]:
# (rows, columns) of the training features from the 80/20 train/test split
X_train.shape

(238319, 5)

In [11]:
# length of the training target from the 80/20 train/test split
y_train.shape

(238319,)

# Train Model Using Cross-Validation with cv=3 and cv=4 to Measure Accuracy

In [12]:
# depth-3 tree; the seed is fixed so that tie-breaking between equally good
# splits is deterministic and the fold scores are repeatable across runs
tree = DecisionTreeClassifier(max_depth=3, random_state=123)

# per-fold accuracy with 3 and with 4 cross-validation folds
print("cross val with c = 3:", cross_val_score(tree, X_train, y_train, cv=3))
print("cross val with c = 4:", cross_val_score(tree, X_train, y_train, cv=4))

cross val with c = 3: [0.62868832 0.63396274 0.63505331]
cross val with c = 4: [0.62974152 0.6350621  0.62950655 0.63634838]


In [13]:
# average the per-fold accuracies for each fold count
mean_score_cv3 = cross_val_score(tree, X_train, y_train, cv=3).mean()
mean_score_cv4 = cross_val_score(tree, X_train, y_train, cv=4).mean()

print("Mean of Cross Val with c = 3:", mean_score_cv3)
print("Mean of Cross Val with c = 4:", mean_score_cv4)

Mean of Cross Val with c = 3: 0.632568122915875
Mean of Cross Val with c = 4: 0.6326646372456268


# Train Model Using Grid Search with Multiple Parameters

In [14]:
# grid of parameters to try: every max_depth is combined with every max_features
params = {'max_depth': [2, 3, 4],
          'max_features': [None, 1, 3]}

# classifier object; the seed is fixed because max_features of 1 or 3 makes
# the tree consider a random subset of features at each split, so without it
# the grid scores change from run to run
tree = DecisionTreeClassifier(random_state=123)

# grid-search object using 3-fold cross-validation
grid = GridSearchCV(tree, params, cv=3)

# fit the grid search on the training data
grid.fit(X_train, y_train)

# per-candidate cross-validation results
results = grid.cv_results_

# Get DataFrame of Mean Accuracy for Each Parameter Set

In [22]:
# get mean accuracy results for tested models
scores = results['mean_test_score']

# pair each tested parameter set with its score; a copy of each dict is built
# so the dicts stored in the grid search results are not modified (adding a
# 'score' key to them in place would leak into the fitted search object)
params = [dict(param, score=score)
          for param, score in zip(results['params'], scores)]

# one row per parameter set, lowest mean accuracy first
pd.DataFrame(params).sort_values(by='score')

Unnamed: 0,max_depth,max_features,score
1,2,1.0,0.538606
4,3,1.0,0.56327
7,4,1.0,0.565448
2,2,3.0,0.586613
0,2,,0.593197
5,3,3.0,0.605995
8,4,3.0,0.629064
3,3,,0.632568
6,4,,0.638057
