# Exercises
- Within your codeup-data-science directory, create a new repo named advanced-topics. This will be where you do your work for this module. Create a repository on GitHub with the same name, and link your local repository to GitHub.

- Save this work in your advanced-topics repo. Then add, commit, and push your changes.

- Do your work for this exercise in a jupyter notebook or python script named cross_validation.

- Use the cross validation techniques discussed in the lesson to figure out what kind of model works best with the cars dataset used in the lesson.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pydataset

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Acquire data
df= pd.read_csv('cars.csv')

In [3]:
#Looking at first 5 values before any changes
df.head()

Unnamed: 0,Id,Price,Year,Mileage,City,State,Vin,Make,Model
0,1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience
1,2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD
2,3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather
3,4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
4,5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience


In [4]:
# create a feature that determines whether a car sold for over the average sale price, and try to predict this.
# lowercase all the column names
df.columns = [c.lower() for c in df]

#set the id column as the index
df.set_index('id', inplace=True)

#print out the shape
print('{} rows x {} cols'.format(*df.shape))

#Looking at first 5 values of dataframe
df.head()

297899 rows x 8 cols


Unnamed: 0_level_0,price,year,mileage,city,state,vin,make,model
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience
2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD
3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather
4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience


In [5]:
#create avg saleprice column
df['avg_saleprice'] = df.groupby(['year', 'make', 'model']).price.transform('mean')

#create column that say whether car was sold over the average price
df['gt_avg'] = (df.price > df.avg_saleprice).astype(int)

In [6]:
# encode the categorical columns
from sklearn.preprocessing import LabelEncoder

for col in ['state', 'make', 'model', 'year']:
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])

In [7]:
#Looking at dataframe after encoding and adding new columns
df.head()

Unnamed: 0_level_0,price,year,mileage,city,state,vin,make,model,avg_saleprice,gt_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,16472,18,18681,Jefferson City,28,KL4CJBSBXFB267643,7,523,17291.768786,0
2,15749,18,27592,Highland,19,KL4CJASB5FB245057,7,525,16721.350598,0
3,16998,18,13650,Boone,32,KL4CJCSB0FB264921,7,526,19080.632911,0
4,15777,18,25195,New Orleans,22,KL4CJASB4FB217542,7,525,16721.350598,0
5,16784,18,22800,Las Vegas,38,KL4CJBSB3FB166881,7,523,17291.768786,0


In [10]:
#Remove features we won't use
df.drop(columns=['price', 'city', 'vin', 'avg_saleprice'], inplace=True)

In [11]:
#Looking at dataframe after encoding and adding new columns and dropping columns
df.head()

Unnamed: 0_level_0,year,mileage,state,make,model,gt_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,18,18681,28,7,523,0
2,18,27592,19,7,525,0
3,18,13650,32,7,526,0
4,18,25195,22,7,525,0
5,18,22800,38,7,523,0


In [12]:
# split data
X, y = df.drop(columns='gt_avg'), df.gt_avg

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [13]:
#split further into training and validate datasets
X_train, X_validate, y_train, y_validate = train_test_split(X_train, y_train, test_size=.3333)

In [14]:
# use cross_val_score function to automate the splitting process
import sklearn.metrics as m
from sklearn.model_selection import cross_val_score

tree = DecisionTreeClassifier(max_depth=2)

cross_val_score(tree, X_train, y_train, cv=3)

array([0.59405623, 0.59314225, 0.59127299])

In [15]:
#using precision as the metric
cross_val_score(tree, X_train, y_train, cv=3, scoring='precision')

array([0.59035536, 0.64509337, 0.64192624])

In [16]:
# using Grid Search CV to quickly try out many different combinations of hyperparameters

from sklearn.model_selection import GridSearchCV

params = {'max_depth': [2, 3, 4],
          'max_features': [None, 1, 3]}

#create the object
tree = DecisionTreeClassifier()


grid = GridSearchCV(tree, params, cv=3)

#fit it
grid.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [2, 3, 4], 'max_features': [None, 1, 3]})

In [17]:
#Looking at the cross validation results in the cv_results_ property of the object we created.
results = grid.cv_results_
results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_max_features', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [18]:
#Looking at test scores for all the models
test_scores = results['mean_test_score']
test_scores

array([0.59282382, 0.53538676, 0.59211262, 0.63253761, 0.53570774,
       0.62359409, 0.63806354, 0.58516453, 0.6381013 ])

In [19]:
#Looking at the parameters for all the models
params = results['params']
params

[{'max_depth': 2, 'max_features': None},
 {'max_depth': 2, 'max_features': 1},
 {'max_depth': 2, 'max_features': 3},
 {'max_depth': 3, 'max_features': None},
 {'max_depth': 3, 'max_features': 1},
 {'max_depth': 3, 'max_features': 3},
 {'max_depth': 4, 'max_features': None},
 {'max_depth': 4, 'max_features': 1},
 {'max_depth': 4, 'max_features': 3}]

In [21]:
#Combining the test scores and parameters using zip
for p, s in zip(params, test_scores):
    p['score'] = s

#convert to dataframe    
pd.DataFrame(params).sort_values(by='score')

Unnamed: 0,max_depth,max_features,score
1,2,1.0,0.535387
4,3,1.0,0.535708
7,4,1.0,0.585165
2,2,3.0,0.592113
0,2,,0.592824
5,3,3.0,0.623594
3,3,,0.632538
6,4,,0.638064
8,4,3.0,0.638101


### Our best model on the test dataset is model # 8. It has a max_depth of 4 and max_features of 3.0