<div style='background-color:orange'>
<a id="TableOfContents"></a>
    <h1 style='text-align:center; padding-top:5px'>
        <b><i>
            TABLE OF CONTENTS:
        </i></b></h1>
    <ul>
    <li><a href='#imports'>Imports</a></li>
    <li><a href="#lecture">Lecture Portion</a></li>
    <li><a href='#dtc'>DTC</a></li>
    <li><a href="#rfc">RFC</a></li>
    <li><a href='#knn'>KNN</a></li>
    <li><a href='#lr'>LR</a></li>
    <li><a href='#misc'>Miscellaneous</a></li>
    </ul>
</div>

<div style='background-color:orange'>
<a id="imports"></a>
    <h1 style='text-align:center; padding-top:5px'>
        <b><i>
            Imports
        </i></b></h1>
    <li><a href='#TableOfContents'>Table of Contents</a>
    </li>
</div>

In [1]:
# Vectorization and tables
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn stuff
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

# .py files
import wrangle as w
import env

<div style='background-color:orange'>
<a id="lecture"></a>
    <h1 style='text-align:center; padding-top:5px'>
        <b><i>
            Lecture Portion
        </i></b></h1>
    <li><a href='#TableOfContents'>Table of Contents</a>
    </li>
</div>

In [2]:
# Read cars.csv into a DataFrame
cars = pd.read_csv(filepath_or_buffer='cars.csv')
# Print column names, non-null counts and dtypes
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297899 entries, 0 to 297898
Data columns (total 8 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   price    297899 non-null  int64 
 1   year     297899 non-null  int64 
 2   mileage  297899 non-null  int64 
 3   city     297899 non-null  object
 4   state    297899 non-null  object
 5   vin      297899 non-null  object
 6   make     297899 non-null  object
 7   model    297899 non-null  object
dtypes: int64(3), object(5)
memory usage: 18.2+ MB


In [3]:
# Which cars have sold high? Start from the average price of every
# make / model / year combination.
cars.groupby(['make', 'model', 'year'])['price'].mean()

make     model          year
AM       General        1997     62489.250000
                        1998     47499.500000
                        1999     48097.500000
                        2000     58658.142857
                        2001     71748.000000
                                    ...      
Porsche  PanameraTurbo  2013     72924.000000
                        2014     81624.333333
                        2015     88990.000000
                        2017    148993.333333
         Panamerabase   2013     43296.833333
Name: price, Length: 5833, dtype: float64

In [4]:
# Broadcast each (make, model, year) group's average price back onto its rows
cars['mean_price'] = (
    cars
    .groupby(['make', 'model', 'year'])['price']
    .transform('mean')
)
# Spot-check a single randomly chosen row
cars['mean_price'].sample()

255755    17757.925532
Name: mean_price, dtype: float64

In [5]:
# Flag rows priced above their group's average price (1 = above, 0 = not above)
cars['sold_high'] = cars['price'].gt(cars['mean_price']).astype(int)
# Proportion of rows in each class
cars['sold_high'].value_counts(normalize=True)

0    0.532127
1    0.467873
Name: sold_high, dtype: float64

In [6]:
# Identify continuous number columns: numeric dtype AND more than 25 distinct values.
#
# The dtype check goes through pandas' own helper instead of handing the whole
# Series to np.issubdtype; the latter only works via NumPy's implicit coercion of
# the Series' ``.dtype`` attribute and fails for pandas extension dtypes.
#
# 'mean_price' is only a helper column used to build sold_high, so it is excluded
# inside the filter itself. Unlike list.remove, this does not raise ValueError when
# the cell is re-run after 'mean_price' has already been dropped from the frame.
num_feats = [
    col for col in cars.columns
    if col != 'mean_price'
    and pd.api.types.is_numeric_dtype(cars[col])
    and cars[col].nunique() > 25
]

In [7]:
# Drop city and vin, plus mean_price and price (both were used to derive sold_high)
cars = cars.drop(['city', 'vin', 'mean_price', 'price'], axis='columns')

In [8]:
# Every remaining column that was not flagged as a number column is treated as categorical
cat_cols = list(filter(lambda name: name not in num_feats, cars.columns))
cat_cols

['year', 'state', 'make', 'model', 'sold_high']

In [9]:
# sold_high is used as the target further down, so take it out of the categorical list
del cat_cols[cat_cols.index('sold_high')]

In [10]:
# Integer-encode the object-dtype columns so the estimators below can consume them.
# Every fitted encoder is kept, keyed by column name, so an integer code can be
# mapped back to its original label with encoders[col].inverse_transform([...]).
# (Previously each encoder was overwritten by the next loop iteration and the
# mapping for all but the last column was lost.)
# NOTE(review): scikit-learn documents LabelEncoder as meant for target labels;
# OrdinalEncoder is the feature-side equivalent — consider switching.
# NOTE(review): re-running this cell refits the encoders on the already-encoded
# integers, so the original labels are only recoverable after a fresh run.
encoders = {}
for col in ['state', 'make', 'model']:
    encoder = LabelEncoder()
    cars[col] = encoder.fit_transform(cars[col])
    encoders[col] = encoder

In [11]:
# Preview the first five rows after encoding
cars.head(n=5)

Unnamed: 0,year,mileage,state,make,model,sold_high
0,2015,18681,28,7,523,0
1,2015,27592,19,7,525,0
2,2015,13650,32,7,526,0
3,2015,25195,22,7,525,0
4,2015,22800,38,7,523,0


In [12]:
# Using cross-validation score:
# first separate the features from the target, then split into train and test.
X = cars.drop(columns='sold_high')
# Double brackets keep y as a one-column DataFrame rather than a Series.
# NOTE(review): some scikit-learn estimators expect a 1-D target — confirm this
# shape is intended before adding further models.
y = cars[['sold_high']]

In [13]:
# Keep 80% of the rows for training; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1349,
)

In [16]:
# Sanity-check the sizes of the four pieces produced by the split
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((238319, 5), (59580, 5), (238319, 1), (59580, 1))

In [18]:
# cross_val_score needs a model object to work with, so build one first.
dtc = DecisionTreeClassifier(max_depth=4)
# Note that the model is NOT fitted on the training set before it is handed over:
# cross_val_score does the splitting (3 folds here) and the fitting itself.
cross_val_score(estimator=dtc, X=X_train, y=y_train, cv=3)

array([0.63843152, 0.63429003, 0.63806191])

In [19]:
# Grid search: evaluate every combination of the hyperparameter values below.
# The parameter grid is a dictionary that maps each hyperparameter name
# to the list of values to try.
# For a decision tree classifier it looks like this:
param_grid = dict(
    max_depth=[None, 10, 4, 3, 2],
    min_samples_leaf=[1, 3, 5, 20],
    criterion=['gini', 'entropy'],
)
gsearch = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid)

In [21]:
# Display the search object's configuration (estimator and parameter grid)
gsearch

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [None, 10, 4, 3, 2],
                         'min_samples_leaf': [1, 3, 5, 20]})

In [22]:
# Run the grid search on the training data
gsearch.fit(X=X_train, y=y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [None, 10, 4, 3, 2],
                         'min_samples_leaf': [1, 3, 5, 20]})

In [24]:
# Cross-validation results of the fitted search (dict-like; its keys are listed in the next cell)
results = gsearch.cv_results_

In [25]:
# List the fields recorded in the search results
results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_criterion', 'param_max_depth', 'param_min_samples_leaf', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [27]:
# Tabulate the search results and check the table's dimensions
results_df_init = pd.DataFrame(data=results)
results_df_init.shape

(40, 16)

In [31]:
# Expand the list of parameter dicts into one column per hyperparameter
params = pd.DataFrame(data=results['params'])
params.head(n=2)

Unnamed: 0,criterion,max_depth,min_samples_leaf
0,gini,,1
1,gini,,3


In [32]:
# Peek at the first two rows of the raw results table
results_df_init.head(n=2)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.867858,0.033409,0.020674,0.001398,gini,,1,"{'criterion': 'gini', 'max_depth': None, 'min_...",0.616713,0.615727,0.612999,0.617447,0.617922,0.616162,0.001746,32
1,0.807179,0.011104,0.018062,0.00085,gini,,3,"{'criterion': 'gini', 'max_depth': None, 'min_...",0.622063,0.62265,0.621287,0.623846,0.62367,0.622703,0.000966,29


In [40]:
# Pick out the per-fold score columns (split0_test_score, split1_test_score, ...).
# Matching on the 'split' PREFIX rather than on 'split' anywhere in the name keeps
# parameter columns such as 'param_min_samples_split' out of the selection; those
# would otherwise be swept in as soon as that hyperparameter is added to the grid.
splits = [col for col in results if col.startswith('split')]
# Put each candidate's hyperparameter values side by side with its fold scores
chopped = pd.concat([params, results_df_init[splits]], axis=1)
# Label the rows with the model family they belong to
chopped['model_type'] = 'decision_tree'
chopped.head(2)

Unnamed: 0,criterion,max_depth,min_samples_leaf,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,model_type
0,gini,,1,0.616713,0.615727,0.612999,0.617447,0.617922,decision_tree
1,gini,,3,0.622063,0.62265,0.621287,0.623846,0.62367,decision_tree


<div style='background-color:orange'>
<a id="dtc"></a>
    <h1 style='text-align:center; padding-top:5px'>
        <b><i>
            DTC
        </i></b></h1>
    <li><a href='#TableOfContents'>Table of Contents</a>
    </li>
</div>

In [None]:
# DTC estimator and its parameter grid (the grid is still empty — TODO fill in)
dtc = DecisionTreeClassifier()
dtc_params = dict()


<div style='background-color:orange'>
<a id="rfc"></a>
    <h1 style='text-align:center; padding-top:5px'>
        <b><i>
            RFC
        </i></b></h1>
    <li><a href='#TableOfContents'>Table of Contents</a>
    </li>
</div>

<div style='background-color:orange'>
<a id="knn"></a>
    <h1 style='text-align:center; padding-top:5px'>
        <b><i>
            KNN
        </i></b></h1>
    <li><a href='#TableOfContents'>Table of Contents</a>
    </li>
</div>

<div style='background-color:orange'>
<a id="lr"></a>
    <h1 style='text-align:center; padding-top:5px'>
        <b><i>
            LR
        </i></b></h1>
    <li><a href='#TableOfContents'>Table of Contents</a>
    </li>
</div>

<div style='background-color:orange'>
<a id="misc"></a>
    <h1 style='text-align:center; padding-top:5px'>
        <b><i>
            Miscellaneous
        </i></b></h1>
    <li><a href='#TableOfContents'>Table of Contents</a>
    </li>
</div>