In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

## Exercise
#### 1. Obtain the cars.csv file from the google classroom and read it into python with pandas.
#### 2. Create a feature named gt_avg, which should be either 1 or 0. The value should indicate whether or not a given price is greater than the average price for that car's combination of year, make, and model.

In [2]:
df = pd.read_csv('cars.csv')

In [3]:
df.columns = [c.lower() for c in df]
# df.set_index('id', inplace=True)

In [4]:
df['avg_saleprice'] = df.groupby(['year', 'make', 'model']).price.transform('mean')

In [5]:
df['gt_avg'] = (df.price > df.avg_saleprice).astype(int)

#### 3. Drop the Id, City, and Vin columns.

In [6]:
df.drop(columns = ['id', 'price','city','vin','avg_saleprice'], inplace=True)

#### 4. Encode the categorical features as necessary.

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
for col in ['state','make','model','year']:
    le = LabelEncoder().fit(df[col])  # this creates a numerical label encoder for each value in each column.
    df[col] = le.transform(df[col]) # this transforms each item and overwrites the original item in each cell

In [9]:
df.head()

Unnamed: 0,year,mileage,state,make,model,gt_avg
0,18,18681,28,7,523,0
1,18,27592,19,7,525,0
2,18,13650,32,7,526,0
3,18,25195,22,7,525,0
4,18,22800,38,7,523,0


#### 5. Split the data into training and test sets.

In [10]:
X, y = df.drop(columns='gt_avg'), df.gt_avg

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)

In [12]:
X.sample(2)

Unnamed: 0,year,mileage,state,make,model
62564,19,21413,44,15,652
193362,7,87936,49,5,13


### By Hand
- Now we can further split our data into training and validate data sets:

- This creates an additional "test" data set, or "VALIDATE" data set, by doing the same split on the training data itself.

- this allows us to test this data on different model/methodology... so we can then pick the best one and THEN use THAT ONE on the test data.

In [13]:
X_train, X_validate, y_train, y_validate = train_test_split(X_train, y_train, test_size=.3333)

- Now we can explore and feature creation on this smaller training data set, and test it on the validate, like we would normally do on train/test sets.  When it looks good, we can then actually test it on the actual test set.

- Now do the k-crossvalidation, by breaking it into several smaller chunks, and cross testing of the chunks.
- Breaking train into 3 K's, means training on chunk 1-2, and testing on 3, training on chunks 2-3, and testing on 1, and training on 1-3 and testing on 2

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [19]:
tree = DecisionTreeClassifier(max_depth=4)
cross_val_score(tree, X_train, y_train, cv=4)  # 'cv' = number of cross validation folds

array([0.63235405, 0.63790846, 0.6348531 , 0.64469676])

In [20]:
tree = DecisionTreeClassifier(max_depth=5)
cross_val_score(tree, X_train, y_train, cv=4)  # 'cv' = number of cross validation folds

array([0.64599854, 0.65346659, 0.64935425, 0.65715868])

- throwing ".mean()" at the end averages the scores spit out by the cross_val_score tool.  See below.  This is how you'd compare models.

- Below, it generates the cross_val_score, like in a confusion matrix.  We can score each of the confusion matrix measures.

In [23]:
tree = DecisionTreeClassifier(max_depth=5)
cross_val_score(tree, X_train, y_train, cv=4, scoring='precision').mean()  # 'cv' = number of cross validation folds

0.6313004436812996

- By default, sklearn gives the accuracy measure from the confusion matrix.

In [24]:
tree = DecisionTreeClassifier(max_depth=5)
cross_val_score(tree, X_train, y_train, cv=4, scoring='accuracy').mean()  # 'cv' = number of cross validation folds

0.6514945150948327

## What happens in the multi-class scenario?

In [27]:
from pydataset import data
from sklearn.metrics import classification_report

iris = data('iris')
iris.columns = [c.lower().replace('.', '_') for c in iris]

X, y = iris.drop(columns = 'species'), iris.species

tree = DecisionTreeClassifier(max_depth=3)

tree.fit(X, y)

actual = y
predictions = tree.predict(X)

print(classification_report(actual, predictions))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        50
  versicolor       0.98      0.94      0.96        50
   virginica       0.94      0.98      0.96        50

   micro avg       0.97      0.97      0.97       150
   macro avg       0.97      0.97      0.97       150
weighted avg       0.97      0.97      0.97       150



In [29]:
cross_val_score(tree, X, y, scoring='precision_macro')



array([0.98148148, 0.92156863, 0.98039216])

## Grid Search

- Below is using the X_train and y_train from the cars data above:

In [31]:
from sklearn.model_selection import GridSearchCV

hyperparameters = {
    'max_depth' : [3, 4, 5],
    'max_features' : [None, 2, 3],
}


grid = GridSearchCV(DecisionTreeClassifier(), param_grid = hyperparameters, cv=3)
# we can also change the confusion matrix measures here
# like precision, recall, accuracy, etc.

grid.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [3, 4, 5], 'max_features': [None, 2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [32]:
results = grid.cv_results_

results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_max_features', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'mean_train_score', 'std_train_score'])

In [34]:
scores = results['mean_test_score']
scores

array([0.62966133, 0.61949058, 0.5758558 , 0.63750338, 0.59276719,
       0.62430532, 0.64997766, 0.61558214, 0.63024036])

In [35]:
params = results['params']
params

[{'max_depth': 3, 'max_features': None},
 {'max_depth': 3, 'max_features': 2},
 {'max_depth': 3, 'max_features': 3},
 {'max_depth': 4, 'max_features': None},
 {'max_depth': 4, 'max_features': 2},
 {'max_depth': 4, 'max_features': 3},
 {'max_depth': 5, 'max_features': None},
 {'max_depth': 5, 'max_features': 2},
 {'max_depth': 5, 'max_features': 3}]

In [37]:
for s, p in zip(scores, params):
    p['score'] = s
    
params

[{'max_depth': 3, 'max_features': None, 'score': 0.6296613316382083},
 {'max_depth': 3, 'max_features': 2, 'score': 0.6194905813565615},
 {'max_depth': 3, 'max_features': 3, 'score': 0.5758557968870959},
 {'max_depth': 4, 'max_features': None, 'score': 0.6375033829073492},
 {'max_depth': 4, 'max_features': 2, 'score': 0.5927671867427795},
 {'max_depth': 4, 'max_features': 3, 'score': 0.624305323909445},
 {'max_depth': 5, 'max_features': None, 'score': 0.6499776570770421},
 {'max_depth': 5, 'max_features': 2, 'score': 0.6155821432842209},
 {'max_depth': 5, 'max_features': 3, 'score': 0.6302403595007773}]

In [38]:
pd.DataFrame(params).sort_values(by='score')

Unnamed: 0,max_depth,max_features,score
2,3,3.0,0.575856
4,4,2.0,0.592767
7,5,2.0,0.615582
1,3,2.0,0.619491
5,4,3.0,0.624305
0,3,,0.629661
8,5,3.0,0.63024
3,4,,0.637503
6,5,,0.649978
