In [None]:
# Import our libraries 

# Pandas and numpy for data wrangling

# Seaborn / matplotlib for visualization 

# Import the trees from sklearn
from sklearn import tree

# Helper function to split our data


# Helper functions to evaluate our model.


# Helper function for hyper-parameter tuning.


# Import our Decision Tree


# Import our Random Forest 


# Library for visualizing our tree
# If you get an error, run 'conda install python-graphviz' in your terminal (without the quotes).
import graphviz 

# Use inline so our visualizations display in notebook
%matplotlib inline

```py
# Import our libraries 

# Pandas and numpy for data wrangling
import numpy as np
import pandas as pd


# Seaborn / matplotlib for visualization 
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

# Import the trees from sklearn
from sklearn import tree

# Helper function to split our data
from sklearn.model_selection import train_test_split

# Helper functions to evaluate our model.
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score 


# Helper function for hyper-parameter tuning.
from sklearn.model_selection import GridSearchCV

# Import our Decision Tree
from sklearn.tree import DecisionTreeClassifier 

# Import our Random Forest 
from sklearn.ensemble import RandomForestClassifier



# Library for visualizing our tree
# If you get an error, run 'conda install python-graphviz' in your terminal (without the quotes).
import graphviz 

# Use inline so our visualizations display in notebook
%matplotlib inline
```

## Main Steps when building a Machine Learning Model. 
1. Inspect and explore data.
2. Select and engineer features.
3. Build and train model.
4. Evaluate model.

# #1 Inspect and explore data.
* Load titanic data
* Visualize all the data using sns.pairplot
* Check for null values

In [None]:
# Load in the titanic data set.


In [None]:
# Visualize all the data using sns.pairplot


In [None]:
# Check for null values


```py
# Load in the titanic data set.
# read_csv parses the file into a DataFrame; head() previews the first five rows.

df = pd.read_csv('data/titanic.csv')
df.head()

```

![image.png](attachment:4d099844-7132-4eab-8069-03a35fe19374.png)

```py
# Visualize all the data using sns.pairplot
# hue="survived" colors each point by the target column so the two classes can be compared.

sns.pairplot(df, hue="survived")

```

![image.png](attachment:e497c9d8-2888-4d6a-8943-753cbdaf477a.png)

```py
# Check for null values
# isnull() flags every missing cell; sum() then counts the flags per column.
df.isnull().sum()
```




# #2 Select and engineer features.
1. Fill age null values with -999
1. Convert to numerical values if need be by using `pd.get_dummies()`
1. Create a list of the features you are going to use.  In this case use as many or as few as you would like.
1. Define our `X` and `y`
1. Split our data into training and testing sets.

In [None]:
# Fill age null values with -999


In [None]:
# 1. Convert to numerical values if need be by using `pd.get_dummies()`


In [None]:
# 2. Create a list of the features we are going to use.
selected_features = ???


In [None]:
# Define our `X` and `y`



In [None]:
# Split our data into trainig and testing sets.

print('Lenght of our Training data:', ???, '\nLength of our Testing data:', ???)

```py
# Fill age null values with -999
# -999 acts as a sentinel so rows with a missing age stay in the data set.
df['age'] = df['age'].fillna(-999)


# 1. Convert to numerical values if need be by using `pd.get_dummies()`

# Peek at the frame before encoding (a notebook only displays a cell's last expression,
# so run this in its own cell to see the output).
df.head(3)


# One-hot encode the categorical columns; drop_first=True drops the first level of each
# so the remaining dummy columns are not redundant.
df = pd.get_dummies(df, columns=['sex', 'pclass', 'embarked'], drop_first=True)

df.head()

# 2. Create a list of the features we are going to use.
selected_features = ['fare', 'age',
                     'sex_male', 'pclass_2',
                     'pclass_3',  'sibsp', 'parch',
                     'embarked_Q','embarked_S']



# Define our `X` and `y`
X = df[selected_features]

y = df['survived']

# Split our data into training and testing sets.
# test_size=0.2 holds out 20% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

# Report the feature matrices of both splits so the two shapes are directly comparable.
print('Length of our Training data:', X_train.shape, '\nLength of our Testing data:', X_test.shape)

```

# #3 Build and train model.
1. For our first pass, initialize our model with `max_depth=2`.
2. Fit our model with our training data. 
3. Make predictions of our testing data. 
4. Evaluate and print our model scores using accuracy, precision, recall, f1 scores, and auc scores. 
    * To calculate auc score you have to get the predicted probabilities for the Survived class using `model.predict_proba(X_test)[:,1]`
5. Visualize our Decision Tree using provided code. 


In [None]:
# For our first pass, initialize our model with `max_depth=2`.

model = ???

In [None]:
# Fit our model with our training data. 



In [None]:
# Make predictions of our testing data. 



In [None]:
# 4. Evaluate and print our model scores using accuracy, precision, recall, f1 scores, and auc scores. 
accuracy = ???
print("Accuracy Score: %f" % accuracy)

precision = ???
print("Precision Score: %f" % precision)

recall = ???
print("Recall Score: %f" % recall)

f1 = ???
print('F1 Score: %f' % f1)

# Calculate predicted probabilities
y_pred_proba = model.predict_proba(???)

# Keep only the proba for True
y_pred_proba = y_pred_proba[:,1]

# Compute auc score
auc = ???
print('AUC Score: %f' % auc)

```py
# First pass: a shallow tree (`max_depth=2`) trained on the training split.
model = DecisionTreeClassifier(max_depth=2)
model.fit(X_train, y_train)

# Class predictions for the held-out test rows.
y_pred = model.predict(X_test)

# 4. Score those predictions with accuracy, precision, recall and F1.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score: %f" % accuracy)

precision = precision_score(y_test, y_pred)
print("Precision Score: %f" % precision)

recall = recall_score(y_test, y_pred)
print("Recall Score: %f" % recall)

f1 = f1_score(y_test, y_pred)
print('F1 Score: %f' % f1)

# AUC is computed from scores rather than labels: take column 1 of predict_proba,
# the predicted probability of the Survived class.
y_pred_proba = model.predict_proba(X_test)[:, 1]

auc = roc_auc_score(y_test, y_pred_proba)
print('AUC Score: %f' % auc)

```

![image.png](attachment:66061d15-8463-42b8-93e8-8e4ff5983c61.png)

# OPTIONAL:  Visualize your decision tree. 
* If you get an error, you may need to install the graphviz library.
* Run this command in your terminal to install the graphviz library. 
    * `conda install python-graphviz`
* If that does not work, then try installing the library using pip. 
    * `pip install graphviz`

* If neither of those work, you can just skip this step.

In [None]:
# OPTIONAL
# Visualize your decision tree.
# export_graphviz with out_file=None returns the tree as DOT text instead of writing a file.
dot_data = tree.export_graphviz(model, out_file=None,
                     feature_names=selected_features,
                     class_names=['died','survived'],
                     filled=True, rounded=True,
                     special_characters=True)
graph = graphviz.Source(dot_data)

# A notebook only renders a cell's last expression, so name the graph here to display it.
graph

# Picking the right parameters...

# Parameter tuning of your Decision Tree using GridSearch

1. Make a dictionary of at least 3 parameters and a list of 3 values for each for your grid search. 
1. Initialize your GridSearchCV with a DecisionTreeClassifier, your param_grid, and what you are optimizing for.  Choose any of the five optimization strategies: accuracy, precision, recall, f1, or roc_auc.
1. Fit your GridSearchCV with your training data. 
1. Print the parameters of your best model. 
1. Evaluate your best model using accuracy, precision, recall, f1 scores, and auc scores. 
1. Visualize your best tree.
1. Which feature was your most important feature?

```python
tree.DecisionTreeClassifier(
    *,
    criterion='gini',
    splitter='best',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    random_state=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    class_weight=None,
    presort='deprecated',
    ccp_alpha=0.0,
)
```


```py

```

[Tips on how to customize / set the parameters in the decision tree.](https://scikit-learn.org/stable/modules/tree.html#tips-on-practical-use)

In [None]:
# 1. Make a dictionary of at least 3 parameters and a list of 3 values for each for your grid search.
params = { 
    'PARAMETER_NAME': ['LIST', 'OF', 'VALUES'], ??? }

In [None]:
# 1. Initialize your GridSearchCV with a DecisionTreeClassifier, your param_grid, and what you are optimizing for.  Choose any of the five optimization strategies: accuracy, precision, recall, f1, or roc_auc.
grid_search_cv =  GridSearchCV( ??? )

In [None]:
# 1. Fit your GridSearchCV with your training data. 


In [None]:
# 1. Print the parameters of your best model. 
# Print the best parameters it found
print( ??? )

In [None]:
# 1. Evaluate your best model using accuracy, precision, recall, f1 scores, and auc scores. 

# This command gives you the best tree
model = ???

# Now lets evaluate our model
y_pred = ???

accuracy = ???
print("Accuracy Score: %f" % accuracy)

precision = ???
print("Precision Score: %f" % precision)

recall = ???
print("Recall Score: %f" % recall)

f1 = ???
print('F1 Score: %f' % f1)

# Calculate predicted probabilities, keep only probability for when class = 1
y_pred_proba = model.predict_proba(???)[:,1]
auc = ???
print('AUC Score: %f' % auc)

In [None]:
# OPTIONAL!

# 1. Visualize your best tree
# Export the fitted tree as DOT text (out_file=None returns it as a string),
# then hand that text to graphviz for rendering.
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=selected_features,
    class_names=['died', 'survived'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)

# Last expression of the cell, so the notebook renders the tree.
graph

In [None]:
# 1. Which feature was your most important feature?
# Pair each selected feature with the importance the fitted model reports for it,
# then sort so the most important feature is the first row.
feature_imp = (
    pd.DataFrame.from_dict(
        {
            'feature_importance': model.feature_importances_,
            'feature': selected_features,
        }
    )
    .sort_values('feature_importance', ascending=False)
)
feature_imp

```py
# 1. Candidate values for each decision-tree hyper-parameter.
# The grid search tries every combination of these lists.
params = dict(
    criterion=['gini', 'entropy'],
    max_depth=[2, 5, 7, 100],
    min_samples_split=[2, 10, 20, 100],
    min_samples_leaf=[1, 10, 20, 30],
)
```


```py
# 1. Set up the grid search: an untuned DecisionTreeClassifier as the base estimator,
# the candidate grid in `params`, and F1 as the score used to rank the candidates.
grid_search_cv = GridSearchCV(DecisionTreeClassifier(), params, scoring='f1')
```
    
```py
# 1. Fit your GridSearchCV with your training data.
# Only the training split is passed in, so the test split is not used while tuning.
grid_search_cv.fit(X_train, y_train)
```

```py
# 1. Print the parameters of your best model.
# best_params_ holds the parameter combination the grid search selected.

print(grid_search_cv.best_params_)
```
{'criterion': 'entropy', 'max_depth': 100, 'min_samples_leaf': 10, 'min_samples_split': 2}


```py
# 1. Evaluate your best model using accuracy, precision, recall, f1 scores, and auc scores.

# Take the tree the grid search selected and predict the held-out test rows with it.
model = grid_search_cv.best_estimator_
y_pred = model.predict(X_test)

# Label-based metrics first.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# AUC needs a score per row: keep only the probability for when class = 1.
y_pred_proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)

# Report everything in the usual order.
print("Accuracy Score: %f" % accuracy)
print("Precision Score: %f" % precision)
print("Recall Score: %f" % recall)
print('F1 Score: %f' % f1)
print('AUC Score: %f' % auc)
```
![image.png](attachment:51377986-181e-42c5-90f8-630420d95e56.png)


```py
# OPTIONAL!

# 1. Visualize your best tree
# Step 1: turn the fitted tree into DOT text (returned as a string because out_file=None).
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=selected_features,
    class_names=['died', 'survived'],
    filled=True,
    rounded=True,
    special_characters=True,
)

# Step 2: wrap the DOT text so graphviz can draw it; the bare name renders it in the notebook.
graph = graphviz.Source(dot_data)
graph
```

```py
# 1. Which feature was your most important feature?
# Build a two-column table (importance, feature name) from the fitted model and
# sort it in descending order, so the top row answers the question.
feature_imp = (
    pd.DataFrame.from_dict(
        {
            'feature_importance': model.feature_importances_,
            'feature': selected_features,
        }
    )
    .sort_values('feature_importance', ascending=False)
)
feature_imp


```

![image.png](attachment:c8d8e06b-1611-4145-b419-d91906d36cc6.png)

# Now onto Random Forests...
We're going to do the same thing, but this time with a random forest. Remember... repetition is the father of learning.

1. Make a dictionary of at least 3 parameters and a list of 3 values for each for your grid search. 
1. Initialize your GridSearchCV with a RandomForestClassifier, your param_grid, and what you are optimizing for.  Choose any of the five optimization strategies: accuracy, precision, recall, f1, or roc_auc.
1. Fit your GridSearchCV with your training data. 
1. Print the parameters of your best model. 
1. Evaluate your best model using accuracy, precision, recall, f1 scores, and auc scores. 
1. Which feature was your most important feature?


# Parameters of the Random Forest Classifier

```python
RandomForestClassifier(
    n_estimators=100,
    *,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='auto',  # NOTE: 'auto' was removed in scikit-learn 1.3; use 'sqrt' (equivalent for classifiers)
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=None,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
)
```

In [None]:
# 1. Make a dictionary of at least 3 parameters and a list of 3 values for each for your grid search. 
params = {
    'PARAMETER_1_NAME': ['LIST', 'OF', 'VALUES'], 
    'PARAMETER_2_NAME': ['LIST', 'OF', 'VALUES'],
    'PARAMETER_3_NAME': ['LIST', 'OF', 'VALUES'],
}

In [None]:
# 1. Initialize your GridSearchCV with a RandomForestClassifier, your param_grid, and what you are optimizing for.  Choose any of the five optimization strategies: accuracy, precision, recall, f1, or roc_auc.

grid_search_cv = GridSearchCV( ??? )

In [None]:
# 1. Fit your GridSearchCV with your training data. 


In [None]:
# 1. Print the parameters of your best model. 
# Print the best parameters it found
print(???)




In [None]:
# 1. Evaluate your best model using accuracy, precision, recall, f1 scores, and auc scores. 

# This command gives you tree that has the highest f1-score. 
model = grid_search_cv.best_estimator_


# Now lets evaluate our model
y_pred = ???

accuracy = ???
print("Accuracy Score: %f" % accuracy)

precision = ???
print("Precision Score: %f" % precision)

recall = ???
print("Recall Score: %f" % recall)

f1 = ???
print('F1 Score: %f' % f1)

# Calculate predicted probabilities, keep only probability for when class = 1
y_pred_proba = model.predict_proba(X_test)[:,1]

auc = ???
print('AUC Score: %f' % auc)

In [None]:
# 1. Which feature was your most important feature?
# Label each importance score from the fitted model with its feature name,
# then sort in descending order so the most important feature comes first.
feature_imp = (
    pd.Series(data=model.feature_importances_, index=selected_features)
    .sort_values(ascending=False)
)
feature_imp

```py
# 1. Make a dictionary of at least 3 parameters and a list of 3 values for each for your grid search.
# The grid search tries every combination of the lists below.
params = {
    'n_estimators': [5, 10, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 100],
    'min_samples_split': [2, 10, 100],
    # 'auto' was removed in scikit-learn 1.3 and now fails parameter validation;
    # 'sqrt' is what 'auto' meant for classifiers.
    'max_features': [2, 4, 'sqrt'],
}
```

```py
# 1. Set up the grid search: an untuned RandomForestClassifier as the base estimator,
# the candidate grid in `params`, and F1 as the score used to rank the candidates.

grid_search_cv = GridSearchCV(RandomForestClassifier(), params, scoring='f1')
```

```py
# 1. Fit your GridSearchCV with your training data.
# Only the training split is passed in, so the test split is not used while tuning.
grid_search_cv.fit(X_train, y_train)
```
![image.png](attachment:bde45b70-c57f-46c8-a510-4bdd81480ca4.png)


```py
# 1. Print the parameters of your best model.
# best_params_ holds the parameter combination the grid search selected.
print(grid_search_cv.best_params_)
```
{'criterion': 'entropy', 'max_depth': 10, 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 10}


```py
# 1. Evaluate your best model using accuracy, precision, recall, f1 scores, and auc scores.

# The grid search ranked candidates by f1, so best_estimator_ is the forest with the highest f1-score.
model = grid_search_cv.best_estimator_

# Predicted classes for the held-out test rows.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score: %f" % accuracy)

precision = precision_score(y_test, y_pred)
print("Precision Score: %f" % precision)

recall = recall_score(y_test, y_pred)
print("Recall Score: %f" % recall)

f1 = f1_score(y_test, y_pred)
print('F1 Score: %f' % f1)

# AUC is scored on probabilities: column 1 of predict_proba is the probability for class = 1.
class_probabilities = model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

auc = roc_auc_score(y_test, y_pred_proba)
print('AUC Score: %f' % auc)
```
![image.png](attachment:f53aee23-1ebc-4f91-b0c4-7abe781917b9.png)

```py
# 1. Which feature was your most important feature?
# Index the model's importance scores by feature name and list them from largest to smallest.
feature_imp = (
    pd.Series(data=model.feature_importances_, index=selected_features)
    .sort_values(ascending=False)
)
feature_imp
```

![image.png](attachment:6e8e024c-80be-4831-b32b-d6383418e039.png)


# Build a random forest using the ny-vs-sf-houses.csv data. 
* Your target variable, aka the column you are trying to predict, aka your `y` variable is `in_sf`. 
* Can you get an accuracy above 88.8889%?
* What was your most important feature?


In [None]:
# Load the NY-vs-SF housing data and preview the first five rows.
# This rebinds `df`, so the Titanic frame loaded earlier is no longer available under that name.
df = pd.read_csv('data/ny-vs-sf-houses.csv')
df.head()

In [None]:
# BUILD, TRAIN, AND EVALUATE A RANDOM FOREST MODEL BELOW. 



```py
# Load the NY-vs-SF housing data and preview the first five rows.
# This rebinds `df`, so the Titanic frame loaded earlier is no longer available under that name.
df = pd.read_csv('data/ny-vs-sf-houses.csv')
df.head()
```
![image.png](attachment:e8587703-afc9-4144-b9e9-c4cc6c1057a6.png)

```py
# Count the missing values in each column.
df.isnull().sum()
```
![image.png](attachment:ca8239ec-0bc0-4235-b308-32f56fc3ca8b.png)

```py
# Count the rows for each value of the target column `in_sf` to see how balanced the classes are.
df.groupby("in_sf")["in_sf"].count().head()
```
![image.png](attachment:6b510361-a925-4d70-9bfe-01bb5d4e254f.png)


```py
# Plot the columns against each other, coloring each point by the `in_sf` target.
sns.pairplot(df, hue="in_sf")
```
![image.png](attachment:0af3315f-90d1-493e-8672-2ae693802e5a.png)


# Awesome difficult extra credit below:
Build a classifier using the adult_income.csv data.  
* The target variable is 'class'
* Start with just using these features `selected_features = ['age', 'fnlwgt', 'capital_gain', 'capital_loss', 'hours_per_week']`
* You have to include the pos_label in your precision, recall, and f1 scores. It just tells the scoring functions which class is the positive label.  I provided the proper way below.

* See if you can get above 50% f1 score.  
* See some [super tricks and tips here](https://www.kaggle.com/code/jieyima/income-classification-model)

In [None]:
# Load the adult-income data and preview the first five rows.
# This rebinds `df` to the new data set.
df = pd.read_csv('data/adult_income.csv')
df.head()



In [None]:

# Score the classifier on the held-out rows.
# precision, recall and f1 default to pos_label=1, so this data set needs the
# positive class named explicitly through pos_label.
# NOTE(review): assumes the positive (high-income) label in df['class'] is spelled
# '>50K' — confirm with df['class'].unique() and adjust if it differs.
pos_label = '>50K'

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy Score: %f" % accuracy)

precision = precision_score(y_true=y_test, y_pred=y_pred, pos_label=pos_label)
print("Precision Score: %f" % precision)

recall = recall_score(y_true=y_test, y_pred=y_pred, pos_label=pos_label)
print("Recall Score: %f" % recall)

f1 = f1_score(y_true=y_test, y_pred=y_pred, pos_label=pos_label)
print('F1 Score: %f' % f1)

```py

```