In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import acquire
import prepare

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix

In [None]:
titanic2 = pd.read_csv('titanic_df.csv')
titanic2.head()

In [None]:
titanic2.info()

##  Question 1 - Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:

- What is your baseline prediction? 

- What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). 

- When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [None]:
df = acquire.get_titanic_data()
df.head(2)


In [None]:
df = prepare.prep_titanic_data(df)
df.head()


In [None]:
df.info()

In [None]:
df

In [None]:
df.age = df.age.fillna(df.age.mean())


In [None]:
df.info()


In [None]:
df = df.drop(columns=['pclass', 'embarked', 'embarked_encode', 'passenger_id'])


In [None]:
df = pd.get_dummies(df, ['sex', 'class', 'embark_town'], drop_first=True)
df.head()


In [None]:
train, validate, test = prepare.split(df, stratify_by='survived')


In [None]:
X_train = train.drop(columns=["survived"])
y_train = train.survived

X_validate = validate.drop(columns=["survived"])
y_validate = validate.survived

X_test = test.drop(columns=["survived"])
y_test = test.survived

In [None]:
X_train.head()


In [None]:
# X = titanic[['pclass', 'fare']]
# y = titanic.survived

# X_train_and_validate, X_test, y_train_and_validate, y_test = train_test_split(X, y, random_state=123, test_size=.3)
# X_train, X_validate, y_train, y_validate = train_test_split(X_train_and_validate, y_train_and_validate, random_state=123, test_size=.2)

# print("train: ", X_train.shape, ", validate: ", X_validate.shape, ", test: ", X_test.shape)
# print("train: ", y_train.shape, ", validate: ", y_validate.shape, ", test: ", y_test.shape)

In [None]:
y_train.value_counts()


In [None]:
baseline = y_train.mode()


matches_baseline_prediction = (y_train == 0)

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

## Question 2

- Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
tree = DecisionTreeClassifier(max_depth = 3, random_state=123)

In [None]:
tree = tree.fit(X_train, y_train)


In [None]:
print(export_text(tree, feature_names=X_train.columns.tolist()))

In [None]:
y_predictions = tree.predict(X_train)
y_predictions

In [None]:
plt.figure(figsize=(12, 7))
plot_tree(tree, feature_names=X_train.columns, class_names=['0','1'])
plt.show()

## Question 3

- Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [None]:
#confusematrix
pd.DataFrame(confusion_matrix(y_train, y_predictions))

In [None]:
pd.crosstab(y_train, y_predictions)

In [None]:
print(classification_report(y_train, y_predictions))

## Question 4

Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(tree.score(X_train, y_train)))

In [None]:
print(classification_report(y_train, y_predictions))

In [None]:
# Produce the classification report on the actual y values and this model's predicted y values
report = classification_report(y_train, y_predictions, output_dict=True)
print("Tree of 1 depth")
pd.DataFrame(report)


## Question 4

- Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
TN, FP, FN, TP = confusion_matrix(y_train, y_predictions).ravel()


In [None]:
TN, FP, FN, TP


In [None]:
negative_cases = TN + FP
positive_cases = FN + TP
print(f"Negative Cases: {negative_cases}")
print(f"Positive Cases: {positive_cases}")
print(y_train.value_counts())

In [None]:
ALL = TP + FP + FN + TN

accuracy = (TP + TN)/ALL
true_positive_rate = sensitivity = recall = power = TP/(TP+FN)
false_positive_rate = false_alarm_ratio = fallout = FP/(FP+TN)
true_negative_rate = specificity = selectivity = TN/(TN+FP)
false_negative_rate = miss_rate = FN/(FN+TP)
precision = PPV = TP/(TP+FP)
f1_score = 2*(precision*recall)/(precision+recall)
support_pos = TP + FN
support_neg = FP + TN

print(f"Accuracy: {accuracy}", '\n')
print(f"True Positive Rate/Sensitivity/Recall/Power: {true_positive_rate}", '\n')
print(f"False Positive Rate/False Alarm Ratio/Fall-out: {false_positive_rate}", '\n')
print(f"True Negative Rate/Specificity/Selectivity: {true_negative_rate}", '\n')
print(f"False Negative Rate/Miss Rate: {false_negative_rate}", '\n')
print(f"Precision/PPV: {precision}", '\n')
print(f"F1 Score: {f1_score}", '\n')
print(f"Support (0): {support_neg}", '\n')
print(f"Support (1): {support_pos}")

## Question 5

Run through steps 2-4 using a different max_depth value.

In [None]:
for i in range(1, 21):
    # Make the model
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    tree = tree.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    y_predictions = tree.predict(X_train)

    # Produce the classification report on the actual y values and this model's predicted y values
    report = classification_report(y_train, y_predictions, output_dict=True)
    print(f" Tree with max depth of {i}")
    print(pd.DataFrame(report))
    print()

## Question 6 

- Which model performs better on your in-sample data?

In [None]:
# Max depth of 15+ produces the highest accuracy

## Question 7 

- Which model performs best on your out-of-sample data, the validate set?

In [None]:
# tree.score(X_validate, y_validate) 

In [None]:
# y_validate.value_counts(normalize=True)

In [None]:
#better accurace to guess not survived in the data

In [None]:
metrics = []

for i in range(1, 25):
    # Make the model
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    tree = tree.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = tree.score(X_train, y_train)
    
    out_of_sample_accuracy = tree.score(X_validate, y_validate)

    output = {
        "max_depth": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

In [None]:
plt.figure(figsize=(12, 6))
plt.plot(df.max_depth, df.train_accuracy, marker = 'o')
plt.plot(df.max_depth, df.validate_accuracy, marker = 'o')
plt.title('Overfitting Occurs at Higher Values for Max Depth')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.show()


In [None]:
df[df.difference <= 0.10].sort_values(by=['validate_accuracy', 'difference'], ascending = [False,True])


 ## Work through these same exercises using the Telco dataset.

- What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). 
- When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [None]:
telco = pd.read_csv('telco.csv')
telco.head()

In [None]:
telco.isna().sum()

## Random forest Excercizes 

- Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings("ignore")

import acquire
import prepare

In [None]:
# Acquire Step
df = acquire.get_titanic_data()
df.head(2)

In [None]:
# prepare the data
df = prepare.prep_titanic_data(df)
df.head()

In [None]:
df.age = df.age.fillna(df.age.mean())

In [None]:
df = df.drop(columns=['pclass', 'embarked', 'embarked_encode', 'passenger_id'])


In [None]:
df = pd.get_dummies(df, ['sex', 'class', 'embark_town'], drop_first=True)
df.head()

In [None]:
train, validate, test = prepare.split(df, stratify_by='survived')


In [None]:
X_train = train.drop(columns=["survived"])
y_train = train.survived

X_validate = validate.drop(columns=["survived"])
y_validate = validate.survived

X_test = test.drop(columns=["survived"])
y_test = test.survived

In [None]:
X_train.head()


In [None]:
X_train.shape, X_validate.shape, X_test.shape


In [None]:
y_train.value_counts()


In [None]:
baseline = y_train.mode()

# Produce a boolean array with True representing a match between the baseline prediction and reality
matches_baseline_prediction = y_train == 0

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline prediction: {baseline[0]}")
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

In [None]:
forest1 = RandomForestClassifier(max_depth=3, random_state=123)

# Fit the model (on train and only train)
forest1.fit(X_train, y_train)

# Use the model
# We'll evaluate the model's performance on train, first
y_predictions = forest1.predict(X_train)

# Produce the classification report on the actual y values and this model's predicted y values
report = classification_report(y_train, y_predictions, output_dict=True)
print("Tree of depth 3")
pd.DataFrame(report)

In [None]:
pd.DataFrame(confusion_matrix(y_predictions, y_train))


In [None]:
TN, FP, FN, TP = confusion_matrix(y_train,y_predictions).ravel()
ALL = TP + TN + FP + FN

TP, TN, FP, FN

In [None]:
accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

In [None]:
for i in range(2, 11):
    # Make the model
    forest = RandomForestClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    forest = forest.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    y_predictions = forest.predict(X_train)

    # Produce the classification report on the actual y values and this model's predicted y values
    report = classification_report(y_train, y_predictions, output_dict=True)
    print(f"Tree with max depth of {i}")
    print(pd.DataFrame(report))
    print()

In [None]:
metrics = []

for i in range(2, 25):
    # Make the model
    forest = RandomForestClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    forest = forest.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = forest.score(X_train, y_train)
    
    out_of_sample_accuracy = forest.score(X_validate, y_validate)

    output = {
        "max_depth": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

In [None]:
df.set_index('max_depth').plot(figsize = (16,9))
plt.ylabel('Accuracy')
plt.xticks(np.arange(0,21,1))
plt.grid()


- Increasing min_samples_per_leaf, decreasing max_depth

In [None]:
metrics = []
max_depth = 20

for i in range(2, max_depth):
    # Make the model
    depth = max_depth - i
    n_samples = i
    forest = RandomForestClassifier(max_depth=depth, min_samples_leaf=n_samples, random_state=123)

    # Fit the model (on train and only train)
    forest = forest.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = forest.score(X_train, y_train)
    
    out_of_sample_accuracy = forest.score(X_validate, y_validate)

    output = {
        "min_samples_per_leaf": n_samples,
        "max_depth": depth,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

In [None]:
df.set_index('max_depth')[['train_accuracy', 'validate_accuracy','difference']].plot(figsize = (16,9))
plt.ylabel('Accuracy')
plt.xticks(np.arange(0,21,1))
plt.grid()

- What about a fixed depth and increasing min_samples_leaf

In [None]:
metrics = []


for i in range(2, 50):
    # Make the model
    depth = 6
    n_samples = i
    forest = RandomForestClassifier(max_depth=depth, min_samples_leaf=n_samples, random_state=123)

    # Fit the model (on train and only train)
    forest = forest.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = forest.score(X_train, y_train)
    
    out_of_sample_accuracy = forest.score(X_validate, y_validate)

    output = {
        "min_samples_per_leaf": n_samples,
        "max_depth": depth,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

In [None]:
df.set_index('min_samples_per_leaf')[['train_accuracy', 'validate_accuracy', 'difference']].plot(figsize = (16,9))
plt.ylabel('Accuracy')
plt.xticks(np.arange(0,50,5))
plt.grid()

In [None]:
# train, validate, test = train_validate_test_split(titanic, target='survived', seed=123)

# # Explore your data here. 

# # create X & y version of train, where y is a series with just the target variable and X are all the features. 

# X_train = train.drop(columns=['survived'])
# y_train = train.survived

# X_validate = validate.drop(columns=['survived'])
# y_validate = validate.survived

# X_test = test.drop(columns=['survived'])
# y_test = test.survived


In [None]:
# X_train

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(max_depth=3, 
                            random_state=123)
rf

In [None]:
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_train)
y_pred

- Evaluate your results using the model score, confusion matrix, and classification report.

In [None]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

- Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.



In [None]:
forest1 = RandomForestClassifier(max_depth=1, random_state=123)

# Fit the model (on train and only train)
forest1.fit(X_train, y_train)

# Use the model
# We'll evaluate the model's performance on train, first
y_predictions = forest1.predict(X_train)

# Produce the classification report on the actual y values and this model's predicted y values
report = classification_report(y_train, y_predictions, output_dict=True)
pd.DataFrame(report)

In [None]:
TN, FP, FN, TP = confusion_matrix(y_train, y_predictions).ravel()

TN, FP, FN, TP

In [None]:
ALL = TP + FP + FN + TN

accuracy = (TP + TN)/ALL
true_positive_rate = sensitivity = recall = power = TP/(TP+FN)
false_positive_rate = false_alarm_ratio = fallout = FP/(FP+TN)
true_negative_rate = specificity = selectivity = TN/(TN+FP)
false_negative_rate = miss_rate = FN/(FN+TP)
precision = PPV = TP/(TP+FP)
f1_score = 2*(precision*recall)/(precision+recall)
support_pos = TP + FN
support_neg = FP + TN

print(f"Accuracy: {accuracy}", '\n')
print(f"True Positive Rate/Sensitivity/Recall/Power: {true_positive_rate}", '\n')
print(f"False Positive Rate/False Alarm Ratio/Fall-out: {false_positive_rate}", '\n')
print(f"True Negative Rate/Specificity/Selectivity: {true_negative_rate}", '\n')
print(f"False Negative Rate/Miss Rate: {false_negative_rate}", '\n')
print(f"Precision/PPV: {precision}", '\n')
print(f"F1 Score: {f1_score}", '\n')
print(f"Support (0): {support_neg}", '\n')
print(f"Support (1): {support_pos}")

- Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [None]:
for i in range(6, 15):
    # Make the model
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    tree = tree.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    y_predictions = tree.predict(X_train)

    # Produce the classification report on the actual y values and this model's predicted y values
    report = classification_report(y_train, y_predictions, output_dict=True)
    print(f" Tree with max depth of {i}")
    print(pd.DataFrame(report))
    print()

- What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [None]:
metrics = []

for i in range(1, 25):
    # Make the model
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    tree = tree.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = tree.score(X_train, y_train)
    
    out_of_sample_accuracy = tree.score(X_validate, y_validate)

    output = {
        "max_depth": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

In [None]:
plt.figure(figsize=(12, 6))
plt.plot(df.max_depth, df.train_accuracy, marker = 'o')
plt.plot(df.max_depth, df.validate_accuracy, marker = 'o')
plt.title('Overfitting Occurs at Higher Values for Max Depth')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.show()

# KNN exercizes 

#### - Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [2]:
# Acquire Step
df = acquire.get_titanic_data()
df.head(2)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0


In [3]:
df = prepare.prep_titanic_data(df)
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_encode
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,3
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1,3
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0,3
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1,3


In [4]:
df.age = df.age.fillna(df.age.mean())

In [5]:
df = df.drop(columns=['embarked', 'embarked_encode', 'passenger_id'])

In [6]:
df = pd.get_dummies(df, ['sex', 'class', 'embark_town'], drop_first=True)
df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,class_Second,class_Third,embark_town_Other,embark_town_Queenstown,embark_town_Southampton
0,0,3,22.0,1,0,7.25,0,1,0,1,0,0,1
1,1,1,38.0,1,0,71.2833,0,0,0,0,0,0,0
2,1,3,26.0,0,0,7.925,1,0,0,1,0,0,1
3,1,1,35.0,1,0,53.1,0,0,0,0,0,0,1
4,0,3,35.0,0,0,8.05,1,1,0,1,0,0,1


In [None]:
train, validate, test = prepare.split(df, stratify_by='survived')

In [None]:
X_train = train.drop(columns=["survived"])
y_train = train.survived

X_validate = validate.drop(columns=["survived"])
y_validate = validate.survived

X_test = test.drop(columns=["survived"])
y_test = test.survived

In [None]:
X_train.head()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn1 = KNeighborsClassifier(n_neighbors=1, weights='uniform')

In [None]:
knn1.fit(X_train, y_train)

In [None]:
y_pred = knn1.predict(X_train)
y_pred[:5]

In [None]:
y_train[:5]

### - Evaluate your results using the model score, confusion matrix, and classification report.



In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
pd.crosstab(y_train, y_pred)

In [None]:
print(classification_report(y_train, y_pred))

### - Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
TN, FP, FN, TP = confusion_matrix(y_train, y_pred).ravel()
TN, FP, FN, TP

In [None]:
TP = 307
FP = 0
FN = 2
TN = 189

ALL = TP + FP + FN + TN

accuracy = (TP + TN)/ALL
true_positive_rate = sensitivity = recall = power = TP/(TP+FN)
false_positive_rate = false_alarm_ratio = fallout = FP/(FP+TN)
true_negative_rate = specificity = selectivity = TN/(TN+FP)
false_negative_rate = miss_rate = FN/(FN+TP)
precision = PPV = TP/(TP+FP)
f1_score = 2*(precision*recall)/(precision+recall)
support_pos = TP + FN
support_neg = FP + TN

print(f"Accuracy: {accuracy}", '\n')
print(f"True Positive Rate/Sensitivity/Recall/Power: {true_positive_rate}", '\n')
print(f"False Positive Rate/False Alarm Ratio/Fall-out: {false_positive_rate}", '\n')
print(f"True Negative Rate/Specificity/Selectivity: {true_negative_rate}", '\n')
print(f"False Negative Rate/Miss Rate: {false_negative_rate}", '\n')
print(f"Precision/PPV: {precision}", '\n')
print(f"F1 Score: {f1_score}", '\n')
print(f"Support (0): {support_neg}", '\n')
print(f"Support (1): {support_pos}")

### - Run through steps 2-4 setting k to 10

In [None]:
knn10 = KNeighborsClassifier(n_neighbors=10, weights='uniform')

In [None]:
knn10.fit(X_train, y_train)

In [None]:
y_pred = knn10.predict(X_train)
y_pred[:10]

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
TP = 273
FP = 34
FN = 95
TN = 96

ALL = TP + FP + FN + TN

accuracy = (TP + TN)/ALL
true_positive_rate = sensitivity = recall = power = TP/(TP+FN)
false_positive_rate = false_alarm_ratio = fallout = FP/(FP+TN)
true_negative_rate = specificity = selectivity = TN/(TN+FP)
false_negative_rate = miss_rate = FN/(FN+TP)
precision = PPV = TP/(TP+FP)
f1_score = 2*(precision*recall)/(precision+recall)
support_pos = TP + FN
support_neg = FP + TN

print(f"Accuracy: {accuracy}", '\n')
print(f"True Positive Rate/Sensitivity/Recall/Power: {true_positive_rate}", '\n')
print(f"False Positive Rate/False Alarm Ratio/Fall-out: {false_positive_rate}", '\n')
print(f"True Negative Rate/Specificity/Selectivity: {true_negative_rate}", '\n')
print(f"False Negative Rate/Miss Rate: {false_negative_rate}", '\n')
print(f"Precision/PPV: {precision}", '\n')
print(f"F1 Score: {f1_score}", '\n')
print(f"Support (0): {support_neg}", '\n')
print(f"Support (1): {support_pos}")

### - Run through setps 2-4 setting k to 20

In [None]:
knn20 = KNeighborsClassifier(n_neighbors=20, weights='uniform')

In [None]:
knn20.fit(X_train, y_train)

In [None]:
y_pred = knn20.predict(X_train)
y_pred[:20]

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
TP = 270
FP = 37
FN = 103
TN = 88

ALL = TP + FP + FN + TN

accuracy = (TP + TN)/ALL
true_positive_rate = sensitivity = recall = power = TP/(TP+FN)
false_positive_rate = false_alarm_ratio = fallout = FP/(FP+TN)
true_negative_rate = specificity = selectivity = TN/(TN+FP)
false_negative_rate = miss_rate = FN/(FN+TP)
precision = PPV = TP/(TP+FP)
f1_score = 2*(precision*recall)/(precision+recall)
support_pos = TP + FN
support_neg = FP + TN

print(f"Accuracy: {accuracy}", '\n')
print(f"True Positive Rate/Sensitivity/Recall/Power: {true_positive_rate}", '\n')
print(f"False Positive Rate/False Alarm Ratio/Fall-out: {false_positive_rate}", '\n')
print(f"True Negative Rate/Specificity/Selectivity: {true_negative_rate}", '\n')
print(f"False Negative Rate/Miss Rate: {false_negative_rate}", '\n')
print(f"Precision/PPV: {precision}", '\n')
print(f"F1 Score: {f1_score}", '\n')
print(f"Support (0): {support_neg}", '\n')
print(f"Support (1): {support_pos}")

### - What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [None]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn1.score(X_train, y_train)))

In [None]:
#looks like this one is better becasue of the accuracy at 1

In [None]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn10.score(X_train, y_train)))

In [None]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn20.score(X_train, y_train)))

### -Which model performs best on our out-of-sample data from validate?

In [None]:
print('Accuracy of KNN (k=1) classifier on validate set: {:.2f}'
     .format(knn1.score(X_validate, y_validate)))

print('Accuracy of KNN (k=10) classifier on validate set: {:.2f}'
     .format(knn10.score(X_validate, y_validate)))

print('Accuracy of KNN (k=20) classifier on validate set: {:.2f}'
     .format(knn20.score(X_validate, y_validate)))

In [None]:
#looks like 10 or 20 is the best 

In [None]:
import matplotlib.pyplot as plt
k_range = range(1, 20)
train_scores = []
validate_scores = []
test_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors = k)
    knn.fit(X_train, y_train)
    train_scores.append(knn.score(X_train, y_train))
    validate_scores.append(knn.score(X_validate, y_validate))
    test_scores.append(knn.score(X_test, y_test))
plt.figure()
plt.xlabel('k')
plt.ylabel('accuracy')
plt.plot(k_range, train_scores, label='Train')
plt.plot(k_range, validate_scores, label='Validate')
plt.plot(k_range, test_scores, label='Test')
plt.legend()
plt.xticks([0,5,10,15,20])
plt.show()

 ## Once you have completed work on the titanic dataset, try building some knn models with your telco data.

In [None]:
df = acquire.get_telco_data()
df.head(2)

In [None]:
df.isna().sum()

## Logistic Regression 

- Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [11]:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [8]:
df

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,class_Second,class_Third,embark_town_Other,embark_town_Queenstown,embark_town_Southampton
0,0,3,22.000000,1,0,7.2500,0,1,0,1,0,0,1
1,1,1,38.000000,1,0,71.2833,0,0,0,0,0,0,0
2,1,3,26.000000,0,0,7.9250,1,0,0,1,0,0,1
3,1,1,35.000000,1,0,53.1000,0,0,0,0,0,0,1
4,0,3,35.000000,0,0,8.0500,1,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,1,1,1,0,0,0,1
887,1,1,19.000000,0,0,30.0000,1,0,0,0,0,0,1
888,0,3,29.699118,1,2,23.4500,0,0,0,1,0,0,1
889,1,1,26.000000,0,0,30.0000,1,1,0,0,0,0,0


In [16]:
train, validate, test = prepare.split(df, stratify_by='survived')
train

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,class_Second,class_Third,embark_town_Other,embark_town_Queenstown,embark_town_Southampton
583,0,1,36.000000,0,0,40.1250,1,1,0,0,0,0,0
165,1,3,9.000000,0,2,20.5250,0,1,0,1,0,0,1
50,0,3,7.000000,4,1,39.6875,0,1,0,1,0,0,1
259,1,2,50.000000,0,1,26.0000,0,0,1,0,0,0,1
306,1,1,29.699118,0,0,110.8833,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
313,0,3,28.000000,0,0,7.8958,1,1,0,1,0,0,1
636,0,3,32.000000,0,0,7.9250,1,1,0,1,0,0,1
222,0,3,51.000000,0,0,8.0500,1,1,0,1,0,0,1
485,0,3,29.699118,3,1,25.4667,0,0,0,1,0,0,1


In [17]:
X_train = train.drop(columns=["survived"])
y_train = train.survived

X_validate = validate.drop(columns=["survived"])
y_validate = validate.survived

X_test = test.drop(columns=["survived"])
y_test = test.survived

In [18]:
train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [22]:
X_train = train.drop(columns=["survived", "fare", "pclass"])
y_train = train.survived

X_validate = validate.drop(columns=["survived", "fare", "pclass"])
y_validate = validate.survived

X_test = test.drop(columns=["survived", "fare", "pclass"])
y_test = test.survived

In [23]:
logit = LogisticRegression(C=1 , random_state=123)

In [24]:
logit.fit(X_train, y_train)

LogisticRegression(C=1, random_state=123)

In [26]:
y_pred = logit.predict(X_train)
y_pred

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,

In [27]:
print("Logistic Regression using age, pclass, and fare features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

Logistic Regression using age, pclass, and fare features
Accuracy of Logistic Regression classifier on training set: 0.82


- Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [40]:
X_train = train.drop(columns=["survived", "fare", "pclass", "sex_male"])
y_train = train.survived

X_validate = validate.drop(columns=["survived", "fare", "pclass", "sex_male"])
y_validate = validate.survived

X_test = test.drop(columns=["survived", "fare", "pclass", "sex_male"])
y_test = test.survived

In [41]:
logit2 = LogisticRegression(C=1 , random_state=123)

In [42]:
logit2.fit(X_train, y_train)

LogisticRegression(C=1, random_state=123)

In [44]:
print("Logistic Regression using age, pclass, and fare features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit2.score(X_train, y_train)))

Logistic Regression using age, pclass, and fare features
Accuracy of Logistic Regression classifier on training set: 0.71


- Try out other combinations of features and models.

In [45]:
X_train = train.drop(columns=["survived", "fare", "sex_male"])
y_train = train.survived

X_validate = validate.drop(columns=["survived", "fare", "sex_male"])
y_validate = validate.survived

X_test = test.drop(columns=["survived", "fare", "sex_male"])
y_test = test.survived



In [46]:
logit3 = LogisticRegression(C=1 , random_state=123)

In [47]:
logit3.fit(X_train, y_train)

LogisticRegression(C=1, random_state=123)

In [49]:
print("Logistic Regression using age, pclass, and fare features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit3.score(X_train, y_train)))

Logistic Regression using age, pclass, and fare features
Accuracy of Logistic Regression classifier on training set: 0.72


In [55]:
X_train = train.drop(columns=["survived", "fare"])
y_train = train.survived

X_validate = validate.drop(columns=["survived", "fare"])
y_validate = validate.survived

X_test = test.drop(columns=["survived", "fare"])
y_test = test.survived

In [56]:
logit4 = LogisticRegression(C=1 , random_state=123)

In [57]:
logit4.fit(X_train, y_train)

LogisticRegression(C=1, random_state=123)

In [58]:
print("Logistic Regression using age, pclass, and fare features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit4.score(X_train, y_train)))

Logistic Regression using age, pclass, and fare features
Accuracy of Logistic Regression classifier on training set: 0.82


- Use you best 3 models to predict and evaluate on your validate sample.

In [50]:
y_pred_validate = logit.predict(X_validate)
y_pred_validate2 = logit3.predict(X_validate)

In [53]:
print("Model 1: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit.score(X_validate, y_validate)))

print(classification_report(y_validate, y_pred_validate))

print("Model 2: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit3.score(X_validate, y_validate)))

print(classification_report(y_validate, y_pred_validate2))

Model 1: solver = lbfgs, c = 1
Accuracy: 0.72
              precision    recall  f1-score   support

           0       0.74      0.85      0.79       132
           1       0.68      0.51      0.58        82

    accuracy                           0.72       214
   macro avg       0.71      0.68      0.69       214
weighted avg       0.71      0.72      0.71       214

Model 2: solver = lbfgs, c = 1
Accuracy: 0.72
              precision    recall  f1-score   support

           0       0.74      0.85      0.79       132
           1       0.68      0.51      0.58        82

    accuracy                           0.72       214
   macro avg       0.71      0.68      0.69       214
weighted avg       0.71      0.72      0.71       214



- Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [None]:
features = ["age", "pclass", "fare", "is_female"]

y_pred = logit1.predict(X_validate[features])

print('Logit1 model using age, pclass, fare, and is_female as the features')
print(classification_report(y_validate, y_pred))

In [None]:
y_pred = logit2.predict(X_validate)

print("Logit2 model using all features and all model defaults")
print(classification_report(y_validate, y_pred))

In [None]:
y_pred = logit3.predict(X_validate)

print("Logit3 model using all features, class_weight='balanced', and all other hyperparameters as default")
print(classification_report(y_validate, y_pred))