# Logistic Regression

In [5]:
from pydataset import data
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import env
import acquire


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

### Do your work for these exercises in either a notebook or a python script named model.

1. Fit the logistic regression classifier to your training sample and transform, i.e. make predictions on the training sample
2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.
3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
4. Look in the scikit-learn documentation to research the solver parameter. What is your best option(s) for the particular problem you are trying to solve and the data to be used?
5. Run through steps 2-4 using another solver (from question 5)
6. Which performs better on your in-sample data?

In [None]:
iris = acquire.get_iris_data()
iris.head()

In [None]:
X = iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = iris[['species']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123, stratify=None)

In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
logit = LogisticRegression(random_state = 123, solver='newton-cg')

In [None]:
logit.fit(X_train, y_train)

In [None]:
y_pred=logit.predict(X_train)
y_pred_proba=logit.predict_proba(X_train)
y_pred_proba
# y_pred

In [None]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'.format(logit.score(X_train,y_train)))

In [None]:
confusion_matrix(y_train,y_pred)

In [None]:
#Making the confusion matrix "pretty"
labels=sorted(y_train.species.unique())
pretty_cm=pd.DataFrame(confusion_matrix(y_train,y_pred),index=labels,columns=labels)

In [None]:
pretty_cm

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
type(classification_report(y_train, y_pred))

my project deserves a better presenter than me

# Decision Tree

### Continue working in your model file.

1. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)
2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.
3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
4. Run through steps 2-4 using entropy as your measure of impurity.
5. Which performs better on your in-sample data?

In [None]:
iris = acquire.get_iris_data()
iris.head()

In [None]:
X = iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = iris[['species']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123, stratify=None)

In [None]:
X_train.head()

In [None]:
y_train.head()

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=123)

In [None]:
clf.fit(X_train, y_train)

In [None]:
print(clf.feature_importances_)

In [None]:
y_pred = clf.predict(X_train)
y_pred[0:5]

In [None]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

In [None]:
confusion_matrix(y_train, y_pred)

In [None]:
sorted(y_train.species.unique())

In [None]:
y_train.species.value_counts()

In [None]:
labels = sorted(y_train.species.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

In [None]:
from sklearn.datasets import load_iris

iris = load_iris()
clf = DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)

import graphviz

from graphviz import Graph

dot_data = export_graphviz(clf, out_file=None) 
graph = graphviz.Source(dot_data) 

graph.render('iris_decision_tree', view=True)

# Random Forest

### Continue working in your model file.

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.
2. Evaluate your results using the model score, confusion matrix, and classification report.
3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
4. Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.
5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [7]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import acquire
import prepare

In [None]:
iris = acquire.get_iris_data()
iris.head()

In [None]:
X = iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = iris[['species']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 123, stratify=None)

In [None]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)

In [None]:
rf.fit(X_train, y_train)

In [None]:
print(rf.feature_importances_)

In [None]:
y_pred = rf.predict(X_train)

In [None]:
y_pred_proba = rf.predict_proba(X_train)

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_test, y_test)))

# K-Nearest Neighbor

### Continue working in your model notebook or python script.

1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)
2. Evaluate your results using the model score, confusion matrix, and classification report.
3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
4. Run through steps 2-4 setting k to 10
5. Run through setps 2-4 setting k to 20
6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?


Test

For both the iris and the titanic data,

1. Determine which model (with hyperparameters) performs the best (try reducing the number of features to the top 4 features in terms of information gained for each feature individually).
2. Create a new dataframe with top 4 features.
3. Use the top performing algorithm with the metaparameters used in that model. Create the object, fit, transform on in-sample data, and evaluate the results with the training data. Compare your evaluation metrics with those from the original model (with all the features).
4. Run your final model on your out-of-sample dataframe (test_df). Evaluate the results.


Feature Engineering

Titanic Data
Create a feature named who, this should be either man, woman, or child. How does including this feature affect your model's performance?
Create a feature named adult_male that is either a 1 or a 0. How does this affect your model's predictions?
Iris Data
Create features named petal_area and sepal_area.


In [8]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import acquire
import prepare

### Iris dataset

In [None]:
iris = acquire.get_iris_data()
iris.head()

In [None]:
X = iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = iris[['species']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 123, stratify=None)

In [None]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [None]:
knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_train)

In [None]:
y_pred_proba = knn.predict_proba(X_train)

In [None]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

In [None]:
import matplotlib.pyplot as plt
k_range = range(1, 20)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors = k)
    knn.fit(X_train, y_train)
    scores.append(knn.score(X_test, y_test))
plt.figure()
plt.xlabel('k')
plt.ylabel('accuracy')
plt.scatter(k_range, scores)
plt.xticks([0,5,10,15,20])

### Titanic dataset

In [12]:
titanic = acquire.get_titanic_data()
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [13]:
dft = titanic

In [14]:
dft.embarked.value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: embarked, dtype: int64

In [15]:
dft.fillna(np.nan, inplace=True)

In [16]:
dft.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [18]:
X = dft[['pclass', 'age', 'fare', 'embark_town']]
y = dft[['survived']]

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 123, stratify=None)

In [20]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [21]:
knn.fit(X_train, y_train)

ValueError: could not convert string to float: 'Cherbourg'