# Our first machine learning model: Logistic Regression

In [None]:
# Import our libraries 

```py
# Import our libraries 

# Pandas and numpy for data wrangling
import pandas as pd
import numpy as np

# Seaborn / matplotlib for visualization 
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
%matplotlib inline


# Helper function to split our data
from sklearn.model_selection import train_test_split

# This is our Logit model
from sklearn.linear_model import LogisticRegression

# Helper functions to evaluate our model.
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix , f1_score




```

# Import and inspect the Titanic dataset.
* Load the titanic data set into a pandas dataframe.

In [None]:
# Load the titanic data set into a pandas dataframe.

```py
# Read the Titanic passenger records from the local CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/titanic.csv')

# Preview the first rows as the cell output.
df.head()
```

## Data dictionary
<img src='https://miro.medium.com/max/1260/1*rr3UGlpEv_PSMc1pyqa4Uw.png'>

# Identify which columns have null values. 
Inspect which variables may or may not be good candidates for features, based on how many null values they contain.


In [None]:
# Identify which columns have null values. 


```py
# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()


```

# Check to see if our data has any duplicate rows.
If so, remove the duplicates.

In [None]:
# Check to see if our data has any duplicate rows.

```py
# Count the rows that exactly repeat an earlier row.
# keep="first" is the default: the first occurrence is not counted as a duplicate.
df.duplicated(keep="first").sum()


```

# Use sns.pairplot to visualize.
* Set the hue='survived'.

In [1]:
# Use sns.pariplot to visualize.

```py
# Use sns.pairplot to visualize pairwise relationships between columns,
# colouring each point by the value of `survived`.
sns.pairplot(data=df, hue="survived")

```

# Feature Engineering
For your first model, only use `fare` and `sex` as features.
* Convert the `sex` feature to a numeric indicator (dummy) variable by using `pd.get_dummies()`.
* Drop the `sex_female` column as it is the identical inverse of `sex_male`. 
    * Hint, you can use `drop_first=True` in the `pd.get_dummies()` function to have this done automatically.
* Create a `selected_features` variable that is a list of `fare` and `sex_male`.  
* Define your X and y variables.
    * `X` is your selected features
    * `y` is your target features (survived). 
* Split your data into training and testing groups by using `train_test_split()`
    * __IMPORTANT: In `train_test_split` set `random_state=45`, so when you make another model, you can run it on the same random split of data.__

In [None]:
# Convert the sex column into a continuous variable by using pd.get_dummies


```py
# Replace the text `sex` column with a numeric indicator (dummy) column.
# drop_first=True drops the first category's column so only one indicator remains.
df = pd.get_dummies(data=df, columns=["sex"], drop_first=True)

# Preview the encoded DataFrame as the cell output.
df.head()

```

# Select our features 
   * Only use `fare` and `sex_male` as features for this model.

In [None]:
# Select our features
selected_features = []

# Set X to be the features we are going to use.
X = ???

# Set y to be our target variable. 
y = ???

```py
# Names of the columns the first model is trained on.
selected_features = ["fare", "sex_male"]

# y is the target column we want to predict.
y = df.survived

# X holds only the selected feature columns.
X = df.loc[:, selected_features]
```

# Split our data into the testing and training groups. 

In [None]:
# Split our data into testing and training.
X_train, X_test, y_train, y_test = ???

# Print the length and width of our testing data.
print(X_train.shape, X_test.shape)

```py
# Hold out 20% of the rows for testing. The fixed random_state makes the
# split repeatable, so later models can be evaluated on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=45,
)

# Print the (rows, columns) shape of the training and testing features.
print(X_train.shape, X_test.shape)

```

# Build and train your model
* Initialize an empty Logistic Regression model. 
* Fit your model with your training data. 
* Predict the values of your testing data

In [None]:
# Initalize our model


# Train our model using our training data.



```py
# Create a logistic regression classifier with the default settings and train
# it on the training split. `fit` returns the fitted estimator itself.
model = LogisticRegression().fit(X_train, y_train)

# Show the fitted model as the cell output.
model

```

# Evaluate your model
1. Make predictions of your test data and save them as `y_pred`. 
1. Calculate and print the accuracy, precision, recall, and f1 scores of your model.
    * Hint, sklearn provides helper functions for this.
1. Plot the confusion matrix of your predicted results. 
    * How many True Positives and True Negatives did your model get?

In [None]:
# 1. Make predictions of your test data and save them as `y_pred`. 


In [None]:
# 2. Calculate and print the accuracy, precision, recall, and f1 scores of your model.

# Calculate our accuracy
accuracy = ???

# Calculate our precision score
precision = ???

# Calculate our recall score
recall = ???

f1 = ???

# Print each of our scores to inspect performance.
print("Accuracy Score: %f" % accuracy)
print("Precision Score: %f" % precision)
print("Recall Score: %f" % recall)
print('F1 Score %f' % f1)

In [None]:
# 1. Plot a confusion matrix of your predicted results. 
import matplotlib.pyplot as plt 
fig = plt.figure(figsize=(8,8))
plt.xlabel('Predicted')
plt.ylabel('Actual');




In [None]:
# How many True Positives and True Negatives did your model get?
print('??? True Negatives and ??? True Positives')

```py
# 1. Make predictions of your test data and save them as `y_pred`.
y_pred = model.predict(X_test)


# 2. Calculate and print the accuracy, precision, recall, and f1 scores of your model.
# The metric helpers were imported from sklearn.metrics at the top of the notebook.

# Calculate our accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate our precision score
precision = precision_score(y_test, y_pred)

# Calculate our recall score
recall = recall_score(y_test, y_pred)

# Calculate our f1 score
f1 = f1_score(y_test, y_pred)

# Print each of our scores to inspect performance.
print("Accuracy Score: %f" % accuracy)
print("Precision Score: %f" % precision)
print("Recall Score: %f" % recall)
print('F1 Score %f' % f1)


# 3. Plot a confusion matrix of your predicted results.
# Build the matrix once and reuse it for both the heatmap and the raw counts.
# Rows are the actual classes and columns are the predicted classes.
cm = confusion_matrix(y_test, y_pred)

# For a binary target the flattened matrix is ordered tn, fp, fn, tp.
tn, fp, fn, tp = cm.ravel()

fig = plt.figure(figsize=(8,8))
ax = sns.heatmap(cm, annot=True, cmap='Greens', fmt='g')
plt.title("Confusion Matrix of Titanic Survivors")
plt.xlabel('Predicted')
plt.ylabel('Actual');


# How many True Positives and True Negatives did your model get?
print('true-negative:', tn,
      '\nfalse-positive:', fp,
      '\nfalse-negative:', fn,
      '\ntrue-positive:', tp )



```

# Create another model, call this `model_2`. This time also include the `pclass` and `embarked` features.
1. Run `pd.get_dummies()` on pclass and embarked of your DataFrame.
1. Update your `selected_features` to include the new pclass, embarked, sibsp, and parch features.
1. Define your `X` and `y` variables.
1. Break your data into training and testing groups.
    * __IMPORTANT, In `train_test_split` set `random_state=45` so we will be using the same data rows as our first model__.
1. Initialize a new model, call this one `model_2`
1. Fit / Train your new model
1. Make predictions of your test data and save them as `y_pred`. 
1. Calculate and print the accuracy, precision, recall, and f1 scores of your model.
1. Plot the confusion matrix of your predicted results. 
    * How many True Positives and True Negatives did your model get?
    
Compare the results to your first model. Which model had the better accuracy, recall, precision, and f1 score?

In [None]:
df = pd.read_csv('data/titanic.csv')

# Run pd.get_dummies on pclass and embarked of your DataFrame.


# Update your `selected_features` to include the new pclass and embarked features. 

# Define your X and y variables


# Split our data into testing and training.
# !!! Remeber to use the same random state as you used before


# Initalize our model_2
model_2 = LogisticRegression()

# Fit / Train our model using our training data.

# Make new predicitions using our testing data. 

# Calculate our accuracy
accuracy_2 = ???

# Calculate our precision score
precision_2 = ???

# Calculate our recall score
recall_2 = ???

# Calculate your f1-score
f1_2 = ???

# Print each of our scores to inspect performance.
print("Accuracy Score: %f" % accuracy_2)
print("Precision Score: %f" % precision_2)
print("Recall Score: %f" % recall_2)
print('F1 Score %f' % f1_2)

# Plot your confusion matrix.
fig = plt.figure(figsize=(8,8))
plt.xlabel('Predicted')
plt.ylabel('Actual');



```py
# Start again from the raw CSV so earlier feature engineering does not carry over.
df = pd.read_csv(filepath_or_buffer='data/titanic.csv')

# Run pd.get_dummies on pclass and embarked of your DataFrame.
# drop_first=True drops the first category's column of each encoded feature.
df = pd.get_dummies(data=df, columns=['pclass', 'embarked'], drop_first=True)

# Preview the encoded DataFrame as the cell output.
df.head()

```

```py

# The DataFrame was reloaded from the CSV for this model, so `sex` is still a
# text column. Encode it here so model_2 can reuse the first model's
# `sex_male` feature.
df = pd.get_dummies(df, columns=['sex'], drop_first=True)

# Update your `selected_features` to include the new pclass and embarked features.
# `fare` and `sex_male` from the first model are kept so that model_2 *also*
# includes the new columns instead of replacing the original ones; otherwise
# the comparison with the first model is not like-for-like.
selected_features = [
    'fare', 'sex_male',
    'pclass_2', 'pclass_3',
    'embarked_Q', 'embarked_S',
    'sibsp', 'parch',
]

# Define your X and y variables
# (uppercase `X`, matching the naming used for the other models).
X = df[selected_features]

y = df['survived']

# Split our data into testing and training.
# !!! Remember to use the same random state as you used before (45),
# so model_2 is evaluated on the same rows as the first model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)


# Initialize our model_2
model_2 = LogisticRegression()

# Fit / Train our model using our training data.
model_2.fit(X_train, y_train)

# Make new predictions using our testing data.
y_pred2 = model_2.predict(X_test)

```

```py

# Score model_2's predictions against the held-out labels.
# Each sklearn metric helper takes (true labels, predicted labels); the
# results are unpacked in the same order as the helpers are listed.
accuracy_2, precision_2, recall_2, f1_2 = (
    metric(y_test, y_pred2)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Print each of our scores to inspect performance.
print("Accuracy Score: %f" % accuracy_2)
print("Precision Score: %f" % precision_2)
print("Recall Score: %f" % recall_2)
print('F1 Score %f' % f1_2)

```

![image.png](attachment:a5fb852d-dfb0-4e70-a779-22ebdee8ff5e.png)

```py

# Plot your confusion matrix.

# Build the matrix of actual (rows) versus predicted (columns) classes.
cm = confusion_matrix(y_test, y_pred2)

# Unpack the four cells; for a binary target they are ordered tn, fp, fn, tp.
tn, fp, fn, tp = cm.ravel()

# Draw the counts as an annotated heatmap.
fig = plt.figure(figsize=(8, 8))
ax = sns.heatmap(cm, annot=True, fmt="g", cmap="Greens")
plt.xlabel('Predicted')
plt.ylabel('Actual');

```

![image.png](attachment:b5a16fc0-e5c6-4ba2-8bce-6ed1fbe37d0d.png)


# EXTRA CREDIT
* Use age as a feature. 
* How will you fill the null values?
    * Hint, use `df.age.fillna(???)`
* Make a new feature called `traveled_alone`. The `sibsp` and `parch` columns contain the number of people each passenger is traveling with. Set `traveled_alone` to 1 for everyone who has no `sibsp` and no `parch`, and to 0 for everyone else.
    * Once you have this `traveled_alone` column, you don't need to use the `sibsp` and `parch` columns in your model.

In [None]:
df = pd.read_csv('data/titanic.csv')

# Run pd.get_dummies on sex, pclass, and embarked of your DataFrame.


# Fill null age values with mean age.


# Create new traveled_alone feature


# Update your `selected_features` to include the new traveled alone and age


# Define your X and y variables


# Split our data into testing and training.
# Remeber to use the same random state as you used before
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)


# Initalize our model
model_3 = LogisticRegression()

# Fit / Train our model using our training data.

# Make new predicitions using our testing data. 


# Calculate our accuracy
accuracy_3 = 

# Calculate our precision score
precision_3 = 

# Calculate our recall score
recall_3 = 

# Calculate your f1-score
f1_3 = 

# Print each of our scores to inspect performance.
print("Accuracy Score: %f" % accuracy_3)
print("Precision Score: %f" % precision_3)
print("Recall Score: %f" % recall_3)
print('F1 Score %f' % f1_3)

# Plot your confusion matrix.
fig = plt.figure(figsize=(8,8))
plt.xlabel('Predicted')
plt.ylabel('Actual');

```py
# Start again from the raw CSV so earlier feature engineering does not carry over.
df = pd.read_csv(filepath_or_buffer='data/titanic.csv')

# Run pd.get_dummies on sex, pclass, and embarked of your DataFrame.
# drop_first=True drops the first category's column of each encoded feature.
df = pd.get_dummies(data=df, columns=['sex', 'pclass', 'embarked'], drop_first=True)

```

```py

# Compute the mean age; it is used to fill the null age values.
# Series.mean() skips missing values by default.
mean_age = df.age.mean()

print(mean_age)
```

29.69911764705882


```py
# Fill null age values with the mean age.
# Assign the result back to the column instead of calling
# `df['age'].fillna(..., inplace=True)`: that chained in-place form is
# deprecated in pandas 2.x and leaves `df` unchanged under Copy-on-Write.
df['age'] = df['age'].fillna(mean_age)

# Show the filled column as the cell output.
df['age']
```

```
0      29.699118
1      29.699118
2      29.699118
3      29.699118
4      29.699118
         ...    
886    29.699118
887    29.699118
888    29.699118
889    29.699118
890    29.699118
Name: age, Length: 891, dtype: float64
```

```py


# Create new traveled_alone feature

# A passenger traveled alone when both sibsp and parch are zero.
no_siblings_or_spouse = df["sibsp"] == 0
no_parents_or_children = df["parch"] == 0

# Store the flag as integers: 1 = traveled alone, 0 = traveled with someone.
df["traveled_alone"] = (no_siblings_or_spouse & no_parents_or_children).astype(int)

# Preview the DataFrame with the new column as the cell output.
df.head()

```

```py

# Update your `selected_features` to include the new traveled alone and age.
# `fare` and `sex_male` are kept from the earlier models: `sex` was one-hot
# encoded for this model, so leaving `sex_male` out would discard it unused.
# `traveled_alone` replaces `sibsp` and `parch`.
selected_features = [
    'fare', 'sex_male',
    'pclass_2', 'pclass_3',
    'embarked_Q', 'embarked_S',
    'traveled_alone', 'age',
]


# Define your X and y variables
X = df[selected_features]
y = df['survived']


# Split our data into testing and training.
# Remember to use the same random state as you used before (45), so model_3
# is evaluated on the same rows as the earlier models.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)


# Initialize our model
model_3 = LogisticRegression()

# Fit / Train our model using our training data.
model_3.fit(X_train, y_train)

# Make new predictions using our testing data.
y_pred_3 = model_3.predict(X_test)

# Calculate our accuracy
accuracy_3 = accuracy_score(y_test, y_pred_3)

# Calculate our precision score
precision_3 = precision_score(y_test, y_pred_3)

# Calculate our recall score
recall_3 = recall_score(y_test, y_pred_3)

# Calculate your f1-score
f1_3 = f1_score(y_test, y_pred_3)

# Print each of our scores to inspect performance.
print("Accuracy Score: %f" % accuracy_3)
print("Precision Score: %f" % precision_3)
print("Recall Score: %f" % recall_3)
print('F1 Score %f' % f1_3)

# Plot your confusion matrix.
# Build the matrix once and reuse it for both the heatmap and the raw counts.
# Rows are the actual classes and columns are the predicted classes.
cm = confusion_matrix(y_test, y_pred_3)

# For a binary target the flattened matrix is ordered tn, fp, fn, tp.
tn, fp, fn, tp = cm.ravel()

fig = plt.figure(figsize=(8,8))
ax = sns.heatmap(cm, annot=True, cmap='Greens', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual');
```

![image.png](attachment:11fa0346-4fa5-4ad8-9dc6-4e11a905b4dd.png)


