In [1]:
import numpy as np
import pandas as pd
from pydataset import data

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import acquire
import prepare


from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report

from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix

Using cached csv
Using cached csv


In [2]:
df = acquire.get_titanic_data()
df = prepare.prep_titanic(df)

Using cached csv


In [3]:
df= df.drop(columns = ['sex', 'embark_town'])
df.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,1,0,7.25,0,1,0,1
1,1,1,1,0,71.2833,0,0,0,0
2,1,3,0,0,7.925,1,0,0,1
3,1,1,1,0,53.1,0,0,0,1
4,0,3,0,0,8.05,1,1,0,1


In [4]:
def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [5]:
train, validate, test = train_validate_test_split(df, target='survived', seed = 123)

In [6]:
train.shape

(498, 9)

In [7]:
validate.shape

(214, 9)

In [8]:
test.shape

(179, 9)

In [9]:
# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [10]:
X_train.shape

(498, 8)

In [11]:
y_train.value_counts()

0    307
1    191
Name: survived, dtype: int64

## QUESTION 1 = BASELINE

In [12]:
# The mode(MOST OCCURING) is a great baseline
baseline = y_train.mode()

# Produce a boolean array with True representing a match between the baseline prediction and reality
matches_baseline_prediction = (y_train == 0)

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

Baseline accuracy: 0.62


## QUESTION 2: FIT - TRANSFORMATION

In [13]:
# make the model

tree1 = DecisionTreeClassifier(max_depth=1, random_state=123)

# Fit the model (on train and only train)
tree1 = tree1.fit(X_train, y_train)

# Use the model
# We'll evaluate the model's performance on train, first
y_predictions = tree1.predict(X_train)


## QUESTION 3: EVALUATE PERFORMANCE

In [14]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(tree1.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.80


In [15]:
# confusion matrix - actual on left predicted on top
pd.DataFrame(confusion_matrix(y_train, y_predictions))

Unnamed: 0,0,1
0,265,42
1,58,133


In [16]:
# create classification report
print(classification_report(y_train, y_predictions))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.76      0.70      0.73       191

    accuracy                           0.80       498
   macro avg       0.79      0.78      0.78       498
weighted avg       0.80      0.80      0.80       498



In [17]:
# Produce the classification report on the actual y values and this model's predicted

report = classification_report(y_train, y_predictions, output_dict = True)
print("Tree of 1 depth")
pd.DataFrame(report)

Tree of 1 depth


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.820433,0.76,0.799197,0.790217,0.797255
recall,0.863192,0.696335,0.799197,0.779764,0.799197
f1-score,0.84127,0.726776,0.799197,0.784023,0.797358
support,307.0,191.0,0.799197,498.0,498.0


## QUESTION 4: EXTRA RE-CALCULATED METRICS

In [18]:
# NOT SURVIVED IS OUR POSITIVE CASE '0'

TP = 265
FP = 58
FN = 42
TN = 133
ALL = TP + FP + FN + TN

accuracy = (TP + TN) / ALL
print(f'Accuracy: {accuracy}')

true_positive_rate = TP/(TP + TN)
print(f'True Positive Rate: {true_positive_rate}')

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")


Accuracy: 0.7991967871485943
True Positive Rate: 0.6658291457286433
False Positive Rate: 0.3036649214659686
True Negative Rate: 0.6963350785340314
False Negative Rate: 0.13680781758957655
Precision: 0.8204334365325078
Recall: 0.8631921824104235
F1 Score: 0.8412698412698413
Support (0): 307
Support (1): 191


## QUESTION 5: FINDING OPTIMAL MAX_DEPTH

In [19]:
for i in range (2,21):
    # Make the model
    tree = DecisionTreeClassifier (max_depth = i, random_state = 123)
    
    # Fit the model ( on train only)
    tree = tree.fit(X_train, y_train)
    
    # Use the model
    # We'll  evaluate the model's performance on train, first
    y_predictions = tree.predict(X_train)
    
    # Produce the classification report on the actual y values and this model's predicted y values
    report = classification_report(y_train, y_predictions, output_dict = True)
    print(f'Tree of {i} depth')
    print(pd.DataFrame(report))
    print()

Tree of 2 depth
                    0           1  accuracy   macro avg  weighted avg
precision    0.820433    0.760000  0.799197    0.790217      0.797255
recall       0.863192    0.696335  0.799197    0.779764      0.799197
f1-score     0.841270    0.726776  0.799197    0.784023      0.797358
support    307.000000  191.000000  0.799197  498.000000    498.000000

Tree of 3 depth
                    0           1  accuracy   macro avg  weighted avg
precision    0.828829    0.812121  0.823293    0.820475      0.822421
recall       0.899023    0.701571  0.823293    0.800297      0.823293
f1-score     0.862500    0.752809  0.823293    0.807654      0.820430
support    307.000000  191.000000  0.823293  498.000000    498.000000

Tree of 4 depth
                    0           1  accuracy   macro avg  weighted avg
precision    0.829341    0.817073  0.825301    0.823207      0.824636
recall       0.902280    0.701571  0.825301    0.801925      0.825301
f1-score     0.864275    0.754930  0.825

## QUESTIONS 6: FURTHER EVALUATION

In [20]:
# max depth of 14+ produced the highest accuracy

## QUESTION 7: VALIDATION

In [21]:
metrics = []

for i in range (2, 25):
    tree = DecisionTreeClassifier(max_depth = i, random_state=123)
    
    # Fit the model
    tree = tree.fit(X_train, y_train)
    
    in_sample_accuracy = tree.score(X_train, y_train)
    
    out_of_sample_accuracy = tree.score(X_validate, y_validate)
    
    output = {
        'max_depth':i,
        'train_accuracy': in_sample_accuracy,
        'validate_accuracy': out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df['difference'] = df.train_accuracy - df.validate_accuracy
df

Unnamed: 0,max_depth,train_accuracy,validate_accuracy,difference
0,2,0.799197,0.761682,0.037515
1,3,0.823293,0.785047,0.038246
2,4,0.825301,0.785047,0.040254
3,5,0.837349,0.757009,0.08034
4,6,0.859438,0.766355,0.093083
5,7,0.863454,0.761682,0.101772
6,8,0.89759,0.757009,0.140581
7,9,0.909639,0.761682,0.147956
8,10,0.923695,0.766355,0.15734
9,11,0.931727,0.761682,0.170045


In [22]:
df[df.difference <= 0.10].sort_values(by=['validate_accuracy', 'difference'], ascending =[False,True])

Unnamed: 0,max_depth,train_accuracy,validate_accuracy,difference
1,3,0.823293,0.785047,0.038246
2,4,0.825301,0.785047,0.040254
4,6,0.859438,0.766355,0.093083
0,2,0.799197,0.761682,0.037515
3,5,0.837349,0.757009,0.08034


In [23]:
df = acquire.get_titanic_data()
df = prepare.prep_titanic(df)

Using cached csv


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   survived                 891 non-null    int64  
 1   pclass                   891 non-null    int64  
 2   sex                      891 non-null    object 
 3   sibsp                    891 non-null    int64  
 4   parch                    891 non-null    int64  
 5   fare                     891 non-null    float64
 6   embark_town              891 non-null    object 
 7   alone                    891 non-null    int64  
 8   sex_male                 891 non-null    uint8  
 9   embark_town_Queenstown   891 non-null    uint8  
 10  embark_town_Southampton  891 non-null    uint8  
dtypes: float64(1), int64(5), object(2), uint8(3)
memory usage: 65.3+ KB


In [25]:
df.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,1,0,7.25,Southampton,0,1,0,1
1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,0,0,7.925,Southampton,1,0,0,1
3,1,1,female,1,0,53.1,Southampton,0,0,0,1
4,0,3,male,0,0,8.05,Southampton,1,1,0,1


In [26]:
train_val, test = train_test_split(df,
                                  train_size = 0.8,
                                  random_state=123,
                                  stratify=df.survived)

In [27]:
train, val = train_test_split(train_val,
                                  train_size = 0.7,
                                  random_state=123,
                                  stratify=train_val.survived)

In [28]:
train.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,0,1,male,0,0,40.125,Cherbourg,1,1,0,0
165,1,3,male,0,2,20.525,Southampton,0,1,0,1
50,0,3,male,4,1,39.6875,Southampton,0,1,0,1
259,1,2,female,0,1,26.0,Southampton,0,0,0,1
306,1,1,female,0,0,110.8833,Cherbourg,1,0,0,0


In [29]:
x_train = train.drop(columns=['survived'])

y_train = train[['survived']]

In [30]:
type(y_train)

pandas.core.frame.DataFrame

In [31]:
type(train['survived'])

pandas.core.series.Series

In [32]:
y_train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [33]:
y_train['baseline'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train['baseline'] = 0


In [34]:
baseline_score = accuracy_score(y_train.survived, y_train.baseline)

In [35]:
baseline_score

0.6164658634538153

## Model creation
step one: Create the thing step two: fit the thing step three: use the thing

In [36]:
#create the thing

In [37]:
selected_feats = ['sex_male', 'pclass']

In [38]:
# create the classifier object

clf = RandomForestClassifier(random_state = 123)

In [39]:
# fit the thing:
# invoke the fit method, don't assign it to anything
# this will train our model

selected_feats

['sex_male', 'pclass']

In [40]:
#use the thing

clf.fit(x_train[selected_feats], y_train.survived)

RandomForestClassifier(random_state=123)

In [41]:
y_train['y_pred'] = clf.predict(x_train[selected_feats])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train['y_pred'] = clf.predict(x_train[selected_feats])


In [42]:
y_train.head()

Unnamed: 0,survived,baseline,y_pred
583,0,0,0
165,1,0,0
50,0,0,0
259,1,0,1
306,1,0,1


# KNN EXERCISES

In [43]:
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import MinMaxScaler

In [44]:
train.head()


Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,0,1,male,0,0,40.125,Cherbourg,1,1,0,0
165,1,3,male,0,2,20.525,Southampton,0,1,0,1
50,0,3,male,4,1,39.6875,Southampton,0,1,0,1
259,1,2,female,0,1,26.0,Southampton,0,0,0,1
306,1,1,female,0,0,110.8833,Cherbourg,1,0,0,0


In [45]:
kndf = acquire.get_titanic_data()
kndf = prepare.prep_titanic(kndf)

Using cached csv


In [46]:
kndf.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,1,0,7.25,Southampton,0,1,0,1
1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,0,0,7.925,Southampton,1,0,0,1
3,1,1,female,1,0,53.1,Southampton,0,0,0,1
4,0,3,male,0,0,8.05,Southampton,1,1,0,1


In [47]:
kndf = kndf.drop(columns = ['sex', 'embark_town'])

In [48]:
kndf.head(2)

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,1,0,7.25,0,1,0,1
1,1,1,1,0,71.2833,0,0,0,0


In [49]:
train_val, test = train_test_split(kndf,
                                  train_size=0.8,
                                  stratify=kndf.survived,
                                  random_state=123)



In [50]:
train, validate = train_test_split(train_val,
                                  train_size=0.7,
                                  stratify=train_val.survived,
                                  random_state=123)

In [51]:
train.shape, validate.shape, test.shape

((498, 9), (214, 9), (179, 9))

In [52]:
x_cols = ['fare', 'alone', 'pclass','sex_male']
y_col = 'survived'

X_train, y_train = train[x_cols], train[y_col]
X_validate, y_validate = validate[x_cols], validate[y_col]
X_test, y_test = test[x_cols], test[y_col]


In [53]:
X_train.head()

Unnamed: 0,fare,alone,pclass,sex_male
583,40.125,1,1,1
165,20.525,0,3,1
50,39.6875,0,3,1
259,26.0,0,2,0
306,110.8833,1,1,0


In [54]:
# knn classifier with n_neighbour = 1

knn1 = KNeighborsClassifier(1)
knn1.fit(X_train, y_train)
# get_classification_metrics(knn)
y_pred = knn1.predict(X_train)

In [55]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94       307
           1       0.93      0.87      0.90       191

    accuracy                           0.92       498
   macro avg       0.92      0.91      0.92       498
weighted avg       0.92      0.92      0.92       498



In [56]:
report = classification_report(y_train, y_pred, output_dict=True)
print('n_neighbor = 1')
pd.DataFrame(report)

n_neighbor = 1


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.92163,0.927374,0.923695,0.924502,0.923833
recall,0.957655,0.86911,0.923695,0.913382,0.923695
f1-score,0.939297,0.897297,0.923695,0.918297,0.923189
support,307.0,191.0,0.923695,498.0,498.0


In [57]:
confusion_matrix(y_train, y_pred)

array([[294,  13],
       [ 25, 166]])

In [58]:
print('Actual on Left, Predicted on Top')
pd.crosstab(y_train, y_pred)

Actual on Left, Predicted on Top


col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,294,13
1,25,166


In [59]:
# Lets be nicer and say that the "positive" outcome is survival = 1
TN, FP, FN, TP = confusion_matrix(y_train,y_pred).ravel()
ALL = TP + TN + FP + FN

TN, FP, FN, TP 

(294, 13, 25, 166)

In [60]:
def show_scores(TN, FP, FN, TP):
    
    ALL = TP + TN + FP + FN
    
    accuracy = (TP + TN)/ALL # How often did the model get it right?
    precision = TP/(TP+FP) # What is the quality of a positive prediction made by the model?
    recall = TP/(TP+FN) # How many of the true positives were found?   
    
    true_positive_rate = TP/(TP+FN) # Same as recall, actually
    true_negative_rate = TN/(TN+FP) # How many of the true negatives were found?
    false_positive_rate = FP/(FP+TN) # How often did we miss the negative and accidentally call it positive?
    false_negative_rate = FN/(FN+TP) # How often did we miss the positive and accidentally call it negative?
    
    f1_score = 2*(precision*recall)/(precision+recall) # Harmonic mean, good for imbalanced data sets
    support_pos = TP + FN # Number of actual positives in the sample
    support_neg = FP + TN # Number of actual negatives in the sample
    
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"True Positive Rate: {true_positive_rate}")
    print(f"True Negative Rate: {true_negative_rate}")
    print(f"False Positive Rate: {false_positive_rate}")
    print(f"False Negative Rate: {false_negative_rate}")
    print(f"F1 Score: {f1_score}")
    print(f"Support (0): {support_pos}")
    print(f"Support (1): {support_neg}")

In [61]:
show_scores(TN, FP, FN, TP)

Accuracy: 0.9236947791164659
Precision: 0.9273743016759777
Recall: 0.8691099476439791
True Positive Rate: 0.8691099476439791
True Negative Rate: 0.9576547231270358
False Positive Rate: 0.04234527687296417
False Negative Rate: 0.13089005235602094
F1 Score: 0.8972972972972973
Support (0): 191
Support (1): 307


In [62]:
# knn classifier with n_neighbor = 10

knn2 = KNeighborsClassifier(10)
knn2.fit(X_train, y_train)
y_pred = knn2.predict(X_train)

report = classification_report(y_train, y_pred, output_dict=True)
print("n_neighbour = 10")
pd.DataFrame(report)

n_neighbour = 10


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.81875,0.747191,0.793173,0.782971,0.791305
recall,0.85342,0.696335,0.793173,0.774878,0.793173
f1-score,0.835726,0.720867,0.793173,0.778296,0.791674
support,307.0,191.0,0.793173,498.0,498.0


In [63]:
# Confusion matrix
print('Actual on Left, Predicted on Top')
pd.crosstab(y_train, y_pred)

Actual on Left, Predicted on Top


col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,262,45
1,58,133


In [64]:
# knn classifier with n_neighbor = 20

knn3 = KNeighborsClassifier(20)
knn3.fit(X_train, y_train)
y_pred = knn3.predict(X_train)

report = classification_report(y_train, y_pred, output_dict=True)
print('n_neighbor = 20')
pd.DataFrame(report)

n_neighbor = 20


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.769461,0.695122,0.74498,0.732292,0.740949
recall,0.837134,0.596859,0.74498,0.716996,0.74498
f1-score,0.801872,0.642254,0.74498,0.722063,0.740653
support,307.0,191.0,0.74498,498.0,498.0


In [65]:
# confusion_matrix
print('actual on left, predicted on top')
pd.crosstab(y_train, y_pred)

actual on left, predicted on top


col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,257,50
1,77,114


# LOGISTIC REGRESSION EXERCISES


In [66]:
from sklearn.linear_model import LogisticRegression

In [67]:
ldf = acquire.get_titanic_data()


ldf.head()

Using cached csv


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [68]:
avg_age = ldf.age.mean()
avg_age 

29.69911764705882

In [69]:

ldf.drop_duplicates(inplace =True )   
ldf = ldf.drop(columns = ['deck', 'embarked', 'class', 'passenger_id'] )
ldf['embark_town'] = ldf.embark_town.fillna('Southampton')
avg_age = ldf.age.mean()
ldf['age'] = ldf.age.fillna(avg_age)
dummy_df = pd.get_dummies(ldf[['sex', 'embark_town']], dummy_na=False, drop_first = [True, True])
ldf = pd.concat([ldf, dummy_df], axis =1)
ldf.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,22.0,1,0,7.25,Southampton,0,1,0,1
1,1,1,female,38.0,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,26.0,0,0,7.925,Southampton,1,0,0,1
3,1,1,female,35.0,1,0,53.1,Southampton,0,0,0,1
4,0,3,male,35.0,0,0,8.05,Southampton,1,1,0,1


In [70]:
ldf = ldf.drop(columns = ['sex', 'embark_town'])
ldf.head(2)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,22.0,1,0,7.25,0,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0,0


In [71]:
train_val, test = train_test_split(ldf, train_size =.8, 
                                   random_state = 123,
                                  stratify = ldf.survived)

In [72]:
train, validate = train_test_split(train_val, train_size =.7,
                                  random_state = 123,
                                  stratify = train_val.survived)

In [73]:
train.shape, validate.shape, test.shape

((498, 10), (214, 10), (179, 10))

In [74]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 498 entries, 583 to 744
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   survived                 498 non-null    int64  
 1   pclass                   498 non-null    int64  
 2   age                      498 non-null    float64
 3   sibsp                    498 non-null    int64  
 4   parch                    498 non-null    int64  
 5   fare                     498 non-null    float64
 6   alone                    498 non-null    int64  
 7   sex_male                 498 non-null    uint8  
 8   embark_town_Queenstown   498 non-null    uint8  
 9   embark_town_Southampton  498 non-null    uint8  
dtypes: float64(2), int64(5), uint8(3)
memory usage: 32.6 KB


In [75]:
X_train = train.drop(columns = 'survived')
y_train = train.survived

X_validate = validate.drop(columns = 'survived')
y_validate = validate.survived

X_test = test.drop(columns = 'survived')
y_test = test.survived

In [76]:
y_train.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [77]:
baseline = y_train.mode()

matches_baseline_prediction = (y_train == 0)

baseline_accuracy = matches_baseline_prediction.mean()
print(f'Baseline Accuracy: {round(baseline_accuracy, 3)}')

Baseline Accuracy: 0.616


In [78]:
# Define the logistic regression model
# C - default is 1.0 .. smaller values specify stronger regularization
logit = LogisticRegression(C =1, random_state = 123)


In [79]:
logit.fit(X_train[['age', 'pclass', 'fare']], y_train)

LogisticRegression(C=1, random_state=123)

In [80]:
# use the model to make predictions
y_pred = logit.predict(X_train[['age', 'pclass', 'fare']])

In [81]:
# take a look at the predictions (survived 1, or not survived 0)
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,

In [82]:
# look at predicted probabilities for the first 10 observations
logit.predict_proba(X_train[['age', 'pclass', 'fare']])[:10]

array([[0.36987005, 0.63012995],
       [0.63806591, 0.36193409],
       [0.61743881, 0.38256119],
       [0.70383651, 0.29616349],
       [0.30458292, 0.69541708],
       [0.56359759, 0.43640241],
       [0.65720071, 0.34279929],
       [0.55318395, 0.44681605],
       [0.7719031 , 0.2280969 ],
       [0.75619804, 0.24380196]])

In [83]:
# look at the classes from the predictions above

logit.classes_

array([0, 1])

In [84]:
# view raw probabilities (output from the model)

y_pred_proba = logit.predict_proba(X_train[['age', 'pclass', 'fare']])
y_pred_proba = pd.DataFrame(y_pred_proba, columns = [0,1])
y_pred_proba.head().round(3)

Unnamed: 0,0,1
0,0.37,0.63
1,0.638,0.362
2,0.617,0.383
3,0.704,0.296
4,0.305,0.695


In [85]:
# classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.87      0.78       307
           1       0.67      0.44      0.53       191

    accuracy                           0.70       498
   macro avg       0.69      0.65      0.66       498
weighted avg       0.70      0.70      0.69       498



# MODEL 2

In [86]:
logit2 = LogisticRegression(C = 1, random_state = 123)

In [87]:
# adding 'sex_male' to the features we are using to predict

logit2.fit(X_train[['age', 'pclass', 'fare', 'sex_male']], y_train)

LogisticRegression(C=1, random_state=123)

In [88]:
# make prediction

y_pred2 = logit2.predict(X_train[['age', 'pclass', 'fare', 'sex_male']])

In [89]:
# classification report

print(classification_report(y_train, y_pred2))

              precision    recall  f1-score   support

           0       0.84      0.87      0.85       307
           1       0.77      0.73      0.75       191

    accuracy                           0.81       498
   macro avg       0.80      0.80      0.80       498
weighted avg       0.81      0.81      0.81       498



# MODEL 3

In [106]:
logit3 = LogisticRegression(C = 1, random_state = 123)

In [107]:
logit3.fit(X_train[['age', 'pclass', 'alone', 'sex_male']], y_train)

LogisticRegression(C=1, random_state=123)

In [108]:
y_pred3 = logit3.predict(X_train[['age', 'pclass', 'alone', 'sex_male']])

In [109]:
print(classification_report(y_train, y_pred3))

              precision    recall  f1-score   support

           0       0.83      0.86      0.85       307
           1       0.76      0.71      0.74       191

    accuracy                           0.81       498
   macro avg       0.80      0.79      0.79       498
weighted avg       0.80      0.81      0.80       498



## Evaluate model 1 and 2 performance on 'validate'

In [111]:
# make prediction for validate dataset
y_pred_validate = logit.predict(X_validate[['age', 'pclass', 'fare']])
y_pred_validate2 = logit2.predict(X_validate[['age', 'pclass', 'fare', 'sex_male']])

In [113]:
print("Model 1: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit.score(X_validate[['age', 'pclass', 'fare']], y_validate)))

print(classification_report(y_validate, y_pred_validate))

print('--------------------------------------------------')

print("Model 2: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit2.score(X_validate[['age', 'pclass', 'fare', 'sex_male']], y_validate)))

print(classification_report(y_validate, y_pred_validate2))

Model 1: solver = lbfgs, c = 1
Accuracy: 0.71
              precision    recall  f1-score   support

           0       0.71      0.89      0.79       132
           1       0.70      0.43      0.53        82

    accuracy                           0.71       214
   macro avg       0.71      0.66      0.66       214
weighted avg       0.71      0.71      0.69       214

--------------------------------------------------
Model 2: solver = lbfgs, c = 1
Accuracy: 0.78
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       132
           1       0.72      0.67      0.70        82

    accuracy                           0.78       214
   macro avg       0.76      0.76      0.76       214
weighted avg       0.77      0.78      0.77       214



In [114]:
y_pred_test2 = logit2.predict(X_test[['age', 'pclass', 'fare', 'sex_male']])

In [116]:
# run on test dataset

print("Model 2: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit2.score(X_test[['age', 'pclass', 'fare', 'sex_male']], y_test)))

print(classification_report(y_test, y_pred_test2))

Model 2: solver = lbfgs, c = 1
Accuracy: 0.80
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       110
           1       0.77      0.71      0.74        69

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



In [None]:
# you can see that the performance has increased 
#from the validate to the test datasets

# accuracy jumped from .78 on the validate data to .80 on the test data