In [1]:
# Python ≥3.5 is required
import sys, os, multiprocessing, csv
assert sys.version_info >= (3, 5)

from urllib import request, error
from PIL import Image
from io import BytesIO

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os
import math

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

output = True

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "lab3"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [2]:
def load_data(data_file): #adapted from https://www.kaggle.com/xiuchengwang/python-dataset-download
    csvfile = open(data_file, 'r')
    csvreader = csv.reader(csvfile)
    key_url_list = [line[:12] for line in csvreader]
    return key_url_list[1:]  # Chop off header

In [3]:
import pandas as pd
train_passengers = pd.read_csv("C:\\Users\\Riley\\anaconda3\\handson-ml2-master\\Labs\\titanic\\train.csv")
test_passengers = pd.read_csv("C:\\Users\\Riley\\anaconda3\\handson-ml2-master\\Labs\\titanic\\test.csv")

In [4]:
train_passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
test_passengers.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
for passengers in (train_passengers, test_passengers):
    passengers.drop(columns=["Name", "Cabin", "PassengerId", "Ticket"], inplace = True)

In [7]:
def countNaN (dataset):
    NaNcount = 0
    for data in dataset:
        if math.isnan(data):
            NaNcount = NaNcount + 1
    return NaNcount

In [8]:
print(countNaN(train_passengers["Age"]))

177


In [9]:
print(countNaN(test_passengers["Age"]))

86


In [10]:
cols = list(train_passengers.columns) #reorder to make cols more consistent w/ test vals
temp = cols[0]
for catNum in range(len(cols[1:])):
    cols[catNum] = cols[catNum + 1]
cols[len(cols) - 1] = temp
new_train_passengers = train_passengers.reindex(columns = cols)

In [11]:
new_train_passengers.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,male,22.0,1,0,7.25,S,0
1,1,female,38.0,1,0,71.2833,C,1
2,3,female,26.0,0,0,7.925,S,1
3,1,female,35.0,1,0,53.1,S,1
4,3,male,35.0,0,0,8.05,S,0


In [12]:
from sklearn.preprocessing import OrdinalEncoder

x = [["male", "female"]]
ord_enc = OrdinalEncoder(categories = x)

train_sex_enc = ord_enc.fit_transform(new_train_passengers[["Sex"]])
test_sex_enc = ord_enc.fit_transform(test_passengers[["Sex"]])

In [13]:
import statistics

def fill_missing(frame):
    medlist = {}
    for catnum in range(len(frame.columns)):
        med = frame.median()[catnum]
        medlist.update({frame.columns[catnum]: med})
    #print(medlist)
    frame.fillna(value = medlist, axis = 0, inplace = True)

In [14]:
new_train_passengers.head(6)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,male,22.0,1,0,7.25,S,0
1,1,female,38.0,1,0,71.2833,C,1
2,3,female,26.0,0,0,7.925,S,1
3,1,female,35.0,1,0,53.1,S,1
4,3,male,35.0,0,0,8.05,S,0
5,3,male,,0,0,8.4583,Q,0


In [15]:
train_sex_enc[:10]

array([[0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.]])

In [16]:
from sklearn.preprocessing import OneHotEncoder

x = [['S','C','Q']]
one_hot = OneHotEncoder(categories = x, sparse=False)

new_train_passengers["Embarked"].fillna(value='S', axis=0, inplace=True)
test_passengers["Embarked"].fillna(value='S', axis=0, inplace=True)

train_emb_enc = one_hot.fit_transform(new_train_passengers[["Embarked"]])
test_emb_enc = one_hot.fit_transform(test_passengers[["Embarked"]])

In [17]:
def add_enc_attribs(dataframe, names, encoded_atribs, keep_final=False):
    temp_num = 0
    if(keep_final):
        temp_num = 1
        
    for num in range(len(names)):
        dataframe.insert(len(dataframe.columns) - temp_num, names[num], encoded_atribs.T[num])

In [18]:
names = ["EmbS", "EmbC", "EmbQ"]

enc_train_passengers = new_train_passengers
enc_train_passengers[["Sex"]] = train_sex_enc
enc_train_passengers.drop(columns = "Embarked", inplace = True)
add_enc_attribs(enc_train_passengers, names, train_emb_enc, keep_final=True)
fill_missing(enc_train_passengers)

enc_test_passengers = test_passengers
enc_test_passengers[["Sex"]] = test_sex_enc
enc_test_passengers.drop(columns = "Embarked", inplace = True)
add_enc_attribs(enc_test_passengers, names, test_emb_enc)
fill_missing(enc_test_passengers)

In [19]:
enc_train_passengers.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,EmbS,EmbC,EmbQ,Survived
0,3,0.0,22.0,1,0,7.25,1.0,0.0,0.0,0
1,1,1.0,38.0,1,0,71.2833,0.0,1.0,0.0,1
2,3,1.0,26.0,0,0,7.925,1.0,0.0,0.0,1
3,1,1.0,35.0,1,0,53.1,1.0,0.0,0.0,1
4,3,0.0,35.0,0,0,8.05,1.0,0.0,0.0,0


In [20]:
enc_train_passengers.head(6)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,EmbS,EmbC,EmbQ,Survived
0,3,0.0,22.0,1,0,7.25,1.0,0.0,0.0,0
1,1,1.0,38.0,1,0,71.2833,0.0,1.0,0.0,1
2,3,1.0,26.0,0,0,7.925,1.0,0.0,0.0,1
3,1,1.0,35.0,1,0,53.1,1.0,0.0,0.0,1
4,3,0.0,35.0,0,0,8.05,1.0,0.0,0.0,0
5,3,0.0,28.0,0,0,8.4583,0.0,0.0,1.0,0


In [21]:
enc_train_passengers_X = enc_train_passengers.drop(columns = ["Survived"], inplace = False)
enc_train_passengers_y = []
for value in enc_train_passengers[["Survived"]].to_numpy():
    enc_train_passengers_y.append(value[0])

In [22]:
enc_train_passengers_X.loc[61]

Pclass     1.0
Sex        1.0
Age       38.0
SibSp      0.0
Parch      0.0
Fare      80.0
EmbS       1.0
EmbC       0.0
EmbQ       0.0
Name: 61, dtype: float64

In [23]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(enc_train_passengers_X, enc_train_passengers_y)
SGDvalues = sgd_clf.predict(enc_test_passengers)

In [24]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators = 120, max_features = 7)
forest_clf.fit(enc_train_passengers_X, enc_train_passengers_y)
forestvalues = forest_clf.predict(enc_test_passengers)

In [25]:
forest_clf

RandomForestClassifier(max_features=7, n_estimators=120)

In [26]:
forest_clf.feature_importances_

array([0.10049212, 0.29663225, 0.2485398 , 0.0490769 , 0.02579159,
       0.24706265, 0.0155787 , 0.01114948, 0.0056765 ])

In [27]:
#len(SGDvalues)
#len(forestvalues)

In [28]:
SGDfilename = "C:\\Users\\Riley\\anaconda3\\handson-ml2-master\\Labs\\titanic\\SGDClassifier.csv"
RandomForestFilename = "C:\\Users\\Riley\\anaconda3\\handson-ml2-master\\Labs\\titanic\\RandomForestClassifier.csv"

filenames = {}

filenames.update({SGDfilename: SGDvalues})
filenames.update({RandomForestFilename: forestvalues})

if output == True:
    for filename in filenames:
        with open(filename, 'w', newline='') as csvfile:
            # creating a csv writer object  
            csvwriter = csv.writer(csvfile)  

            # writing the fields  
            csvwriter.writerow(["PassengerId","Survived"])
    
            for num in range(418):
                csvwriter.writerow([num+892, filenames[filename][num]])

'''
scores: (accuracy)
    SGD classifier 0.73205
    Random Forest 0.74880, bootstrap false 0.75598
'''


'\nscores: (accuracy)\n    SGD classifier 0.73205\n    Random Forest 0.74880, bootstrap false 0.75598\n'