# Kaggle Titanic Machine Learning Project

## Import Libraries and Modules

In [137]:
import numpy as np #Numpy for math operations
from scipy.misc import imread, imsave, imresize #Scipy to change images
from sklearn import datasets  #Scikit-learn for machine learning and modeling
from sklearn import metrics 
from sklearn.tree import DecisionTreeClassifier
import pandas as pd #Pandas for reading and writing files
import matplotlib.pyplot as plt #Matplotlib to explore data with graphs
from pathlib import Path #to find file paths
import os #to get working directory
from sklearn.linear_model import LogisticRegression #for a logistic regression function model

## Import the data

In [138]:
train_df = pd.read_csv("train.csv") #read files from working directory into Pandas DataFrame
test_df = pd.read_csv("test.csv")

## Investigate Data for NAs

### For Train Data

In [139]:
col_names = train_df.columns #get column names
train_dim = train_df.shape #dimensions of data row x col
col_types = train_df.dtypes #what sort of data is in the column
for i in col_names:
    col = train_df[i] #selects column
    num_NA = sum(col.isna()) #sums the number of NA values for selected column
    print("The number of NA values in the column ", i, " is: ", num_NA, ".")

The number of NA values in the column  PassengerId  is:  0 .
The number of NA values in the column  Survived  is:  0 .
The number of NA values in the column  Pclass  is:  0 .
The number of NA values in the column  Name  is:  0 .
The number of NA values in the column  Sex  is:  0 .
The number of NA values in the column  Age  is:  177 .
The number of NA values in the column  SibSp  is:  0 .
The number of NA values in the column  Parch  is:  0 .
The number of NA values in the column  Ticket  is:  0 .
The number of NA values in the column  Fare  is:  0 .
The number of NA values in the column  Cabin  is:  687 .
The number of NA values in the column  Embarked  is:  2 .


### For Test Data

In [140]:
col_names = test_df.columns #get column names
test_dim = test_df.shape #dimensions of data row x col
col_types = test_df.dtypes #what sort of data is in the column
for i in col_names:
    col = test_df[i] #selects column
    num_NA = sum(col.isna()) #sums the number of NA values for selected column
    print("The number of NA values in the column ", i, " is: ", num_NA, ".")

The number of NA values in the column  PassengerId  is:  0 .
The number of NA values in the column  Pclass  is:  0 .
The number of NA values in the column  Name  is:  0 .
The number of NA values in the column  Sex  is:  0 .
The number of NA values in the column  Age  is:  86 .
The number of NA values in the column  SibSp  is:  0 .
The number of NA values in the column  Parch  is:  0 .
The number of NA values in the column  Ticket  is:  0 .
The number of NA values in the column  Fare  is:  1 .
The number of NA values in the column  Cabin  is:  327 .
The number of NA values in the column  Embarked  is:  0 .


## Replace NAs and Remove Useless Columns

### For Train data

In [141]:
train_df = train_df.drop(columns = "Cabin") #remove Cabin variable
col_names = train_df.columns #New column Names
col_types = train_df.dtypes #New what type of data in the column
train_df["Age"] = train_df["Age"].fillna(train_df["Age"].mean()) #replace NA values with the column mean for Age
for i in col_names:
    col = train_df[i] #selects column
    num_NA = sum(col.isna()) #sums the number of NA values for selected column
    print("The number of NA values in the column ", i, " is: ", num_NA, ".")

The number of NA values in the column  PassengerId  is:  0 .
The number of NA values in the column  Survived  is:  0 .
The number of NA values in the column  Pclass  is:  0 .
The number of NA values in the column  Name  is:  0 .
The number of NA values in the column  Sex  is:  0 .
The number of NA values in the column  Age  is:  0 .
The number of NA values in the column  SibSp  is:  0 .
The number of NA values in the column  Parch  is:  0 .
The number of NA values in the column  Ticket  is:  0 .
The number of NA values in the column  Fare  is:  0 .
The number of NA values in the column  Embarked  is:  2 .


### For Test Data

In [142]:
test_df = test_df.drop(columns = "Cabin") #remove Cabin variable
col_names = test_df.columns #New column Names
col_types = test_df.dtypes #New what type of data in the column
test_df["Age"] = test_df["Age"].fillna(test_df["Age"].mean()) #replace NA values with the column mean for Age
for i in col_names:
    col = test_df[i] #selects column
    num_NA = sum(col.isna()) #sums the number of NA values for selected column
    print("The number of NA values in the column ", i, " is: ", num_NA, ".")

The number of NA values in the column  PassengerId  is:  0 .
The number of NA values in the column  Pclass  is:  0 .
The number of NA values in the column  Name  is:  0 .
The number of NA values in the column  Sex  is:  0 .
The number of NA values in the column  Age  is:  0 .
The number of NA values in the column  SibSp  is:  0 .
The number of NA values in the column  Parch  is:  0 .
The number of NA values in the column  Ticket  is:  0 .
The number of NA values in the column  Fare  is:  1 .
The number of NA values in the column  Embarked  is:  0 .


### Logistic Model to fill in Embarked Missing Vals in Train Data

In [143]:
# Gotta remove string variables and make dummies for other ones
sex_dummies = pd.get_dummies(train_df["Sex"])
train_df[list(sex_dummies)] = sex_dummies
log_train_df = train_df.drop(columns = ["PassengerId", "Name", "Ticket", "Sex"])
log_train_df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,female,male
0,0,3,22.0,1,0,7.25,S,0,1
1,1,1,38.0,1,0,71.2833,C,1,0
2,1,3,26.0,0,0,7.925,S,1,0
3,1,1,35.0,1,0,53.1,S,1,0
4,0,3,35.0,0,0,8.05,S,0,1


In [146]:
# Separate Data into train and test
log_mod_train = log_train_df.loc[train_df["Embarked"].isna() == False]
log_mod_test = log_train_df.loc[train_df["Embarked"].isna() == True]
log_mod_test

# Separate Train and test into input/output 
x_train = log_mod_train.drop(columns = "Embarked")
y_train = log_mod_train["Embarked"]
x_test = log_mod_test.drop(columns = "Embarked")

#define the model for reference visit: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression
log_mod = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter = 1000) #use lbfgs for multiple classification prediction, 

#fit the data
log_fit = log_mod.fit(x_train, y_train)

#make predictions
log_pred = log_fit.predict(x_test)
log_pred

array(['S', 'S'], dtype=object)

In [147]:
#Put the predictions into the dataframe in place of NAs
#this method is not functioning
counter = 0
for index, row in train_df.iterrows():
    if pd.isnull(row["Embarked"]): #use pd.isnull(something) to check if a string object is type None or NaN
        row["Embarked"] = log_pred[counter]
        counter += 1
#train_df["Embarked"].replace(to_replace = np.nan, value = "S")


In [148]:
train_df.loc[829, "Embarked"] = "S"
train_df.loc[61, "Embarked"] = "S"

In [149]:
col_names = train_df.columns #New column Names
col_types = train_df.dtypes #New what type of data in the column
#train_df["Age"] = train_df["Age"].fillna(train_df["Age"].mean()) #replace NA values with the column mean for Age
for i in col_names:
    col = train_df[i] #selects column
    num_NA = sum(col.isna()) #sums the number of NA values for selected column
    print("The number of NA values in the column ", i, " is: ", num_NA, ".")

The number of NA values in the column  PassengerId  is:  0 .
The number of NA values in the column  Survived  is:  0 .
The number of NA values in the column  Pclass  is:  0 .
The number of NA values in the column  Name  is:  0 .
The number of NA values in the column  Sex  is:  0 .
The number of NA values in the column  Age  is:  0 .
The number of NA values in the column  SibSp  is:  0 .
The number of NA values in the column  Parch  is:  0 .
The number of NA values in the column  Ticket  is:  0 .
The number of NA values in the column  Fare  is:  0 .
The number of NA values in the column  Embarked  is:  0 .
The number of NA values in the column  female  is:  0 .
The number of NA values in the column  male  is:  0 .


### Linear Model to predict Fare for NA in Test Data

In [150]:
#actually lets just do average fare
test_df["Fare"] = test_df["Fare"].fillna(test_df["Fare"].mean())

## Logistic Model

In [155]:
#divide data set into input/output
train_x = train_df.drop(columns = ["Survived", "PassengerId", "Name", "Sex", "Ticket"])
train_y = train_df["Survived"]

sex_dummies = pd.get_dummies(test_df["Sex"])
test_df[list(sex_dummies)] = sex_dummies
test_x = test_df.drop(columns = ["PassengerId", "Name", "Sex", "Ticket"])

col_names = test_x.columns #New column Names
col_types = test_x.dtypes #New what type of data in the column
#train_df["Age"] = train_df["Age"].fillna(train_df["Age"].mean()) #replace NA values with the column mean for Age
for i in col_names:
    col = test_x[i] #selects column
    num_NA = sum(col.isna()) #sums the number of NA values for selected column
    print("The number of NA values in the column ", i, " is: ", num_NA, ".")

The number of NA values in the column  Pclass  is:  0 .
The number of NA values in the column  Age  is:  0 .
The number of NA values in the column  SibSp  is:  0 .
The number of NA values in the column  Parch  is:  0 .
The number of NA values in the column  Fare  is:  0 .
The number of NA values in the column  Embarked  is:  0 .
The number of NA values in the column  female  is:  0 .
The number of NA values in the column  male  is:  0 .


### Change Embarked to Numeric

In [159]:
labels = ["S", "Q", "C"]
replace = [1, 2, 3]
for r in replace:
    train_x["Embarked"] = train_x["Embarked"].replace(labels[r-1], r)
    test_x["Embarked"] = test_x["Embarked"].replace(labels[r-1], r)
    
train_x["Embarked"]


0      1
1      3
2      1
3      1
4      1
5      2
6      1
7      1
8      1
9      3
10     1
11     1
12     1
13     1
14     1
15     1
16     2
17     1
18     1
19     3
20     1
21     1
22     2
23     1
24     1
25     1
26     3
27     1
28     2
29     1
      ..
861    1
862    1
863    1
864    1
865    1
866    3
867    1
868    1
869    1
870    1
871    1
872    1
873    1
874    3
875    3
876    1
877    1
878    1
879    3
880    1
881    1
882    1
883    1
884    1
885    2
886    1
887    1
888    1
889    3
890    2
Name: Embarked, Length: 891, dtype: int64

In [162]:
#define the model
log_mod = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter = 1000) #use lbfgs for multiple classification prediction, 

#fit the training data
log_fit = log_mod.fit(train_x, train_y)

log_pred = log_fit.predict(test_x)
log_pred.to_csv(index = False)

AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'