In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively list every file beneath the current directory, one path per line.
for root, _, names in os.walk("./"):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Frame the problem
Using the customer description, define the problem you're trying to solve in your own words (remember this is not technical, but it must be specific so the customer understands the project).

Predict the probabilities of surviving the Titanic and determine the relationships between passenger attributes and survivability rate. 

# 2. Get the Data 
Define how you received the data (provided, gathered, ...)

The data was provided to us via csv file.

# 3. Explore the Data
Gain insights into the data you have from step 2, making sure to identify any bias

In [None]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the passenger data from CSV and preview the first five rows.
titanic_csv = "data/titanic.csv"
df = pd.read_csv(titanic_csv)
df.head(n=5)

In [None]:
# Per-column flag: True where the column holds at least one missing value.
df.isnull().any(axis=0)

From this table, we can tell that the age, cabin, and embarked columns all have NaN values, so we'll have to clean those up, most likely by dropping the affected rows altogether (mean imputation might cause biases due to gender).

In [None]:
# Summarise the Age range and list the passengers recorded as younger than one.
# Series.max()/min() skip missing values. The builtin max()/min() compare element
# by element, and every comparison against NaN is False, so their result depends
# on where the missing ages happen to sit in the column.
ages = df["Age"]
print(f"Maximum age: {ages.max()}")
print(f"Minimum age: {ages.min()}")
print(f"DF of passengers whose ages are less than 1:\n {df[ages < 1]}")

The maximum age is 80, which is high and rare for this time, but not entirely out of the question since these passengers were mostly wealthy with easy access to medical help. The minimum age, on the other hand, is 0.42. This is a little weird since age isn't usually represented with decimals, however it might just be a baby who's not yet 1. Upon further analysis, it seems that many passengers had ages less than 1, with one paying a fare of 150 dollars. This now seems more like an error than actual fact.

From these current columns, I would think that a few wouldn't be helpful to our overall predictions. Ticket number, passengerid, sibsp (siblings/spouses onboard), parch (number of parents/children on board), embarked (port of embarkation), name, and cabin are the attributes I believe won't contribute as much towards our goal. Even if they make small differences (having an earlier ticket implies more wealth), I think they'll be negligible enough to exclude them for training purposes.

label descriptions taken from: https://github.com/awesomedata/awesome-public-datasets/issues/351

Additionally, there are a few columns that need to be one-hot-encoded, like sex (0 for female, 1 for male). The fare and age might also need to be normalized. 

In [None]:
# Age distribution split by gender, followed by each gender's survival rate.
# Rows missing either Age or Survived are dropped from both frames.
age_survival_cols = ["Age", "Survived"]
women = df.loc[df["Sex"] == "female", age_survival_cols].dropna()
men = df.loc[df["Sex"] == "male", age_survival_cols].dropna()

fig, ax = plt.subplots()
ax.hist(
    [women["Age"], men["Age"]],
    bins=10,
    color=["Red", "Blue"],
    label=["Women", "Men"],
)
ax.set_title("Number of people on the Titanic vs. Age Separated by Gender")
ax.set_xlabel("Age")
ax.set_ylabel("Number of people")
ax.legend()
plt.show()

# Survival rate per gender (mean of the 0/1 Survived column).
w_survival = women["Survived"].mean()
print(f"Women survival rate: {w_survival}")

m_survival = men["Survived"].mean()
print(f"Men survival rate: {m_survival}")

In [None]:
# Survival rate per age bin (ages rounded to the nearest ten), split by gender,
# followed by the overall survival rate per bin.

# Bin ages by rounding to the nearest ten. This overwrites the "Age" column of
# `women` / `men`; rounding an already-rounded value is a no-op, so the cell can
# safely be re-run.
women["Age"] = women["Age"].round(-1)
men["Age"] = men["Age"].round(-1)

grouped_women = women.groupby("Age")
grouped_men = men.groupby("Age")

women_avgs = grouped_women["Survived"].mean()
print(women_avgs)

men_avgs = grouped_men["Survived"].mean()
print(men_avgs)

# Building the frame aligns both series on the union of age bins. A bin with no
# passengers of one gender stays NaN (no bar is drawn) instead of being filled
# with an invented 0% survival rate.
survival_df = pd.DataFrame({"Women": women_avgs, "Men": men_avgs})

survival_df.plot(kind="bar", width=0.8, color=["red", "blue"])
plt.ylabel("Survival Rate")
plt.title("Age vs. Survival Rate based on Gender")
plt.show()

# Overall rate per bin = survivors / passengers, pooled across both genders.
# `fill_value=0` lets a bin that exists for only one gender contribute zero
# survivors and zero passengers from the other, rather than turning into NaN.
survivors = grouped_women["Survived"].sum().add(
    grouped_men["Survived"].sum(), fill_value=0
)
passengers = grouped_women["Survived"].count().add(
    grouped_men["Survived"].count(), fill_value=0
)
total_df = (survivors / passengers).rename("Overall")
print(total_df)

total_df.plot(kind="bar", width=0.8)
plt.ylabel("Survival Rate")
plt.title("Age vs. Overall Survival Rate")
plt.show()

In [None]:
# Mean survival per passenger class: printed as a table, then drawn as a bar chart.
classes_df = df.loc[:, ["Survived", "Pclass"]]
grouped_class = classes_df.groupby(df["Pclass"])
class_avgs = grouped_class.mean()
print(class_avgs)

# y-axis is pinned to the full 0-1 range so the bars read as rates.
ax = class_avgs["Survived"].plot.bar(ylim=(0, 1.0))
ax.set_xlabel("Passenger Class")
ax.set_ylabel("Survival Rate")
ax.set_title("Passenger Class vs. Survival Rates")
plt.show()

In [None]:
# Fare against passenger class: raw scatter first, then the mean fare per class.
fare_pclass_df = df.loc[:, ["Fare", "Pclass"]]

fig, ax = plt.subplots()
ax.scatter(fare_pclass_df["Pclass"], fare_pclass_df["Fare"])
ax.set_xticks([1, 2, 3])
ax.set_xlabel("Passenger Class")
ax.set_ylabel("Fare Cost")
ax.set_title("Passenger Class vs. Fare Cost")
plt.show()

# Group on the class column and compute the per-class means once for both the
# printed table and the line plot.
grouped_fare = fare_pclass_df.groupby(df["Pclass"])
mean_fares = grouped_fare.mean()
print(mean_fares)

plt.plot(mean_fares["Fare"], marker=".", markersize=10)
plt.show()

From this graph, it's clear that fare prices and passenger class are pretty linearly related, therefore I believe it would be best to only use passenger class, as it doesn't need to be normalized and is already one-hot-encoded. I also noticed that there was a single ticket price of about $500, which is a clear outlier.

In [None]:
# Repeat the fare-vs-class comparison with the extreme fares excluded.
# Both the scatter plot and the per-class means are computed from the filtered
# frame, so the outlier cannot leak back into the averages.
FARE_OUTLIER_CUTOFF = 400  # fares at or above this value are treated as outliers

fare_pclass_df_noout = df.loc[df["Fare"] < FARE_OUTLIER_CUTOFF, ["Fare", "Pclass"]]

plt.scatter(fare_pclass_df_noout["Pclass"], fare_pclass_df_noout["Fare"])
plt.xticks([1, 2, 3])
plt.xlabel("Passenger Class")
plt.ylabel("Fare Cost")
plt.title("Passenger Class vs. Fare Cost")
plt.show()

# Mean fare per class, computed on the filtered rows only.
grouped_fare_noout = fare_pclass_df_noout.groupby("Pclass")
mean_fare_noout = grouped_fare_noout.mean()
print(mean_fare_noout)

plt.plot(mean_fare_noout["Fare"], marker=".", markersize=10)
plt.xticks([1, 2, 3])
plt.xlabel("Passenger Class")
plt.ylabel("Mean Fare Cost")
plt.title("Passenger Class vs. Mean Fare Cost")
plt.show()

Even with removing that outlier, the data is still pretty skewed, indicating a large class divide between 1 and 2/3.

# 4. Prepare the Data


Apply any data transformations and explain what and why


# 5. Model the data
Using selected ML models, experiment with your choices and describe your findings. Finish by selecting a model to continue with.


# 6. Fine Tune the Model

With the selected model, describe the steps taken to achieve the best results possible.


# 7. Present
In a customer-facing document, provide a summary of findings and detail the approach taken.


# 8. Launch the Model System
Define your production run code. This should be self-sufficient and require only your model parameters.


In [None]:
def infrence(prams):
    """Run the model on ``prams`` and return whatever ``m.run`` produces.

    NOTE(review): ``m`` is not defined anywhere in the visible notebook; it must
    exist as a global exposing a ``run`` method before this is called -- confirm.
    """
    return m.run(prams)