In [146]:
%pylab
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("train.csv")

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


In [24]:
print(df.shape)
print(df.count())

(891, 12)
PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64


In [68]:
# y(survived), x(died,survived)
fig1 = plt.figure(figsize(18,6))

plt.subplot2grid((2,3),(0,0))
df.Survived.value_counts(normalize=True).plot(kind="bar", alpha = 0.5)
plt.title("Survived")
# About ~60% died and ~40% survived

# Since we have alot of the ages of the passengers, 
# we can take a look at the relationship between the age and the survivor rate.

plt.subplot2grid((2,3),(0,1))
plt.scatter(df.Survived, df.Age, alpha=0.1)
plt.title("Age wrt Survived")

plt.subplot2grid((2,3),(0,2))
df.Pclass.value_counts(normalize=True).plot(kind="bar", alpha = 0.5)
plt.title("Class")

plt.subplot2grid((2,3),(1,0), colspan=2)
for x in [1,2,3]:
    df.Age[df.Pclass == x].plot(kind = "kde") #kde = kernal density estimation
plt.title("Class wrt Age")
plt.legend(("1st","2nd","3rd"))
    # Seems like there is a relationship between the type of tickets they could buy
    # you have and the average ages. To make sense of information, try to visualize it

plt.subplot2grid((2,3),(1,2))
df.Embarked.value_counts(normalize=True).plot(kind="bar", alpha = 0.5)
plt.title("Embarked")

plt.show()

In [79]:
# visualize gender
fig2 = plt.figure(figsize(18,12))

# Amount of people that survived
plt.subplot2grid((3,4),(0,0))
df.Survived.value_counts(normalize=True).plot(kind="bar", alpha = 0.5)
plt.title("Survived")

# Male survival ratio (~20 % survived)
plt.subplot2grid((3,4),(0,1))
df.Survived[df.Sex == "male"].value_counts(normalize = True).plot(kind = "bar", alpha=0.5)
plt.title("Men Survived")

# Female survival ratio (~70 % survived)
female_color = "#FA0000"
plt.subplot2grid((3,4),(0,2))
df.Survived[df.Sex == "female"].value_counts(normalize = True).plot(kind = "bar", alpha=0.5, color = female_color)
plt.title("Female Survived")

# Men vs. Womens survival ratio
plt.subplot2grid((3,4),(0,3))
df.Sex[df.Survived == 1].value_counts(normalize = True).plot(kind = "bar", alpha=0.5, color = [female_color, 'b'])
plt.title("Sex of Survived")

# Going to use the Class of the passenger to show their survival rate
plt.subplot2grid((3,4),(1,0), colspan = 4)
for x in [1,2,3]:
    df.Survived[df.Pclass == x].plot(kind = "kde")
plt.title("Class wrt Survived")
plt.legend(("1st","2nd","3rd"))
    # We see that the mortality rate among 3rd class passengers is much higher than that of the 2nd 
    # and 3rd class passengers. Most who did survive was people in 1st class, then 2nd and lastly 3rd.

# What if you combine the information from Sex and the passenger Class?

# Rich Men Survived
plt.subplot2grid((3,4),(2,0)) 
df.Survived[(df.Sex == "male") & (df.Pclass == 1)].value_counts(normalize = True).plot(kind = "bar", alpha=0.5)
plt.title(" Rich Men Survived")

# Poor Men Survived (Jack)
plt.subplot2grid((3,4),(2,1)) 
df.Survived[(df.Sex == "male") & (df.Pclass == 3)].value_counts(normalize = True).plot(kind = "bar", alpha=0.5)
plt.title("Poor Men Survived")

# Rich Women Survived (Rose)
plt.subplot2grid((3,4),(2,2)) 
df.Survived[(df.Sex == "female") & (df.Pclass == 1)].value_counts(normalize = True).plot(kind = "bar", alpha=0.5, color = female_color)
plt.title(" Rich Women Survived")

# Poor Women Survived
plt.subplot2grid((3,4),(2,3)) 
df.Survived[(df.Sex == "female") & (df.Pclass == 3)].value_counts(normalize = True).plot(kind = "bar", alpha=0.5, color = female_color)
plt.title("Poor Women Survived")

plt.show()

In [84]:
# Use an algorithm to predict the data
    # 1) The simplest algorithm would be random (50 % chanes of being correct)
    #    => Simple rule based on graphs: If you are a women, you survive, if you are a man, you die.

train = pd.read_csv("train.csv")

# add new column
train["Hyp"] = 0 # added a new column named "hyp", stands for hypothesis, all rows are filled with 0
train.loc[train.Sex == "female", "Hyp"] = 1 # "loc" method: You pass the condition AND the column to update 

train["Result"] = 0
train.loc[train.Survived == train["Hyp"], "Result"] = 1

print(train["Result"].value_counts(normalize = True))
    # C: ALWAYS TAKE A LOOK ON THE DATA BEFORE RUNNING THE ALGORITHM: 
    #    'Sometimes, simple hints can give you really good answer'
    #     Using more complicated algorithms will NOT gather a big increase in predictability

1    0.786756
0    0.213244
Name: Result, dtype: float64


In [144]:
def clean_data(data):
    data["Fare"] = data["Fare"].fillna(data["Fare"].dropna().median()) 
    # fill in empty rows which don't have the Fare data by average values for Fare
    data["Age"] = data["Age"].fillna(data["Age"].dropna().median())
    # fill in empty rows which don't have the Age data by average values for Age    
    
    # Machines don't work well with categories, give them numbers, assign "male = 0", "female = 1"
    data.loc[data["Sex"] == "male", "Sex"] = 0 
    data.loc[data["Sex"] == "female", "Sex"] = 1
    
    data["Embarked"] = data["Embarked"].fillna("S")
    data.loc[data["Embarked"] == "S", "Embarked"] = 0
    data.loc[data["Embarked"] == "C", "Embarked"] = 1
    data.loc[data["Embarked"] == "Q", "Embarked"] = 2    

# Linear models might seem trivial for humans in 2D whilst becoming virtually impossible in say, 15D
# A good numeric approach beats the human brain. We are not good in dealing with, say 20D.

# Image recognition tasks, usually assign different dimensions to each pixel, you will have an image
# that is 50x50 pixels, i.e. 2500D, who can solve that? I can't.

# ----------------------------------------------------------------------------------------------------

# Predict logistic progession

from sklearn import linear_model, preprocessing
#from sklearn from linear_model
# sklearn implements a bunch of machine learning algorithms, you don't have to do anything

log_reg = linear_model.LogisticRegression(solver='lbfgs',class_weight='balanced', max_iter=7600)

# The picture represents a linear model, it gathers a simple explanation of the datapoints
#from PIL import Image
#image = Image.open('linear_model.png')
#image.show()

train = pd.read_csv("train.csv")
clean_data(train)

# I have to pass in the desired output into the algorithm: Which usually is called 'target'
target = train["Survived"].values
# Give the machine learning algorithm some hint, called features
features = train[["Pclass", "Age", "Sex", "SibSp", "Parch"]].values

# 1 Feature ("Sex"): Gathered 78.67 % accuracy
# 5 Features ("Pclass", "Age", "Sex", "SibSp", "Parch"): Gathered 79.12 % accuracy
# 6 Features ("Pclass", "Age", "Sex", "SibSp", "Parch", "Fare"): Gathered 79.57 % accuracy

# Creating "classifier": Called classifier because it just needs to take one of these passengers 
# and decide which of these buckets(categories in features) he (the classifier) wants to assign
# him into

classifier = linear_model.LogisticRegression()
classifier_ = classifier.fit(features, target)
# "How many survivde based on these features, make a model"

# As soon as the PC sees the word "fit", it goes through the data 
# and tries to find some hidden relationships in the data

# To check the accuracy of our model, i.e. "fitted" logical regression, we use "score"
print(classifier_.score(features,target))

# OUR METHODOLOGY: "HERE YOU GO, A BUNCH OF CATEGORIES, GIVE ME A GOOD EXPLANATION OF THIS!" 

# Take the linear features: transform and combine them into second degree polynomial
# Maybe our data could be better described by curves instead of straight lines
#from PIL import Image
#image1 = Image1.open('linear_wrong.png')
#image2 = Image2.open('quadratic.png')
#image1.show()
#image2.show()

poly = preprocessing.PolynomialFeatures(degree=2)
poly_features = poly.fit_transform(features)

classifier_ = classifier.fit(poly_features, target)
print(classifier_.score(poly_features,target))

# Alot of these algorithms are like "black boxes", you do not understand what happens in them

0.7912457912457912
0.8237934904601572


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [151]:
# PREDICT DECISION TREE
# This algorithm creates a simple decision tree
# It will take a look on all data, all rows, all the characteristics of it
# And it will makes a decision tree, "this move moves me the most in a certain direction"

from sklearn import tree, model_selection

def clean_data(data):
    data["Fare"] = data["Fare"].fillna(data["Fare"].dropna().median()) 
    # fill in empty rows which don't have the Fare data by average values for Fare
    data["Age"] = data["Age"].fillna(data["Age"].dropna().median())
    # fill in empty rows which don't have the Age data by average values for Age    
    
    # Machines don't work well with categories, give them numbers, assign "male = 0", "female = 1"
    data.loc[data["Sex"] == "male", "Sex"] = 0 
    data.loc[data["Sex"] == "female", "Sex"] = 1
    
    data["Embarked"] = data["Embarked"].fillna("S")
    data.loc[data["Embarked"] == "S", "Embarked"] = 0
    data.loc[data["Embarked"] == "C", "Embarked"] = 1
    data.loc[data["Embarked"] == "Q", "Embarked"] = 2   
    
log_reg = linear_model.LogisticRegression(solver='lbfgs',class_weight='balanced', max_iter=7600)
    
train = pd.read_csv("train.csv")
clean_data(train)

target = train["Survived"].values
feature_names = ["Pclass", "Age", "Fare", "Embarked", "Sex", "SibSp", "Parch"]
features = train[feature_names].values

decision_tree = tree.DecisionTreeClassifier(random_state = 1) 
decision_tree_ = decision_tree.fit(features, target)

print(decision_tree_.score(features, target))

#initilize with random number 1, derivatives are not below 1, easier for the tree to work.
#from PIL import Image
#image = Image.open('high_poly.png')
#image.show()
# The classifier just uses a tree, with a high degree polynomial, which has a high frequency,
# so it gets many of the data points right. Since we have many data points, its always going to do OK

# HOW DO WE FIX THIS PROBLEM WITH THE ALGORITHM?
# - INTENTIONALLY WITHDRAW SOME INFORMATION: "I have these 10 points, but I will only pass the 
#   algorithm 6 points. Because I know that the algorithm otherwise is going to get overzealous and
#   read too much into the data. What I want the algorithm to do is to try to keep a generalization"

# ALGORITHM: model_selection (better than tree, does not have the same issues)

# When you withdraw information, the part of the data that you are hiding, name: "Cross-validation-set"
scores = model_selection.cross_val_score(decision_tree, features, target, scoring = 'accuracy', cv=50)
print(scores)
print(scores.mean()) #Low score, since we don't use all the information

0.9797979797979798
[0.66666667 0.61111111 0.66666667 0.88888889 0.94444444 0.94444444
 0.72222222 0.77777778 0.72222222 0.77777778 0.72222222 0.61111111
 0.72222222 0.77777778 0.55555556 0.83333333 1.         0.66666667
 0.77777778 0.77777778 0.88888889 0.77777778 0.88888889 0.72222222
 0.55555556 0.83333333 0.94444444 0.88888889 0.66666667 0.83333333
 0.72222222 0.66666667 0.88888889 0.94444444 0.88888889 0.77777778
 0.72222222 0.72222222 0.72222222 0.77777778 0.88888889 0.82352941
 0.70588235 0.82352941 0.82352941 0.70588235 0.82352941 0.82352941
 0.88235294 0.88235294]
0.7836601307189542


In [154]:
# generelized_tree
# This algorithm creates a simple decision tree
# It will take a look on all data, all rows, all the characteristics of it
# And it will makes a decision tree, "this move moves me the most in a certain direction"

from sklearn import tree, model_selection

def clean_data(data):
    data["Fare"] = data["Fare"].fillna(data["Fare"].dropna().median()) 
    data["Age"] = data["Age"].fillna(data["Age"].dropna().median())
    
    data.loc[data["Sex"] == "male", "Sex"] = 0 
    data.loc[data["Sex"] == "female", "Sex"] = 1
    
    data["Embarked"] = data["Embarked"].fillna("S")
    data.loc[data["Embarked"] == "S", "Embarked"] = 0
    data.loc[data["Embarked"] == "C", "Embarked"] = 1
    data.loc[data["Embarked"] == "Q", "Embarked"] = 2   
    
log_reg = linear_model.LogisticRegression(solver='lbfgs',class_weight='balanced', max_iter=7600)
    
train = pd.read_csv("train.csv")
clean_data(train)

target = train["Survived"].values
feature_names = ["Pclass", "Age", "Fare", "Embarked", "Sex", "SibSp", "Parch"]
features = train[feature_names].values

generelized_tree = tree.DecisionTreeClassifier(
    random_state = 1,
    max_depth = 7, # want to create a tree which is deeper than 7 levels
    min_samples_split = 2 # can controll the sample split: when he decides that he has to branch out
) 
generelized_tree_ = generelized_tree.fit(features, target)

print(generelized_tree_.score(features, target))

scores = model_selection.cross_val_score(generelized_tree, features, target, scoring = 'accuracy', cv=50)
print(scores)
print(scores.mean()) #FIX: Tell algorithm to NOT read TOO MUCH into the data

# OBSERVATIONS: The general version of the tree was less confident on the first run compared to the
#               decision tree: 88 vs 98 %. Although: The general one gave a better performance overall

# HARD PROBLEM IN MACHINE LEARNING: Hard to understand what the machine is thinking
# LUCKILY FOR US: This information is visualizable within decision trees.

tree.export_graphviz(generelized_tree_, feature_names=feature_names, out_file = "tree.dot")
# "type dot.png" in C:\Users\Mahmu\Documents\General, you will see the content, NOT HUMAN FRIENDLY
# HUMAN FRIENDLY TRANSFORMATION OF DOT FILE: dot -Tpng tree.dot > tree.png 
# To use "dot", you need to download the API and set its PATH variable
# https://stackoverflow.com/questions/48243249/graphvizs-dot-tool-on-windows


0.8787878787878788
[0.77777778 0.66666667 0.77777778 0.94444444 0.83333333 0.88888889
 0.61111111 0.83333333 0.88888889 0.88888889 0.72222222 0.66666667
 0.83333333 0.77777778 0.72222222 0.83333333 0.94444444 0.72222222
 0.94444444 0.83333333 0.88888889 0.83333333 0.83333333 0.88888889
 0.94444444 0.83333333 0.83333333 0.83333333 0.77777778 0.88888889
 0.72222222 0.66666667 0.88888889 0.88888889 0.83333333 0.77777778
 0.72222222 0.66666667 0.88888889 0.77777778 0.83333333 0.88235294
 0.76470588 0.88235294 0.88235294 0.64705882 0.88235294 0.76470588
 1.         0.94117647]
0.8196078431372549


In [50]:
# test

for x in [1,3,2]:
    print(x)

1
3
2
