In [200]:
#Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
from sklearn.impute import SimpleImputer
from scipy.stats.stats import pearsonr

In [201]:
#Import data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [202]:
# functions

def categorise_age(age):
    #put a category for age group
    age_group = [[0,10, 0], [11,20, 1], [21,30, 2], 
                 [31, 40, 3], [41, 50, 4], [51, 60, 5], 
                 [61, 70, 6], [71, 80, 7], [81, 90, 8]]
    counter = 0
    result = []
    
    for i in age:
        for j in age_group:
            if i in range(j[0], (j[1]+1)):
                result += [j[2]]
                break
            elif i not in range(0, 1000):
                result += ["NaN"]
                break
                
        counter += 1
    
    return result

def gender_to_number(column):
    result = []
    for i in column:
        # male = 0
        if i.lower() == "male":
            result += [0]
            
        # female = 1
        elif i.lower() == "female":
            result += [1]
        
        else:
            result += ["NaN"]
            
    return result

def embarked_to_numbers(column):
    
    result = []
    
    for i in column:
        if i == "S":
            result += [0]
        elif i == "C":
            result += [1]
        elif i == "Q":
            result += [2]
        else:
            result += ["NaN"]
            
    return result

def calculate_outlier_fences(set1):
    Q1 = np.percentile(set1, 25)
    Q3 = np.percentile(set1, 75)
    IQR = Q3 - Q1
    C = 1.5
    L_Fence = (Q1 - (C * IQR)).astype(float)
    U_Fence = (Q3 + (C * IQR)).astype(float)
    
    return [L_Fence, U_Fence]

# Return outliers based on the fences
def spot_outliers(set1):
    result = []
    for i in set1:
        if i < calculate_outlier_fences(set1)[0]:
            result += [i]
        elif i > calculate_outlier_fences(set1)[1]:
            result += [i]

In [203]:
# start preprocessing

# Age categorical & drop it
# train["Age_group"] = categorise_age(train["Age"])
# train.drop(columns = ["Age"], axis = 1, inplace = True)
train["Age"] = train["Age"].astype(float)

test["Age"] = test["Age"].astype(float)

# Gender in numerical
train["Gender"] = gender_to_number(train["Sex"])
train.drop(columns = ["Sex"], inplace = True)

test["Gender"] = gender_to_number(test["Sex"])
test.drop(columns = ["Sex"], inplace = True)

# Drop tickets
train.drop(columns = ["Ticket"], inplace = True)

test.drop(columns = ["Ticket"], inplace = True)

# Drop cabin
train.drop(columns = ["Cabin"], inplace = True)

test.drop(columns = ["Cabin"], inplace = True)

# Embarked in numbers
train["Embarked_num"] = embarked_to_numbers(train["Embarked"])
train.drop(columns = ["Embarked"], inplace = True)

test["Embarked_num"] = embarked_to_numbers(test["Embarked"])
test.drop(columns = ["Embarked"], inplace = True)

# Drop name
train.drop(columns = ["Name"], inplace = True)

test.drop(columns = ["Name"], inplace = True)

# Round fare
train["Fare"] = train["Fare"].round(2)

test["Fare"] = test["Fare"].round(2)


train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Gender,Embarked_num
0,1,0,3,22.0,1,0,7.25,0,0
1,2,1,1,38.0,1,0,71.28,1,1
2,3,1,3,26.0,0,0,7.92,1,0
3,4,1,1,35.0,1,0,53.1,1,0
4,5,0,3,35.0,0,0,8.05,0,0


In [204]:
# Check for univariate outliers
spot_outliers(train["Fare"])
    #no outliers

# Check for missing data
train.isna().sum()

PassengerId       0
Survived          0
Pclass            0
Age             177
SibSp             0
Parch             0
Fare              0
Gender            0
Embarked_num      0
dtype: int64

In [205]:
# Impute missing data

trainMissing = train.iloc[:, :].values
dataMissing = train.iloc[:, :].values

imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer = imputer.fit(trainMissing[:, 4:9])
dataMissing[:, 4:9] = imputer.transform(dataMissing[:, 4:9])


In [211]:
testdata = pd.DataFrame(data = dataMissing)
testdata

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1,0,3,22,1,0,7.25,0,0
1,2,1,1,38,1,0,71.28,1,1
2,3,1,3,26,0,0,7.92,1,0
3,4,1,1,35,1,0,53.1,1,0
4,5,0,3,35,0,0,8.05,0,0
5,6,0,3,,0,0,8.46,0,2
6,7,0,1,54,0,0,51.86,0,0
7,8,0,3,2,3,1,21.08,0,0
8,9,1,3,27,0,2,11.13,1,0
9,10,1,2,14,1,0,30.07,1,1


In [135]:
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [197]:
train["Age"]

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
5       NaN
6      54.0
7       2.0
8      27.0
9      14.0
10      4.0
11     58.0
12     20.0
13     39.0
14     14.0
15     55.0
16      2.0
17      NaN
18     31.0
19      NaN
20     35.0
21     34.0
22     15.0
23     28.0
24      8.0
25     38.0
26      NaN
27     19.0
28      NaN
29      NaN
       ... 
861    21.0
862    48.0
863     NaN
864    24.0
865    42.0
866    27.0
867    31.0
868     NaN
869     4.0
870    26.0
871    47.0
872    33.0
873    47.0
874    28.0
875    15.0
876    20.0
877    19.0
878     NaN
879    56.0
880    25.0
881    33.0
882    22.0
883    28.0
884    25.0
885    39.0
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64