In [None]:
import numpy as np     # numerical
import pandas as pd    # data process
import matplotlib.pyplot as plt  # visualization
import seaborn as sns  # visualization

# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and
# 3.8 removed the old names, so fall back to the new name when 'seaborn' is gone.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')  # matplot style selection
sns.set(font_scale=2.5)

import missingno as msno  # helps to find null data

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

# matplot without pop-up
%matplotlib inline


In [None]:
# Load the Titanic train and test CSV files (paths follow the Kaggle `../input` layout).
df_train = pd.read_csv('../input/titanic/train.csv')
df_test = pd.read_csv('../input/titanic/test.csv')

In [None]:
df_train.head()  # display the first 5 rows

In [None]:
df_train.describe()  # display summary statistics of the columns

In [None]:
# Report the share of missing values per column.
#   {:>10} right-aligns the column name in a 10-character field ({:<10} would left-align)
#   {:.2f} prints the percentage with two decimals
# df_train[col] is a pandas Series: isnull().sum() counts its NaN entries and
# shape[0] is its length (the total number of records).
for col in df_train.columns:
    nan_percent = 100 * (df_train[col].isnull().sum() / df_train[col].shape[0])
    print('column: {:>10}\t Percent of NaN value: {:.2f}%'.format(col, nan_percent))

In [None]:
# Draw the missing-value matrix for the whole training frame.
#   figsize: figure size; color: RGB colour used for the drawn cells
# The explanation lives in comments so the cell output is the plot rather than a
# stray string; the trailing semicolon suppresses the Axes repr.
msno.matrix(df=df_train, figsize=(8, 8), color=(0.2, 0.5, 0.2));

In [None]:
f, ax = plt.subplots(1, 2, figsize=(24, 8))  # subplots(rows, cols)

# value_counts(): number of rows per label
# explode: offset of each wedge from the centre; autopct: format of the percentage label
# ax[0]: draw on the left-hand axes
df_train['Survived'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Pie plot - Survived')
ax[0].set_ylabel('')

# x must be passed by keyword: seaborn >= 0.12 reserves the first positional
# argument for `data`, so countplot('Survived', data=...) raises a TypeError there.
sns.countplot(x='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Count plot - Survived')
plt.show()



## Pclass

In [None]:
# Number of passengers (non-null 'Survived' entries) in each Pclass.
# as_index=True uses Pclass as the index of the result.
# Swap .count() for .sum() to get the number of survivors per class instead.
# The notes are comments (not a trailing string) so the table is what the cell displays.
df_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=True).count()


In [None]:
df_train['Survived'].unique()  # list the distinct values of Survived (expected: 0 and 1)

In [None]:
# Passenger counts for every (Pclass, Survived) combination.
# margins=True appends the row/column totals.
# The notes are comments (not a trailing string) so the table is what the cell displays.
pd.crosstab(df_train['Pclass'], df_train['Survived'], margins=True)

In [None]:
# Mean of 'Survived' per Pclass (i.e. the survival rate) drawn as a bar chart.
#   sort_values(by=...): sort the rows; ascending=False: highest rate first
# The trailing semicolon keeps the Axes repr out of the cell output.
df_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=True).mean().sort_values(by='Survived', ascending=False).plot.bar();

In [None]:
y_position = 1.02  # vertical position of the title relative to the axes
f, ax = plt.subplots(1, 2, figsize=(24, 8))  # two plots side by side
df_train['Pclass'].value_counts().plot.bar(color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=ax[0])
ax[0].set_title('Number of passengers By Pclass', y=y_position)
ax[0].set_ylabel('Count')

# x is passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
# hue: split each bar by the value of 'Survived', one colour per value.
sns.countplot(x='Pclass', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Pclass: Survived vs Dead', y=y_position)
plt.show()

## Sex

In [None]:
f, ax = plt.subplots(1, 2, figsize=(24, 8))
# Left: survival rate (mean of 'Survived') per sex.
df_train[['Sex', 'Survived']].groupby(['Sex'], as_index=True).mean().plot.bar(ax=ax[0])
ax[0].set_title('Survived vs Sex')

# Right: passenger counts per sex, split by 'Survived'.
# x is passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='Sex', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Sex: Survived vs Dead')
plt.show()

In [None]:
# Survival rate (mean of 'Survived') per sex; as_index=False keeps 'Sex' as a regular column.
df_train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean()

In [None]:
# Passenger counts for every (Sex, Survived) combination, with totals (margins=True).
pd.crosstab(df_train['Sex'], df_train['Survived'], margins=True)

## Both Sex and Pclass

In [None]:
# Point plot of the survival rate by Pclass, one line per Sex.
# factorplot was renamed to catplot (seaborn 0.9) and later removed, and `size`
# became `height`; kind='point' reproduces factorplot's default plot type.
sns.catplot(x='Pclass', y='Survived', hue='Sex', data=df_train, kind='point', height=6, aspect=1.5)

- The higher the class, the higher the survival rate
- Females have a higher survival rate than males

In [None]:
# x-axis: Sex, y-axis: mean of Survived.
# hue: one coloured line per Pclass (col='Pclass' would draw one facet per class instead).
# height / aspect: facet height and width-to-height ratio.
# The vertical bars on each point are error bars.
# factorplot/size were removed from seaborn, hence catplot(kind='point', height=...).
# `saturation` is dropped because it is not a point-plot option.
sns.catplot(x='Sex', y='Survived', hue='Pclass', data=df_train, kind='point', height=6, aspect=1)

## Age

In [None]:
# NOTE(review): `foo` is not referenced anywhere else in the visible notebook — looks like leftover scratch.
foo = df_train['Age']

In [None]:
# Histogram of Age restricted to the rows where Survived == 1.
df_train[df_train['Survived'] == 1]['Age'].hist()

In [None]:
# Overlay the kernel-density estimates of Age for survivors and non-survivors
# (a KDE curve is a smoothed alternative to a histogram).
f, ax = plt.subplots(1, 1, figsize=(9, 5))
for survived_flag in (1, 0):  # same drawing order as the legend entries below
    sns.kdeplot(df_train[df_train['Survived'] == survived_flag]['Age'], ax=ax)
plt.legend(['Survived == 1', 'Survived == 0'])
plt.show()

In [None]:
# Age density per passenger class, drawn on one shared figure.
plt.figure(figsize=(9, 5))
for pclass in (1, 2, 3):  # drawing order matches the legend labels
    df_train['Age'][df_train['Pclass'] == pclass].plot(kind='kde')

plt.xlabel('Age')
plt.ylabel('')
plt.title('Age Distribution within classes')
plt.legend(['1st class', '2nd class', '3rd class'])

The higher the class, the older the passengers tend to be.

In [None]:
# Age density of 1st-class passengers, survivors first and non-survivors second.
plt.figure(figsize=(9, 5))
in_first_class = df_train['Pclass'] == 1
for survived_flag in (1, 0):
    df_train['Age'][(df_train['Survived'] == survived_flag) & in_first_class].plot(kind='kde')

plt.xlabel('Age')
plt.ylabel('')
plt.title('1st Class')
plt.legend(['Survived == 1','Survived == 0'])

In [None]:
# Age density of 2nd-class passengers, survivors first and non-survivors second.
plt.figure(figsize=(9, 5))
in_second_class = df_train['Pclass'] == 2
for survived_flag in (1, 0):
    df_train['Age'][(df_train['Survived'] == survived_flag) & in_second_class].plot(kind='kde')

plt.xlabel('Age')
plt.ylabel('')
plt.title('2nd Class')
plt.legend(['Survived == 1','Survived == 0'])

In [None]:
# Age density of 3rd-class passengers, survivors first and non-survivors second.
plt.figure(figsize=(9, 5))
in_third_class = df_train['Pclass'] == 3
for survived_flag in (1, 0):
    df_train['Age'][(df_train['Survived'] == survived_flag) & in_third_class].plot(kind='kde')

plt.xlabel('Age')
plt.ylabel('')
plt.title('3rd Class')
plt.legend(['Survived == 1','Survived == 0'])

Children have a higher survival rate in all three Pclasses.
To confirm this, plot the cumulative survival rate as the age limit increases.

In [None]:
# Cumulative survival rate: for every age limit i in 1..79,
#   1. select the rows whose Age is below i,
#   2. sum 'Survived' to get the number of survivors among them,
#   3. divide by the number of selected rows.
age_limits = range(1, 80)
survival_ratio_by_age_range = []

for i in age_limits:
    survived_below_limit = df_train.loc[df_train['Age'] < i, 'Survived']  # filter once per limit
    survival_ratio_by_age_range.append(survived_below_limit.sum() / len(survived_below_limit))

plt.figure(figsize=(7, 7))
# Pass the age limits as x-values: without them the curve is drawn against the
# list positions 0..78 and ends up shifted one year to the left.
plt.plot(age_limits, survival_ratio_by_age_range)
plt.title('Survival rate change depending on range of Age', y=1.02)
plt.ylabel('Survival rate')
plt.xlabel('Range of Age(1~x)')
plt.show()

## Embarked

In [None]:
# Survival rate (mean of 'Survived') per port of embarkation, highest first, as a bar chart.
f, ax = plt.subplots(1, 1, figsize=(7, 7))
df_train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=True).mean().sort_values(by="Survived", ascending=False).plot.bar(ax=ax)

In [None]:
f, ax = plt.subplots(2, 2, figsize=(20, 15))  # 2 x 2 grid of axes
# x is passed by keyword in every call: seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='Embarked', data=df_train, ax=ax[0, 0])
ax[0, 0].set_title('(1) Num Of Passengers Boarded')

sns.countplot(x='Embarked', hue='Sex', data=df_train, ax=ax[0, 1])
ax[0, 1].set_title('(2) Male-Female split for embarked')

sns.countplot(x='Embarked', hue='Survived', data=df_train, ax=ax[1, 0])
ax[1, 0].set_title('(3) Embarked vs Survived')

sns.countplot(x='Embarked', hue='Pclass', data=df_train, ax=ax[1, 1])
ax[1, 1].set_title('(4) Embarked vs Pclass')

plt.subplots_adjust(wspace=0.2, hspace=0.5)  # spacing between the subplots
plt.show()

1. Most passengers boarded at S
2. Far more males than females boarded at S (so C and Q would be expected to have a higher survival rate)
3. C and Q have a higher survival rate than S
4. Passengers from C were more likely to be in 1st class than those from the other two ports

Result: C has the highest survival rate

## Family - SibSp + Parch

In [None]:
# FamilySize = siblings/spouses + parents/children aboard + the passenger him/herself.
df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch'] + 1 # family member + him/herself

In [None]:
f, ax = plt.subplots(1, 3, figsize=(40, 10))
# x is passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='FamilySize', data=df_train, ax=ax[0])
ax[0].set_title('(1) Num of Passenger Boarded', y=1.02)

sns.countplot(x='FamilySize', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('(2) Survived countplot depending on FamilySize')

# Survival rate (mean of 'Survived') per family size, highest first.
df_train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=True).mean().sort_values(by='Survived', ascending=False).plot.bar(ax=ax[2])
ax[2].set_title('(3) Survived rate depending on FamilySize', y=1.02)

plt.subplots_adjust(wspace=0.2, hspace=0.2)
plt.show()

The larger the family, the higher the survival rate (verify against plot (3); very large families may not follow this trend).

## Fare

In [None]:
# Distribution of Fare, with its skewness shown in the legend label.
f, ax = plt.subplots(1, 1, figsize=(6, 6))
# NOTE(review): sns.distplot is deprecated in recent seaborn releases (histplot/displot
# are the replacements) — confirm the installed version before upgrading seaborn.
g = sns.distplot(df_train['Fare'], color='b', label='Skewness: {:.2f}'.format(df_train['Fare'].skew()), ax=ax)
'''
displot: draws histogram
Skewness: how biasd the data is
g: assigned seborn object(plot)

too much biased data can produce lower percent of prediction
'''


plt.ylabel('')
g = g.legend(loc='best')  # `g` is rebound to the value returned by legend()

In [None]:
# Log-transform Fare; any value that is not > 0 (NaN included, since NaN > 0 is False) becomes 0.
# NOTE(review): 'Fare' is overwritten in place, so running this cell twice applies the
# log twice; only df_train is transformed here.
df_train['Fare'] = df_train['Fare'].map(lambda i: np.log(i) if i>0 else 0)

In [None]:
# Inspect the Fare column after the transformation.
df_train['Fare']

In [None]:
f, ax = plt.subplots(1, 1, figsize=(6, 6))
g = sns.distplot(df_train['Fare'], color='b', label='Skewness: {:.2f}'.format(df_train['Fare'].skew()), ax=ax)
plt.ylabel('')
g = g.legend(loc='best')

## Ticket

In [None]:
# How many rows share each Ticket value.
df_train['Ticket'].value_counts()

The ticket values are unordered and look random, so they need to be inspected one by one.

## Fill Null in Age

In [None]:
# Number of missing Age values in the training set.
df_train['Age'].isnull().sum()

In [None]:
# Extract the title ("Initial") from each name: the run of letters directly before a dot.
#   .str         -> vectorised string methods of the Series
#   .extract     -> returns the first capture group of the regular expression
#   expand=False -> return a Series instead of a one-column DataFrame
# A raw string keeps `\.` from being parsed as an (invalid) string escape sequence.
df_train['Initial'] = df_train['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
df_test['Initial'] = df_test['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [None]:
# Count of each Initial per Sex; .T transposes so the Initials become the columns.
pd.crosstab(df_train['Initial'], df_train['Sex'], margins=False).T

In [None]:
# Collapse the rare titles into the groups Mr / Mrs / Miss / Other (one-to-one mapping).
# NOTE(review): 'Dona' -> 'Mr' is kept from the original mapping — confirm it is intended.
initial_groups = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
    'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Dona': 'Mr',
}

# Apply the mapping to BOTH frames: the age imputation further down matches on the
# grouped titles, so an unmapped title in df_test would leave its Age missing.
# The result is assigned back instead of using inplace=True on a column selection,
# which newer pandas versions no longer propagate to the parent frame.
for df in (df_train, df_test):
    df['Initial'] = df['Initial'].replace(initial_groups)

In [None]:
# Per-Initial mean of every numeric column. numeric_only=True is required on
# pandas >= 2.0, where mean() no longer silently skips string columns such as Name.
df_train.groupby('Initial').mean(numeric_only=True)

In [None]:
# For every row with a missing Age, assign a fixed age chosen by its Initial.
# The same constants are used for the training and the test frame
# (presumably the rounded per-Initial mean ages of the training data — verify).
age_by_initial = {'Mr': 33, 'Master': 5, 'Miss': 22, 'Mrs': 36, 'Other': 46}

for frame in (df_train, df_test):
    for initial, fill_age in age_by_initial.items():
        needs_fill = (frame['Age'].isnull()) & (frame['Initial'] == initial)
        frame.loc[needs_fill, 'Age'] = fill_age

In [None]:
df_test['Age'].isnull().sum()
# Any row still null here has an 'Initial' outside Mr/Master/Miss/Mrs/Other (e.g. a raw
# title such as 'Ms' that was not normalised for df_test), so none of the fill rules matched it.

## Fill null in Embarked

In [None]:
# Number of missing Embarked values before filling.
df_train['Embarked'].isnull().sum()

In [None]:
# Fill the missing ports with 'S'. The result is assigned back instead of calling
# fillna(inplace=True) on a column selection, which newer pandas versions do not
# propagate to df_train.
df_train['Embarked'] = df_train['Embarked'].fillna('S')

In [None]:
# Re-check the number of missing Embarked values after filling.
df_train['Embarked'].isnull().sum()

## Binning Age

In [None]:
def category_age(x):
    """Map an age to its decade bucket.

    Returns 0 for x < 10, 1 for x < 20, ... 6 for x < 70, and 7 for everything
    else. Values that compare False against every limit (e.g. NaN) also end up
    in bucket 7.
    """
    # Upper limits 10, 20, ..., 70 paired with bucket numbers 0..6.
    for bucket, upper_limit in enumerate(range(10, 80, 10)):
        if x < upper_limit:
            return bucket
    return 7

In [None]:
# Replace every age by its bucket number as returned by category_age.
# NOTE(review): 'Age' is overwritten in place, so re-running this cell bins the bucket
# numbers again; only df_train is binned here.
df_train['Age'] = df_train['Age'].apply(category_age)

## Dropping columns

In [None]:
df_train.head()  # preview the columns before dropping any of them

In [None]:
# Remove, one after another, the columns that are not used from here on.
columns_to_drop = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Initial']
for column in columns_to_drop:
    df_train.drop([column], axis=1, inplace=True)

In [None]:
df_train.head()  # preview the remaining columns after the drop

## Mapping data to numeric value

In [None]:

# Encode the categorical columns as integers. The result is assigned back instead of
# calling replace(inplace=True) on a column selection, which newer pandas versions do
# not propagate to df_train.
df_train['Sex'] = df_train['Sex'].replace({'male': 0, 'female': 1})
df_train['Embarked'] = df_train['Embarked'].replace({'S': 0, 'C': 1, 'Q': 2})


In [None]:
df_train.head()  # preview the frame after the numeric mapping

## K-means Clustering

In [None]:
def init_centroids(k, X):
    """Draw k random starting centroids inside the bounding box of the data.

    Parameters
    ----------
    k : int
        Number of clusters.
    X : array-like of shape (n_samples, n_features)
        Data set. Any number of feature columns is supported (the previous
        version only sampled the first two).

    Returns
    -------
    numpy.ndarray of shape (k, n_features)
        Each coordinate is sampled uniformly between the minimum and maximum of
        the corresponding column of X, using the global ``np.random`` state.
    """
    X = np.asarray(X, dtype=float)
    lower = X.min(axis=0)  # per-feature minimum
    upper = X.max(axis=0)  # per-feature maximum
    # low/high broadcast over the rows, so column j is sampled from [lower[j], upper[j]).
    return np.random.uniform(lower, upper, size=(k, X.shape[1]))

In [None]:
def dist(a, b):
    """Return the Euclidean distance between the points a and b.

    Both arguments may be NumPy arrays, lists or tuples of equal length; they
    are converted to float arrays first, so plain Python sequences work too.
    The squares are summed along the first axis with NumPy rather than with
    the Python built-in ``sum``.
    """
    diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    return np.sqrt(np.sum(np.square(diff), axis=0))


In [None]:
def assign_cluster(k, X, cg):
    """Assign every point to its nearest centroid.

    Parameters
    ----------
    k : int
        Number of clusters; only the first k rows of ``cg`` are considered.
    X : array-like of shape (n_samples, n_features)
        Data set.
    cg : array-like of shape (>= k, n_features)
        Current centroids.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Index of the closest centroid (Euclidean distance) for each point.
        Ties go to the lowest index.
    """
    X = np.asarray(X, dtype=float)
    if X.shape[0] == 0:
        return np.empty(0, dtype=int)
    centroids = np.asarray(cg, dtype=float)[:k]
    # (n, 1, d) - (1, k, d) broadcasts to every point/centroid difference at once,
    # replacing the Python double loop over points and centroids.
    diff = X[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    distances = np.sqrt(np.square(diff).sum(axis=2))  # shape (n, k)
    return np.argmin(distances, axis=1)

In [None]:
def compute_centroids(k, X, cluster, cg_prev=None):
    """Recompute each centroid as the mean of the points assigned to it.

    Parameters
    ----------
    k : int
        Number of clusters.
    X : array-like of shape (n_samples, n_features)
        Data set.
    cluster : array-like of shape (n_samples,)
        Cluster index of every point.
    cg_prev : array-like of shape (k, n_features), optional
        Previous centroids. When given, a cluster without any member keeps its
        previous centroid.

    Returns
    -------
    numpy.ndarray of shape (k, n_features)
        New centroids. A cluster without members (and without ``cg_prev``) gets
        a row of NaN, so the result always has a regular shape instead of mixing
        a scalar NaN with coordinate arrays.
    """
    X = np.asarray(X, dtype=float)
    labels = np.asarray(cluster)
    centroids = np.full((k, X.shape[1]), np.nan)
    for i in range(k):
        members = X[labels == i]  # all points currently in cluster i
        if len(members) > 0:
            centroids[i] = members.mean(axis=0)
        elif cg_prev is not None:
            centroids[i] = np.asarray(cg_prev, dtype=float)[i]
    return centroids


In [None]:
def measure_change(cg_prev, cg_new):
    """Return the total movement of the centroids between two iterations.

    The two centroid arrays are walked in parallel and the distances between
    corresponding old and new centroids (as computed by ``dist``) are added up.
    """
    return sum(dist(old, new) for old, new in zip(cg_prev, cg_new))

In [None]:
def show_clusters(X, cluster, cg):
    """Scatter-plot the first two features of X coloured by cluster, plus the centroids.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, >= 2)
        Data set; column 0 is drawn on the x-axis and column 1 on the y-axis.
    cluster : array-like of shape (n_samples,)
        Cluster index of every point.
    cg : numpy.ndarray of shape (k, >= 2)
        Centroids, drawn as red stars.
    """
    # `pd.DataFrame`: the bare name `DataFrame` is never imported in this notebook.
    df = pd.DataFrame(dict(x=X[:, 0], y=X[:, 1], label=cluster))
    colors = {0: 'blue', 1: 'orange', 2: 'green'}
    fig, ax = plt.subplots(figsize=(8, 8))
    for key, group in df.groupby('label'):
        # Clusters beyond the three named colours fall back to matplotlib's colour cycle
        # instead of raising KeyError.
        color = colors.get(key, 'C{}'.format(int(key) % 10))
        group.plot(ax=ax, kind='scatter', x='x', y='y', label=key, color=color)
    ax.scatter(cg[:, 0], cg[:, 1], marker='*', s=150, c='#ff2222')
    plt.xlabel('X_1')
    plt.ylabel('X_2')
    plt.show()

In [None]:
def k_means(k, X, tol=.001, max_iter=300, show=True):
    """Cluster X into k groups with the k-means helpers of this notebook.

    Parameters
    ----------
    k : int
        Number of clusters.
    X : numpy.ndarray of shape (n_samples, n_features)
        Data set.
    tol : float, optional
        Stop once the summed centroid movement of an iteration is no longer
        greater than this value (default .001, the previously hard-coded value).
    max_iter : int, optional
        Upper bound on the number of iterations, so the loop always terminates.
    show : bool, optional
        Plot the current assignment on every iteration (default True, as before).

    Returns
    -------
    The cluster index of every point from the last assignment step.
    """
    cg_prev = init_centroids(k, X)
    cluster = np.zeros(len(X), dtype=int)  # only returned when max_iter is 0
    for _ in range(max_iter):
        cluster = assign_cluster(k, X, cg_prev)
        if show:
            show_clusters(X, cluster, cg_prev)
        cg_new = compute_centroids(k, X, cluster)
        cg_change = measure_change(cg_prev, cg_new)
        cg_prev = cg_new
        # `not (change > tol)` also stops on NaN, exactly like the original while-condition.
        if not cg_change > tol:
            break
    return cluster

# NOTE(review): X_train is not defined anywhere in the visible notebook, so this cell
# raises NameError on a fresh kernel. The plotting helper draws the first two columns,
# so two numeric features are selected here as a placeholder — replace them with the
# features that were actually intended.
X_train = df_train[['Age', 'Fare']].values.astype(float)
cluster = k_means(3, X_train)