# Extracting & cleaning data

In [28]:
import pandas as pd 
import numpy as np
from collections import Counter


def clean_titanic(filename):
    """Load a Titanic passenger CSV and return a cleaned DataFrame.

    Cleaning steps, in order:
      * drop the 'Name', 'Ticket' and 'Cabin' columns;
      * fill missing 'Age' and 'Fare' values with that column's mean;
      * encode 'Sex' as 0 ('female') / 1 ('male');
      * fill missing 'Embarked' values with the most frequent value found
        in this file, then one-hot encode the column into
        'Embarked_<value>' columns and drop the original.

    Parameters
    ----------
    filename : str, path object or file-like object
        Passed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame; all other columns are kept as read.
    """
    df = pd.read_csv(filename)

    # Remove unneeded info (non-mutating drop instead of inplace=True).
    df = df.drop(columns=['Name', 'Ticket', 'Cabin'])

    # Fill numeric gaps with the column mean (mean() skips NaN by default).
    for column in ('Age', 'Fare'):
        df[column] = df[column].fillna(df[column].mean())

    # Replace strings with numbers.
    df['Sex'] = df['Sex'].map({'female': 0, 'male': 1})

    # Fill embarked with the most frequent value actually present in the data.
    # Series.mode() ignores NaN; on ties it returns the values sorted, so the
    # first one is taken. If every value is missing there is nothing to fill
    # with, so the column is left untouched.
    embarked_mode = df['Embarked'].mode()
    if not embarked_mode.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_mode.iloc[0])

    # Replace strings with indicator columns.
    dummies = pd.get_dummies(df['Embarked'], prefix='Embarked')
    df = pd.concat([df, dummies], axis=1).drop(columns=['Embarked'])

    return df
    
# Clean both Kaggle files with the same pipeline.
train_df = clean_titanic('train.csv')
test_df = clean_titanic('test.csv')

# Print a labelled column/dtype summary for each cleaned frame.
for label, frame in (("Train: ", train_df), ("\nTest: ", test_df)):
    print(label)
    frame.info()

Train: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked_C     891 non-null uint8
Embarked_Q     891 non-null uint8
Embarked_S     891 non-null uint8
dtypes: float64(2), int64(6), uint8(3)
memory usage: 58.4 KB

Test: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Sex            418 non-null int64
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           418 non-null float64
Embarked_C     418 non-null uint8
Embarked_Q     418 non-null uint8
Embarked_S     418 non-null uint8

# Extracting arrays from dataframes

In [32]:
from sklearn.model_selection import train_test_split

# Select features by name rather than by position, so the matrices stay
# correct even if the column layout of the cleaned frames changes.
feature_columns = [
    column for column in train_df.columns
    if column not in ('PassengerId', 'Survived')
]

X = train_df[feature_columns].values
Y = train_df['Survived'].values

# Align the test frame to the training feature set (same columns, same order).
# A column absent from the test frame is created and filled with 0; this is
# meant for one-hot columns of a category that never occurs in the test file.
X_test = test_df.reindex(columns=feature_columns, fill_value=0).values

# Fixed seed so the train/validation split, and therefore the reported
# validation score, is reproducible across kernel restarts.
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, test_size=0.3, random_state=0
)


# Fitting an estimator

In [33]:
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree random forest with a fixed seed and fit it on the
# training split; fit() returns the fitted estimator itself.
clf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, Y_train)

# Score the fitted model on the held-out validation split and show it.
val_score = clf.score(X_val, Y_val)
print(val_score)

0.805970149254


# Predicting the output

In [39]:
# Predict a label for every row of the test feature matrix.
Y_test = clf.predict(X_test)

# Two-column frame: the passenger id followed by the predicted label.
output = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    "Survived": Y_test,
})

# Preview the first rows, then write the predictions without the index column.
print(output.head())
output.to_csv("titanic_predict.csv", index=False)

   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         1
4          896         1
