In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from collections import defaultdict
from collections import Counter

ages = defaultdict(int)
counter = Counter()
# ages = defaultdict(list)
fare_temp = []


In [2]:
def extract_prefix(x):
    pre2eng = {'Mlle':'Miss','Ms': 'Miss','Mme': 'Mrs'}
    prefix = ''
    names = x.split(',')
    if len(names) > 1 and '.' in names[1]:
        prefix = names[1].split('.')[0]
        prefix = prefix.strip()
        if prefix in pre2eng:
            prefix = pre2eng[prefix]
    return prefix

def simplify_name(df):
    df['Pre'] = df.Name.apply(lambda x: extract_prefix(x))
    return df

def statistics(df):
    global ages
    global counter
    
    for k in ages:
        ages[k] *= counter[k]
    
    for index,row in df.iterrows():
        if pd.notna(row['Age']):
            key = (row['Pre'], row['Sex'])
#             ages[key].append(row['Age'])
            counter[key] += 1
            ages[key] += row['Age']
    for k in ages:
        ages[k] /= counter[k]

In [3]:
def simplify_ages(df):
    statistics(df)
    global ages
    for index,row in df.iterrows():
        if pd.isna(row['Age']):
            key = (row['Pre'], row['Sex'])
            df.loc[index,'Age'] = ages[key]
    df['Age'].fillna(-0.5)
    bins = (-1,0, 5, 12, 18, 25, 35, 60, 120)
    group_names = ['Unknown','Baby', 'Child', 'Teenager', 'Student', 'Young_Adult', 'Adult', 'Senior']
    categories = pd.cut(df.Age, bins, labels=group_names)
    df.Age = categories
    return df

def simplify_cabins(df):
    df.Cabin = df.Cabin.fillna('N')
    df.Cabin = df.Cabin.apply(lambda x: x[0])
    return df

def simplify_relationship(df):
    df['Relations'] = (df.SibSp + df.Parch)
    return df

def combine_age_class(df):
    df['A*C'] = df.Age * df.Pclass
    return df

def simplify_fare(df):
    global fare_temp
    if not fare_temp:
        fare_temp = df['Fare'].dropna().tolist()
    else:
        fare_temp.extend(df['Fare'].dropna().tolist())
    df['Fare'].fillna(pd.Series(fare_temp).median(), inplace=True)
#     print df['Fare'].dropna()
    bins = (-1,7.91,14.454,31,1000)
    group_names = [0,1,2,3]
    categories = pd.cut(df.Fare, bins, labels=group_names)
    df.Fare = categories
    return df

def transform_features(df):
    df = simplify_name(df)
    df = simplify_relationship(df)
    df = simplify_ages(df)
    df = simplify_fare(df)
    df = combine_age_class(df)
#     df = simplify_cabins(df)
    df.Embarked = df.Embarked.fillna('U')
#     print df
    return df

In [4]:
train_dataframe = pd.read_csv('./train.csv')
train_dataframe = transform_features(train_dataframe)
test_dataframe = pd.read_csv('./test.csv')
test_dataframe = transform_features(test_dataframe)
   
feature_names = ['Pclass','Pre','Age','Sex','Relations','Embarked','Fare','A*C']
    
train_targets = train_dataframe['Survived']
train_features = train_dataframe[feature_names]

test_features = test_dataframe[feature_names]
    
for name in feature_names:
    le = LabelEncoder()
    le = le.fit(pd.concat([train_features[name],test_features[name]]))
    train_features.loc[:,name] = le.transform(train_features[name])
    test_features.loc[:,name] = le.transform(test_features[name])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [5]:
# clf = RandomForestClassifier()
# parameters = {'n_estimators': range(50,150), 
#              }

# acc_scorer = make_scorer(accuracy_score)
# grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
# grid_obj = grid_obj.fit(train_features, train_targets)
# clf = grid_obj.best_estimator_
clf = RandomForestClassifier(n_estimators=100)

In [6]:
kf = KFold(n_splits=10)
validation_accuracy=[]

for train_index, validation_index in kf.split(train_features):
    f_train,f_validation = train_features.loc[train_index],train_features.loc[validation_index]
    t_train,t_validation = train_targets.loc[train_index],train_targets.loc[validation_index]
    clf.fit(f_train,t_train)
#     print(clf.feature_importances_)
    predictions = clf.predict(f_validation)
    accuracy = accuracy_score(t_validation,predictions)
    validation_accuracy.append(accuracy)
print("Validation Mean Accuracy:"+str(np.mean(validation_accuracy)))

Validation Mean Accuracy:0.839488139825


In [7]:
clf.fit(train_features, train_targets)
predictions = clf.predict(test_features)
result = pd.DataFrame({'PassengerId':test_dataframe['PassengerId'],'Survived':predictions})
result.to_csv('./result.csv',index=False)