In [1]:
#Import libraries
import numpy as np
from numpy.random import random_integers
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from scipy.stats import pointbiserialr, spearmanr
%matplotlib inline

print('Libraries Ready!')

Libraries Ready!


In [2]:
#Load data 
path = '../input/'
trainRawData = pd.read_csv(path+'train.csv')
testRawData = pd.read_csv(path+'test.csv')
# print len(trainRawData)
# print len(testRawData)
# trainRawData.head()
# testRawData.head()
fullRawData = pd.concat([trainRawData, testRawData])
# print len(fullRawData)

# Data Processing

In [3]:

fullData = fullRawData
fullData.index = fullData.PassengerId

#1. Get the title from the names
Title_Dictionary = {
                    "Capt":       "Officer",
                    "Col":        "Officer",
                    "Major":      "Officer",
                    "Jonkheer":   "Royalty",
                    "Don":        "Royalty",
                    "Sir" :       "Royalty",
                    "Dr":         "Officer",
                    "Rev":        "Officer",
                    "the Countess":"Royalty",
                    "Dona":       "Royalty",
                    "Mme":        "Mrs",
                    "Mlle":       "Miss",
                    "Ms":         "Mrs",
                    "Mr" :        "Mr",
                    "Mrs" :       "Mrs",
                    "Miss" :      "Miss",
                    "Master" :    "Master",
                    "Lady" :      "Royalty"
                    } 

fullData['Title'] = fullData['Name'].apply(lambda x: Title_Dictionary[x.split(',')[1].split('.')[0].strip()])


#2. Create new variable familyType
fullData['fsize'] = fullData['Parch'] + fullData['SibSp'] + 1
def familyType( fsize ):
    if(fsize == 1):
        return 'singleton'
    elif(fsize>1 & fsize<5):
        return 'small'
    elif(fsize >4 ):
        return 'large'
familyType = fullData['fsize'].map(familyType)
fullData['familyType'] = familyType


#3. Add in the median age based on the Title, Pclass and Sex of each passenger
mask_Age = fullData.Age.notnull()
Age_Sex_Title_Pclass = fullData.loc[mask_Age, ["Age", "Title", "Sex", "Pclass"]]
Filler_Ages = Age_Sex_Title_Pclass.groupby(by = ["Title", "Pclass", "Sex"]).median()
Filler_Ages = Filler_Ages.Age.unstack(level = -1).unstack(level = -1)
mask_Age = fullData.Age.isnull()
Age_Sex_Title_Pclass_missing = fullData.loc[mask_Age, ["Title", "Sex", "Pclass"]]
def Age_filler(row):
    if row.Sex == "female":
        age = Filler_Ages.female.loc[row["Title"], row["Pclass"]]
        return age
    
    elif row.Sex == "male":
        age = Filler_Ages.male.loc[row["Title"], row["Pclass"]]
        return age
Age_Sex_Title_Pclass_missing["Age"]  = Age_Sex_Title_Pclass_missing.apply(Age_filler, axis = 1)
ages = pd.concat([Age_Sex_Title_Pclass["Age"], Age_Sex_Title_Pclass_missing["Age"]])
fullData['Age'] = ages

#4. Replacing embarked missing value
mask_Embarked = fullData.Embarked.isnull()
d = fullData.loc[mask_Embarked]
fullData.loc[mask_Embarked].Embarked = 'C'
#Replacing fare missing value
fullData['Fare']=fullData['Fare'].fillna(value=fullData.Fare.mean())

#5. Drop unused attributes
fullData = fullData.drop(['Ticket', 'Cabin','fsize','Name','PassengerId'], axis=1)

#6. Convert the categorial vaiables to dummy variables
dummies_Sex=pd.get_dummies(fullData['Sex'],prefix='Sex')
dummies_Embarked = pd.get_dummies(fullData['Embarked'], prefix= 'Embarked') 
dummies_Pclass = pd.get_dummies(fullData['Pclass'], prefix= 'Pclass')
dummies_Title = pd.get_dummies(fullData['Title'], prefix= 'Title')
dummies_FamilyType = pd.get_dummies(fullData['familyType'], prefix='familyType')
fullData = pd.concat([fullData, dummies_Sex, dummies_Embarked, dummies_Pclass, dummies_Title, dummies_FamilyType], axis=1)
fullData = fullData.drop(['Sex','Embarked','Pclass','Title','familyType'], axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


# Feature Selection

In [5]:
train = fullData[:891]
 
columns = train.columns.values
param=[]
correlation=[]
abs_corr=[]

for c in columns:
    #Check if binary or continuous
    if len(train[c].unique())<=2:
        corr = spearmanr(train['Survived'],train[c])[0]
    else:
        corr = pointbiserialr(train['Survived'],train[c])[0]
    param.append(c)
    correlation.append(corr)
    abs_corr.append(abs(corr))

#Create dataframe for visualization
param_df=pd.DataFrame({'correlation':correlation,'parameter':param, 'abs_corr':abs_corr})
#Sort by absolute correlation
param_df=param_df.sort_values(by=['abs_corr'], ascending=False)
#Set parameter name as index
param_df=param_df.set_index('parameter')

best_features=param_df.index[1:10+1].values

array(['Title_Mr', 'Sex_female', 'Sex_male', 'Title_Mrs', 'Title_Miss',
       'Pclass_3', 'Pclass_1', 'Fare', 'familyType_singleton',
       'familyType_small'], dtype=object)

In [18]:
X = train[best_features].values
Y = train.loc[:, 'Survived'].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,  
                                                    train_size=0.75,  
                                                    test_size=0.25) 

In [61]:
#Output processed data
features = np.append(best_features, 'Survived')
processedTrain = train[features]
new_columns = processedTrain.columns.values 
new_columns[10] = 'class' 
processedTrain.columns = new_columns
processedTrain.head()
processedTrain.to_csv('processedTrain.csv',index = False, header= True)
test=fullData[891:]
test[best_features].to_csv('processedTest.csv',index = False, header= True)

# Model Building

In [22]:
#Tuning using TPOT 
#http://www.kdnuggets.com/2016/05/tpot-python-automating-data-science.html
from sklearn.cross_validation import train_test_split  
from tpot import TPOT 

my_tpot = TPOT(generations=10)  
my_tpot.fit(X_train, Y_train)  
print(my_tpot.score(X_test, Y_test)) 

my_tpot.export('exported_Classifier.py')

0.831853634252


# TPOT exported script 

In [62]:
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('processedTrain.csv', sep=',')
training_indices, testing_indices = train_test_split(tpot_data.index, stratify = tpot_data['class'].values, train_size=0.75, test_size=0.25)


result1 = tpot_data.copy()

# Perform classification with a logistic regression classifier
lrc1 = LogisticRegression(C=0.01)
lrc1.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values)
result1['lrc1-classification'] = lrc1.predict(result1.drop('class', axis=1).values)

# Use Scikit-learn's SelectKBest for feature selection
training_features = result1.loc[training_indices].drop('class', axis=1)
training_class_vals = result1.loc[training_indices, 'class'].values

if len(training_features.columns.values) == 0:
    result2 = result1.copy()
else:
    selector = SelectKBest(f_classif, k=min(24, len(training_features.columns)))
    selector.fit(training_features.values, training_class_vals)
    mask = selector.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + ['class']
    result2 = result1[mask_cols]

# Perform classification with an eXtreme gradient boosting classifier
xgbc3 = XGBClassifier(learning_rate=0.1, n_estimators=77, max_depth=4320)
xgbc3.fit(result2.loc[training_indices].drop('class', axis=1).values, result2.loc[training_indices, 'class'].values)
result3 = result2.copy()
result3['xgbc3-classification'] = xgbc3.predict(result3.drop('class', axis=1).values)


In [90]:
#Predict Value
testData = pd.read_csv('processedTest.csv', sep=',')
testData.head()
testData['lrc1-classification'] = lrc1.predict(testData.values)
testData['Survived'] = xgbc3.predict(testData.values)

In [94]:
#Write prediction result to file
testRawData['Survived'] = testData['Survived']
testRawData.head()
result=testRawData[['PassengerId','Survived']]
result.to_csv('TPOTprediction.csv', index= False, header='True')