In [76]:
import numpy as np
import pandas as pd
import csv
import math

from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [34]:
sexDic = {"female": 1, "male": 0}
embarkDic = {"C": 1, "S":2, "Q":3}

def substitute_sex_with_num(x):
    return sexDic[x["Sex"]]
def substitute_embarked_with_num(x):
    return embarkDic[x["Embarked"]]

In [36]:
trainDFrame = pd.read_csv("train.csv")
testDFrame = pd.read_csv("test.csv")
IDtest = testDFrame["PassengerId"]

testDFrame["Survived"] = 0
dataset = trainDFrame.append(testDFrame, ignore_index=True)

dataset["Embarked"] = dataset["Embarked"].fillna("S")


dataset["Sex"]=dataset.apply(substitute_sex_with_num, axis=1)
dataset["Embarked"]=dataset.apply(substitute_embarked_with_num, axis=1)





In [37]:
dataset['FamilySize'] = dataset.SibSp.values + dataset.Parch.values + 1

In [38]:
NameSplit = dataset.Name.str.split('[,.]')
surnames = [str.strip(name[0]) for name in NameSplit.values]
dataset['Surname'] = surnames
dataset['FamilyID'] = dataset.Surname.str.cat(dataset.FamilySize.astype(str), sep='')

numMapped = range(len(surnames))
surnameDic = ticketDic = dict(zip(surnames, numMapped))



def substitute_surname_with_num(x):
    return surnameDic[x["Surname"]]
dataset["Surname"]=dataset.apply(substitute_surname_with_num, axis=1)




In [39]:
import string
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if string.find(big_string, substring) != -1:
            return substring
    print big_string
    return np.nan

title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']

dataset['Title']=dataset['Name'].map(lambda x: substrings_in_string(x, title_list))
 
titleDic={"Mr": 1, "Mrs": 2, "Miss": 3, "Master": 4}
#replacing all titles with mr, mrs, miss, master
def replace_titles(x):
    title=x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return titleDic['Mr']
    elif title in ['Countess', 'Mme']:
        return titleDic['Mrs']
    elif title in ['Mlle', 'Ms']:
        return titleDic['Miss']
    elif title =='Dr':
        if x['Sex']=='Male':
            return titleDic['Mr']
        else:
            return titleDic['Mrs']
    else:
        return titleDic[title]
dataset['Title']=dataset.apply(replace_titles, axis=1)
dataset["Title"] = dataset["Title"].astype(int)


In [40]:
tickets = dataset["Ticket"]
ticketValues = sorted(list(set(tickets)))
newTicketKeys = []

for ind, item in enumerate(ticketValues):
    if ind > 704:
        if ind != 787:
            element = item.split(" ")[1]
        else:
            element =newTicketKeys[ind-1]
    else:
        element = item[0]
    newTicketKeys.append(element)

ticketDic = dict(zip(ticketValues, newTicketKeys))

def get_new_ticket_value(x):
    value = ticketDic[x["Ticket"]]
    if len(value) > 1:
        value = value[0]
    return value

dataset['Ticket']=dataset.apply(get_new_ticket_value, axis=1)
dataset["Ticket"][473]= "3"
dataset["Fare"][1043] = 8.0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [41]:
dataset["Cabin"] = pd.Series([i[0] if not pd.isnull(i) else 'X' for i in dataset['Cabin'] ])
#dataset["Pclass"] = dataset["Pclass"].astype("category")


In [42]:


cabinDic = {'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7, 'T':8, 'X':9}

def substitute_cabin_with_num(x):
    return cabinDic[x["Cabin"]]
dataset["Cabin"]=dataset.apply(substitute_cabin_with_num, axis=1)

In [43]:
ageNanInds = list(dataset["Age"][dataset["Age"].isnull()].index)

for ind in ageNanInds :
    ageMed = dataset["Age"].median()
    ageToPredict = dataset["Age"][((dataset['SibSp'] == dataset.iloc[ind]["SibSp"]) & (dataset['Parch'] == dataset.iloc[ind]["Parch"]) & (dataset['Pclass'] == dataset.iloc[ind]["Pclass"]))].median()
    if not np.isnan(ageToPredict) :
        dataset['Age'].iloc[ind] = ageToPredict
    else :
        dataset['Age'].iloc[ind] = ageMed

In [52]:
attributes = ['Age','Embarked','Cabin','Fare','Pclass','Sex', 'Ticket',
              'Parch','SibSp','Title','Survived']
finalData = dataset[attributes]

In [53]:


train = finalData[:891]
test = finalData[891:]
test.drop(labels=["Survived"],axis = 1,inplace=True)

train["Survived"] = train["Survived"].astype(int)
Y_train = train["Survived"]
X_train = train.drop(labels = ["Survived"],axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [54]:


# RFC Parameters tunning 
RFC = RandomForestClassifier()


## Search grid for optimal parameters
rf_param_grid = {"max_depth": [None],
              "max_features": [1, 3, len(attributes)-1],
              "min_samples_split": [2, 3, 8, 10],
              "min_samples_leaf": [1, 3, 8, 10],
              "bootstrap": [False],
              "n_estimators" :[100,200, 300],
              "criterion": ["entropy", "gini"]}


gsRFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=10, scoring="accuracy", n_jobs= 4, verbose = 1)

gsRFC.fit(X_train,Y_train)

RFC_best = gsRFC.best_estimator_

Fitting 10 folds for each of 288 candidates, totalling 2880 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   21.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  3.4min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  6.7min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed: 10.7min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed: 15.0min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed: 19.8min
[Parallel(n_jobs=4)]: Done 2880 out of 2880 | elapsed: 23.6min finished


In [61]:
GBC = GradientBoostingClassifier()
gb_param_grid = {'loss' : ["deviance"],
              'n_estimators' : [100,200,300],
              'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [4, 8],
              'min_samples_leaf': [100,150],
              'max_features': [0.3, 0.1] 
              }

gsGBC = GridSearchCV(GBC,param_grid = gb_param_grid, cv=10, scoring="accuracy", n_jobs= 4, verbose = 1)

gsGBC.fit(X_train,Y_train)

GBC_best = gsGBC.best_estimator_

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   11.5s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   28.7s
[Parallel(n_jobs=4)]: Done 720 out of 720 | elapsed:   46.4s finished


In [72]:
DTC = DecisionTreeClassifier()

adaDTC = AdaBoostClassifier(DTC, random_state=7)

ada_param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
              "base_estimator__splitter" :   ["best", "random"],
              "algorithm" : ["SAMME","SAMME.R"],
              "n_estimators" :[1,2],
              "learning_rate":  [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3,1.5]}

gsadaDTC = GridSearchCV(adaDTC,param_grid = ada_param_grid, cv=10, scoring="accuracy", n_jobs= 4, verbose = 1)

gsadaDTC.fit(X_train,Y_train)

ada_best = gsadaDTC.best_estimator_

Fitting 10 folds for each of 112 candidates, totalling 1120 fits


[Parallel(n_jobs=4)]: Done 416 tasks      | elapsed:    3.6s
[Parallel(n_jobs=4)]: Done 1120 out of 1120 | elapsed:    9.1s finished


In [62]:
GBC_best

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=4, max_features=0.3, max_leaf_nodes=None,
              min_samples_leaf=100, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=300,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [73]:
modelForest = ada_best.fit(X_train,Y_train)
scores = cross_val_score(modelForest, X_train, Y_train, cv=10)
np.mean(scores)

0.8182606968562025

In [78]:
votingC = VotingClassifier(estimators=[('rfc', RFC_best), ('adac',ada_best),('gbc',GBC_best)],
                           voting='soft')

votingC = votingC.fit(X_train, Y_train)

In [79]:
test_Survived = pd.Series(votingC.predict(test), name="Survived")
IDtest = testDFrame["PassengerId"]

results = pd.concat([IDtest,test_Survived],axis=1)
results.to_csv("sub29.csv",index=False)

  if diff:


In [1]:
from sklearn.svm import SVC


def get_my_best_estimator(X, y, folds):

    params_grid = [{
        'C': [2**i for i in range(-5, 9)],
        'gamma': [2**i for i in range(-10, 3)]}]
    gs = GridSearchCV(SVC(), params_grid, n_jobs=-1, cv=folds)
    gs.fit(X, y)
    best_estimator = gs.best_estimator_
    return best_estimator

trainScaler = MinMaxScaler(feature_range=(0, 1))
X_train = trainScaler.fit_transform(X_train)
testScalar = trainScaler.fit(test)
test = trainScaler.transform(test)

bestEstimator = get_my_best_estimator(X_train, Y_train, 10)
modelRbf = SVC(C=bestEstimator.C , gamma=bestEstimator.gamma, kernel='rbf', verbose=True, ).fit(X_train, Y_train)
scores = cross_val_score(modelRbf, X_train, Y_train, cv=10)
np.mean(scores)

NameError: name 'X_train' is not defined

In [19]:
model = RFC_best
predictionsFloat = model.predict(test)
dataa = np.array(predictionsFloat).astype(int).tolist()
numbers = range(892, 1310)

finalData = []
finalData.append(["PassengerId","Survived"])
for i in range(len(dataa)):
    v = []
    v.append(numbers[i])
    v.append(dataa[i])
    finalData.append(v)
    
with open("sub26.csv", "wb") as f:
    writer = csv.writer(f)
    writer.writerows(finalData)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [29]:
results

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0


In [8]:
attributes = ['Age','Cabin','Embarked','Fare','Parch','Pclass','Sex','SibSp', 'Ticket',
              'FamilySize','Surname','Title','Survived']

In [11]:
predictionColumns = ['Age','Embarked','Fare','Pclass','Sex', 'Ticket',
              'FamilySize','Title','PassengerId']

predictSet = dataset[predictionColumns].values.tolist()

trainSet = [list(item) for item in predictSet if not math.isnan(item[0])] 
testSet = [list(item) for item in predictSet if math.isnan(item[0])]

bins = [0, 5, 50, 80]
ages = list(pd.cut([item[0] for item in trainSet], bins, labels=range(len(bins)-1)))

for index, item in enumerate(trainSet):
    item[0] = ages[index]


Xtrain = list(np.array(trainSet)[:,1:-1])
Xtest = list(np.array(testSet)[:,1:-1])
Ytrain = ages




        

In [28]:
from sklearn import tree, linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler



def get_my_best_estimator(X, y, folds):

    params_grid = [{
        'C': [2**i for i in range(-5, 10)],
        'gamma': [2**i for i in range(-10, 3)]}]
    gs = GridSearchCV(SVC(), params_grid, n_jobs=-1, cv=folds)
    gs.fit(X, y)
    best_estimator = gs.best_estimator_
    return best_estimator

def get_LinearSVC_best_estimator(xTrain, yTrain):

    model = SVC(kernel='linear', random_state=42)
    parameters = {'C': (0.1, 0.5, 1, 4)}
    grid = GridSearchCV(estimator=model, param_grid=parameters)
    grid.fit(xTrain, yTrain)
    return grid.best_estimator_




In [72]:
trainScaler = MinMaxScaler(feature_range=(0, 1))
finalTrainX = trainScaler.fit_transform(finalTrainX)
testScalar = trainScaler.fit(finalTrainX)
finalTestX = testScalar.transform(finalTestX)

In [76]:
#bestLinearEstimator = get_LinearSVC_best_estimator(finalTrainX, finalTrainY)
bestEstimator = get_my_best_estimator(finalTrainX, finalTrainY, 5)

In [18]:
#modelLin = LinearSVC(C=bestLinearEstimator.C, random_state=42).fit(finalTrainX, finalTrainY)
#scores = cross_val_score(modelLin, finalTrainX, finalTrainY, cv=5)
#np.mean(scores)



In [77]:
bestEstimator

SVC(C=0.03125, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.0009765625, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [78]:
modelRbf = SVC(C=bestEstimator.C , gamma=bestEstimator.gamma, kernel='rbf', verbose=True).fit(finalTrainX, finalTrainY)
scores = cross_val_score(modelRbf, finalTrainX, finalTrainY, cv=5)
np.mean(scores)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

0.6161649089097865

In [27]:
model = modelForest

In [28]:
predictionsFloat = model.predict(test)
dataa = np.array(predictionsFloat).astype(int).tolist()
numbers = range(892, 1310)

finalData = []
finalData.append(["PassengerId","Survived"])
for i in range(len(dataa)):
    v = []
    v.append(numbers[i])
    v.append(dataa[i])
    finalData.append(v)

In [29]:
with open("newTrain2.csv", "wb") as f:
    writer = csv.writer(f)
    writer.writerows(train)