In [1]:
import pandas as pd
import numpy as np

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Exploratoy Analysis

In [2]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Checking the missing values
round(train.isnull().sum()*100/train.shape[0],1)

# With a mojority of null values, let's ignore "Cabin". 

PassengerId     0.0
Survived        0.0
Pclass          0.0
Name            0.0
Sex             0.0
Age            19.9
SibSp           0.0
Parch           0.0
Ticket          0.0
Fare            0.0
Cabin          77.1
Embarked        0.2
dtype: float64

In [5]:
# Enconding other variables

## Sex

def sex_to_bin(sex):
    if sex == 'male':
        return 1
    else:
        return 0
train['Sex_bin'] = train['Sex'].map(sex_to_bin) 
test['Sex_bin']  = test['Sex'].map(sex_to_bin)

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_bin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


In [6]:
# Enconding other variables

## Name
honorific = []
for name in train['Name'].str.split():
    honorific.append(name[1])
print(set(honorific))

{'Carlo,', 'Capt.', 'Rev.', 'Mme.', 'Billiard,', 'Walle,', 'Mlle.', 'der', 'Col.', 'Velde,', 'y', 'Master.', 'Ms.', 'the', 'Major.', 'Jonkheer.', 'Mulder,', 'Mr.', 'Planke,', 'Gordon,', 'Shawah,', 'Messemaeker,', 'Steen,', 'Dr.', 'Mrs.', 'Impe,', 'Melkebeke,', 'Cruyssen,', 'Miss.', 'Pelsmaeker,', 'Don.'}


In [7]:
def military_enc(h):
    n = ['Capt.','Major,','Col']

    if n[0] in h or n[1] in h or n[2] in h :
        return 1
    else:
        return 0

train['Military'] = train['Name'].map(military_enc) 
test['Military']  = test['Name'].map(military_enc)

In [8]:
f = pd.get_dummies(train['Embarked'])
train = pd.concat([train,f], axis = 1)

f = pd.get_dummies(test['Embarked'])
test = pd.concat([test,f], axis = 1)


In [9]:
train.describe()


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_bin,Military,C,Q,S
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.647587,0.012346,0.188552,0.08642,0.722783
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,0.47799,0.110485,0.391372,0.281141,0.447876
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104,0.0,0.0,0.0,0.0,0.0
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542,1.0,0.0,0.0,0.0,1.0
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0,1.0,0.0,0.0,0.0,1.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0,1.0,1.0


In [10]:
variables = ['Survived','Pclass','Age','SibSp','Parch','Fare','Sex_bin','Military','C','Q','S']
X = train[variables]
# X.head()
abs(X.corr()['Survived']).sort_values()

Q           0.003650
Military    0.025544
SibSp       0.035322
Age         0.077221
Parch       0.081629
S           0.155660
C           0.168240
Fare        0.257307
Pclass      0.338481
Sex_bin     0.543351
Survived    1.000000
Name: Survived, dtype: float64

In [20]:
variables = ['Pclass','Fare','Sex_bin','C','S','Age']
X = train[variables]
y = train['Survived']

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [23]:
model1 = LogisticRegression()
model2 = RandomForestClassifier()
model3 = KNeighborsClassifier(n_neighbors = 13)
model4 = GaussianNB()
model5 = SVC()

models =[model1,model2,model3,model4,model5]
model_n =["Logistic Regression","Random Forest","K Neighbors","Nayve Bayes","SVC"]
from sklearn.model_selection import RepeatedKFold
for modelo in models:
    resultados = []
    kf = RepeatedKFold(n_splits=2,n_repeats =10, random_state=0)
    for linha_t,linha_v in kf.split(X):
   
        X_treino, X_valid = X.iloc[linha_t].fillna(X.iloc[linha_t].mean()),X.iloc[linha_v].fillna(X.iloc[linha_v].mean())
        y_treino, y_valid = y.iloc[linha_t],y.iloc[linha_v]
        modelo.fit(X_treino,y_treino)
        previ = modelo.predict(X_valid)
        acc = np.mean(y_valid == previ)
        resultados.append(acc)

    # print("acc max", np.max(resultados))
    print("acc med", np.mean(resultados))
    # print("acc min", np.min(resultados))
    print()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
acc med 0.7897871214793166

acc med 0.7976527938731295

acc med 0.6898964579029576

acc med 0.7683554189550058

acc med 0.670934146218572



In [None]:
X = X.fillna(X.mean())
modelo.fit(X,y)

X_prev = test[variables]
X_prev = X_prev.fillna(X_prev.mean())

Previsoes = modelo.predict(X_prev)

In [None]:
sub = pd.Series(Previsoes,index = test['PassengerId'],name = 'Survived')
sub.head()
sub.to_csv("My_Sub_logistic.csv",header = True)