In [146]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import r2_score, accuracy_score

In [147]:
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

In [148]:
train = train.drop(["Name", "Sex"], axis=1)

In [149]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,35.0,0,0,373450,8.05,,S


In [150]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [151]:
lb = LabelEncoder()

In [152]:
train["Cabin"] = lb.fit_transform(train["Cabin"])
train["Ticket"] = lb.fit_transform(train["Ticket"])
train["Embarked"] = lb.fit_transform(train["Embarked"])

In [153]:
train["Age"] = train["Age"].fillna(train["Age"].median())
train["Cabin"] = train["Cabin"].fillna(train["Cabin"].median())
train["Ticket"] = train["Ticket"].fillna(train["Ticket"].median())
train["Embarked"] = train["Embarked"].fillna(train["Embarked"].median())

In [154]:
X = train.drop("Survived", axis=1)
y = train["Survived"]

In [155]:
X_train, X_aprov, y_train, y_aprov = train_test_split(X, y, test_size=.2, random_state=42)

In [156]:
def pipeline(preprocessor, model):
    pipe = make_pipeline(
        SimpleImputer(),
        preprocessor,
        model
    )

    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_aprov)
    print(f'R2: {r2_score(pred, y_aprov)*100:.2f}')
    print(f'Score: {pipe.score(X_aprov, y_aprov)*100:.2f}')
    

In [157]:
pipeline(StandardScaler(), LinearRegression())

R2: -396.40
Score: 25.27


In [158]:
pipeline(StandardScaler(), LogisticRegression())

R2: -33.21
Score: 73.18


In [159]:
pipeline(StandardScaler(), RandomForestClassifier(random_state=42))

R2: -5.60
Score: 75.42


In [160]:
pipeline(StandardScaler(), DecisionTreeClassifier())

R2: -36.32
Score: 65.92


In [176]:
predict_set = pd.read_csv("./gender_submission.csv")
predict_set.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [177]:
predict_response = predict_set["Survived"]

In [163]:
test = test.drop(["Name", "Sex"], axis=1)

In [164]:
test["Ticket"] = lb.fit_transform(test["Ticket"])
test["Cabin"] = lb.fit_transform(test["Cabin"])
test["Embarked"] = lb.fit_transform(test["Embarked"])

In [165]:
test["Age"] = test["Age"].fillna(test["Age"].median())
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

In [166]:
model = make_pipeline(
    SimpleImputer(),
    StandardScaler(),
    RandomForestClassifier(random_state=42)
)

In [167]:
model.fit(X_train, y_train)

In [168]:
predict = model.predict(test)

In [169]:
acc = accuracy_score(predict, predict_response)
print(f"Acurracia: {acc*100:.2f}%")

Acurracia: 44.74%


In [170]:
model_lg = make_pipeline(
    SimpleImputer(),
    StandardScaler(),
    LogisticRegression()
)

In [171]:
model_lg.fit(X_train, y_train)

In [172]:
predict = model_lg.predict(test)

In [173]:
acc = accuracy_score(predict, predict_response)
print(f"Acurracia: {acc*100:.2f}%")

Acurracia: 61.00%


In [181]:
from sklearn.neural_network import MLPClassifier

In [232]:
NN = MLPClassifier(hidden_layer_sizes=(15,), random_state=1, warm_start=True)

In [233]:
NN.fit(X_train, y_train)



In [234]:
predict = NN.predict(test)

In [235]:
print(accuracy_score(predict, predict_response)*100)

40.66985645933015


In [236]:
predictions = pd.DataFrame({"PassengerId": predict_set["PassengerId"], "Survived": predict})
predictions.to_csv('output.csv', index=False)