In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's test.csv into a DataFrame and preview its first rows.
dftest = pd.read_csv('/kaggle/input/titanic/test.csv')
dftest.head()

In [None]:
# Read gender_submission.csv into a DataFrame and preview its first rows.
dfgender = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')
dfgender.head()

In [None]:
# Read train.csv (the labelled data used for every model below) and preview its first rows.
dftrain = pd.read_csv('/kaggle/input/titanic/train.csv')
dftrain.head()

# **Dftrain exploration**
----


In [None]:
# List the column labels available in the training frame.
dftrain.columns

# **Pré-processamento de dados**
----


In [None]:
from sklearn.model_selection import train_test_split

# Choose the predictor columns (features) and the target column (label)
feature_columns = [
    'PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked',
]
X = dftrain[feature_columns]
Y = dftrain['Survived']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=4
)
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)

In [None]:
# df_analise is a second name for X_train (no copy is made); preview its first rows.
df_analise = X_train
df_analise.head()

# **Pré-processamento textual**
---


In [None]:
from sklearn import preprocessing

# Text (categorical) columns of the training split that need a numeric encoding
df_text = df_analise[['Embarked', 'Sex', 'Cabin', 'Ticket']]

# Create the encoder. handle_unknown='ignore' makes categories that were not
# seen during fit encode as all zeros instead of raising.
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed the old
# keyword in 1.4, so try the new spelling first and fall back for old releases.
try:
    enc = preprocessing.OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    enc = preprocessing.OneHotEncoder(handle_unknown='ignore', sparse=False)
enc.fit(df_text)

# Turn the text columns into a dense 0/1 matrix
XX_text = enc.transform(df_text)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and only fall back on releases that lack it.
_feature_names = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names

# Wrap the matrix in a DataFrame with one labelled column per category.
# Note: this frame gets a fresh 0..n-1 index, not the index of df_analise.
df_text = pd.DataFrame(XX_text, columns=_feature_names(df_text.columns))

In [None]:
# Display the one-hot encoded categorical features.
df_text

# **DataFrame numérico**
---


In [None]:
# Select the numeric columns of the training split (taken as-is, no transformation here) and display them.
df_num = df_analise[['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
df_num

# **Concatenação do Pré-processamento textual com o Pré-processamento numérico**
---

In [None]:
# Put the numeric and the encoded columns side by side. Both frames get a fresh
# 0..n-1 index first so that rows are paired by position rather than by label.
df_conc = pd.concat(
    [frame.reset_index(drop=True) for frame in (df_num, df_text)],
    axis=1,
)
df_conc

# **Pré-processamento numérico**
---

In [None]:
from sklearn.impute import KNNImputer

# Alternative that was left disabled: replace nulls with zero
#   X_train_ = df_conc.fillna(0)

# Fill missing values with a KNNImputer (default settings) fitted on the
# training matrix; `knn` stays fitted so later cells can reuse it.
knn = KNNImputer()
X_knn = knn.fit_transform(df_conc)

# Back to a labelled DataFrame (the imputer returns a plain array)
X_train_ = pd.DataFrame(X_knn, columns=df_conc.columns)

X_train_.head()

In [None]:
# Sanity check: shape after dropping rows that still contain NaN vs. the pre-imputation shape.
# Identical shapes mean no row of the imputed matrix has a missing value left.
X_train_.dropna().shape, df_conc.shape

In [None]:
# Standardise the training features; `ss` keeps the statistics fitted here
ss = preprocessing.StandardScaler()

# fit_transform returns a plain array, so restore the column labels
X_train_ = pd.DataFrame(ss.fit_transform(X_train_), columns=df_conc.columns)

In [None]:
# Shape of the scaled training feature matrix.
X_train_.shape

In [None]:
# Shape of the training labels; the row count should match X_train_ above.
y_train.shape

# **Mesmos métodos aplicados no DataFrame de teste**

In [None]:
from sklearn import preprocessing

# Same text (categorical) columns, taken from the hold-out split
df_text_test = X_test[['Embarked', 'Sex', 'Cabin', 'Ticket']]

# Encode with the encoder that was fitted on the training split (no re-fit).
# The encoder was created with handle_unknown='ignore', so categories that
# only occur in the hold-out rows become all-zero instead of raising.
XX_text_test = enc.transform(df_text_test)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and only fall back on releases that lack it.
_feature_names = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names

# Wrap the matrix in a DataFrame with one labelled column per category
df_text_test = pd.DataFrame(XX_text_test, columns=_feature_names(df_text_test.columns))

In [None]:
# Select the numeric columns of the hold-out split (same columns as df_num) and display them.
df_num_test = X_test[['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
df_num_test

In [None]:
# Put the numeric and the encoded hold-out columns side by side. Both frames get
# a fresh 0..n-1 index first so that rows are paired by position rather than by label.
df_conc_test = pd.concat(
    [frame.reset_index(drop=True) for frame in (df_num_test, df_text_test)],
    axis=1,
)
df_conc_test

In [None]:
# Impute the hold-out matrix with the imputer that was fitted on the training
# matrix. Re-fitting it here (as was done before) would fill hold-out gaps from
# hold-out neighbours, i.e. preprocess train and test differently and let
# evaluation data shape its own preprocessing.
X_knn_test = knn.transform(df_conc_test)

# Back to a labelled DataFrame with the same columns as the training matrix
X_test_ = pd.DataFrame(X_knn_test, columns=df_conc.columns)

X_test_.head()

In [None]:
# Sanity check: shape after dropping rows that still contain NaN vs. the pre-imputation shape.
# Identical shapes mean no row of the imputed hold-out matrix has a missing value left.
X_test_.dropna().shape, df_conc_test.shape

In [None]:
# Scale the hold-out features with the mean/std learned from the TRAINING data.
# Calling fit_transform here would re-estimate the statistics on the hold-out
# rows, so train and test would live on different scales.
# The input is rebuilt from the imputed array (X_knn_test) rather than from
# X_test_ itself so that re-running this cell does not scale the data twice;
# the column labels are attached because `ss` was fitted on a labelled frame.
X_test_ = ss.transform(pd.DataFrame(X_knn_test, columns=df_conc.columns))

# transform returns a plain array, so restore the column labels
X_test_ = pd.DataFrame(X_test_, columns=df_conc.columns)
X_test_

In [None]:
# Shape of the scaled hold-out feature matrix.
X_test_.shape

In [None]:
# Shape of the hold-out labels; the row count should match X_test_ above.
y_test.shape

#  **1 Classification** 
---

# **1.1 K - Nearest Neighbors**
----

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import matplotlib.pyplot as plt

# Sweep k = 1 .. Ks-1 and record the hold-out accuracy for each value
Ks = 20
candidate_ks = range(1, Ks)
mean_acc = np.zeros(len(candidate_ks))
std_acc = np.zeros(len(candidate_ks))

for n in candidate_ks:
    # Train a classifier with n neighbours and predict the hold-out rows
    neigh = KNeighborsClassifier(n_neighbors=n)
    neigh.fit(X_train_, y_train)
    predknn = neigh.predict(X_test_)

    # Slot n-1 holds the result for k = n (arrays are 0-based)
    hits = (predknn == y_test)
    mean_acc[n - 1] = metrics.accuracy_score(y_test, predknn)
    # Standard error of the accuracy: std of the hit indicator / sqrt(sample count)
    std_acc[n - 1] = np.std(hits) / np.sqrt(predknn.shape[0])

mean_acc

In [None]:
# Accuracy per k, with shaded bands at +/-1 and +/-3 standard errors
k_values = range(1, Ks)
fig, ax = plt.subplots()
ax.plot(k_values, mean_acc, 'g')
ax.fill_between(k_values, mean_acc - 1 * std_acc, mean_acc + 1 * std_acc, alpha=0.10)
ax.fill_between(k_values, mean_acc - 3 * std_acc, mean_acc + 3 * std_acc, alpha=0.10, color="green")
ax.legend(('Accuracy ', '+/- 1xstd', '+/- 3xstd'))
ax.set_ylabel('Accuracy ')
ax.set_xlabel('Number of Neighbors (K)')
fig.tight_layout()
plt.show()

In [None]:
# Report the highest accuracy of the sweep; argmax is 0-based while slot i holds k = i+1, hence the +1.
print( "The best accuracy was with", mean_acc.max(), "with k=", mean_acc.argmax()+1) 

In [None]:
# Final KNN model with a hand-picked k; its predictions (predknn) are reused in the metrics section
k = 9
neigh = KNeighborsClassifier(n_neighbors=k)
neigh.fit(X_train_, y_train)
predknn = neigh.predict(X_test_)

# Accuracy on the rows the model was trained on vs. on the hold-out rows
train_accuracy = metrics.accuracy_score(y_train, neigh.predict(X_train_))
test_accuracy = metrics.accuracy_score(y_test, predknn)
print("Train set Accuracy: ", train_accuracy)
print("Test set Accuracy: ", test_accuracy)

# **1.2 Decision Tree**
----

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Entropy-based decision tree limited to depth 4.
# random_state is pinned (same seed as the train/test split) because scikit-learn
# permutes the features at every split, so ties between equally good splits
# would otherwise be broken differently from run to run.
drugTree = DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=4)
drugTree  # displays the (still unfitted) estimator and its configuration

In [None]:
# Fit the tree on the training matrix (fit returns the estimator itself) and predict the hold-out rows
predTree = drugTree.fit(X_train_, y_train).predict(X_test_)

In [None]:
# Eyeball the first ten decision-tree predictions next to the first ten true labels.
print (predTree[0:10])
print (y_test[0:10])

In [None]:
from sklearn import metrics

# Fraction of hold-out rows the decision tree classified correctly.
print("DecisionTrees's Accuracy: ", metrics.accuracy_score(y_test, predTree))

# **1.3 Support Vector Machine**
----

In [None]:
from sklearn.svm import SVC
# Support-vector classifier with an RBF kernel, fitted on the training matrix.
# fit() is the cell's last expression, so the fitted estimator is also displayed.
suportvm = SVC(kernel='rbf')
suportvm.fit(X_train_, y_train) 

In [None]:
# Predict the hold-out rows with the fitted SVM; predsvm is reused in the metrics section.
predsvm = suportvm.predict(X_test_)

In [None]:
# Eyeball the first ten SVM predictions next to the first ten true labels.
print (predsvm [0:10])
print (y_test [0:10])

# **1.4 Logistic Regression**
----

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with strong regularisation (small C) using the liblinear solver
logir = LogisticRegression(C=0.01, solver='liblinear')
logir.fit(X_train_, y_train)

# Hold-out predictions, reused in the metrics section
predlr = logir.predict(X_test_)

In [None]:
# Eyeball the first ten logistic-regression predictions next to the first ten true labels.
print (predlr [0:10])
print (y_test [0:10])

# **1.5 Stochastic Gradient Descent Classifier**
----

In [None]:
from sklearn.linear_model import SGDClassifier
# Linear classifier trained with stochastic gradient descent (default loss/penalty).
# SGD shuffles the training data each epoch, so without a seed the fitted model
# and its predictions change from run to run; pin random_state (same seed as the
# train/test split) to make the notebook reproducible.
sgd = SGDClassifier(random_state=4)
sgd.fit(X_train_, y_train)

# Hold-out predictions
predsgd = sgd.predict(X_test_)

In [None]:
# Eyeball the first ten SGD predictions next to the first ten true labels.
print (predsgd [0:10])
print (y_test [0:10])

# **2 Model Accuracy**
----

In [None]:
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

In [None]:
# Metrics for the K-Nearest-Neighbours predictions (predknn). The label used to
# say "LR", which made this output indistinguishable from the logistic-regression one.
# Jaccard is computed with class 0 as the positive label; F1 is weighted by class support.
print("KNN Jaccard index: %.2f" % jaccard_score(y_test, predknn, pos_label=0))
print("KNN F1-score: %.2f" % f1_score(y_test, predknn, average='weighted'))

In [None]:
# Metrics for the decision-tree predictions (predTree). The label used to say
# "LR", which made this output indistinguishable from the logistic-regression one.
# Jaccard is computed with class 0 as the positive label; F1 is weighted by class support.
print("Decision Tree Jaccard index: %.2f" % jaccard_score(y_test, predTree, pos_label=0))
print("Decision Tree F1-score: %.2f" % f1_score(y_test, predTree, average='weighted'))

In [None]:
# Metrics for the support-vector-machine predictions (predsvm). The label used to
# say "LR", which made this output indistinguishable from the logistic-regression one.
# Jaccard is computed with class 0 as the positive label; F1 is weighted by class support.
print("SVM Jaccard index: %.2f" % jaccard_score(y_test, predsvm, pos_label=0))
print("SVM F1-score: %.2f" % f1_score(y_test, predsvm, average='weighted'))

In [None]:
# Metrics for the logistic-regression predictions (predlr):
# Jaccard with class 0 as the positive label, and support-weighted F1.
lr_jaccard = jaccard_score(y_test, predlr, pos_label=0)
lr_f1 = f1_score(y_test, predlr, average='weighted')
print("LR Jaccard index: %.2f" % lr_jaccard)
print("LR F1-score: %.2f" % lr_f1)