In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Importamos algunos agoritmos de clasificacion:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Una forma para evaluar el modelo
from sklearn.metrics import accuracy_score


df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
df.isnull().sum() # Debemos preparar el set de datos antes de empezar con las predcciones

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [3]:
# Se puede hacer tb para un determinado parametro
df.Age.isnull().sum()

177

In [15]:
# Tenemos que completar los campos de edad nulos:
df.Age = df.Age.fillna(df.Age.mean())
df.Age.isnull().sum()

0

In [4]:
df.Ticket.value_counts()

Ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: count, Length: 681, dtype: int64

In [5]:
df.Cabin.isnull().sum(), len(df) # NO merece tener en cuenta este parametro ya que en la mayoria de casos esta vacio

(687, 891)

In [6]:
df.Embarked.value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [7]:
df.Embarked.isnull().sum()

2

In [8]:
# Los dos datos que estan vacios los ponemos como S
df['Embarked'] = df['Embarked'].fillna('S')
df.Embarked.value_counts()

Embarked
S    646
C    168
Q     77
Name: count, dtype: int64

In [13]:
# Borraremos del set de datos las columnas que no nos aportan informacion para el modelo de prediccion:
# Name por  razones obvias, cada uno se llama de una forma
# Ticket podemos ver que tampoco aporta demasiada informacion, hay demasiados tipos diferentes y casi todos tienen uno diferente
# Cabin: casi todos los valores son nulos
df = df.drop(["Name", "Ticket", "Cabin"], axis=1)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [17]:
# Datos categoricos
df = pd.get_dummies(df, columns=['Sex', 'Pclass', 'Embarked'], drop_first=True)
df.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,1,0,22.0,1,0,7.25,True,False,True,False,True
1,2,1,38.0,1,0,71.2833,False,False,False,False,False
2,3,1,26.0,0,0,7.925,False,False,True,False,True
3,4,1,35.0,1,0,53.1,False,False,False,False,True
4,5,0,35.0,0,0,8.05,True,False,True,False,True


In [20]:
# Nos toca reallizar el escalado de datos
# Esto lo hacemos para que los campos con valores altos no tengan mas relevancia que los demas,
# para esto asignamos un mismo rango de valores a todos

# Forma 1: StandardScaler (la otra opcion es MinMaxScaler)
df.Age = (df.Age - np.mean(df.Age, axis=0)) / (np.std(df.Age, axis=0))
df.Fare = (df.Fare - np.mean(df.Fare, axis=0)) / (np.std(df.Fare, axis=0))

df.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,1,0,-0.592481,1,0,-0.502445,True,False,True,False,True
1,2,1,0.638789,1,0,0.786845,False,False,False,False,False
2,3,1,-0.284663,0,0,-0.488854,False,False,True,False,True
3,4,1,0.407926,1,0,0.42073,False,False,False,False,True
4,5,0,0.407926,0,0,-0.486337,True,False,True,False,True


In [21]:
# Obtenemos X e Y (INPUT Y OUTPUT)

X = df.drop("Survived", axis=1)
X.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,1,-0.592481,1,0,-0.502445,True,False,True,False,True
1,2,0.638789,1,0,0.786845,False,False,False,False,False
2,3,-0.284663,0,0,-0.488854,False,False,True,False,True
3,4,0.407926,1,0,0.42073,False,False,False,False,True
4,5,0.407926,0,0,-0.486337,True,False,True,False,True


In [23]:
Y = df["Survived"]
Y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [25]:
# Ahora vamos a crear los grupos de train y test
from sklearn.model_selection import train_test_split

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
331,332,1.215947,0,0,-0.074583,True,False,False,False,True
733,734,-0.515526,0,0,-0.386671,True,True,False,False,True
382,383,0.177063,0,0,-0.488854,True,False,True,False,True
704,705,-0.284663,1,0,-0.49028,True,False,True,False,True
813,814,-1.82375,4,2,-0.018709,False,False,True,False,True


# Algoritmos de clasificacion


In [29]:
# KNeighborsClassifier
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc_KN = accuracy_score(y_test, y_pred)
acc_KN

0.5586592178770949

In [30]:
# DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc_DT = accuracy_score(y_test, y_pred)
acc_DT

0.776536312849162

In [31]:
# RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc_RF = accuracy_score(y_test, y_pred)
acc_RF

0.8379888268156425

In [32]:
# GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc_NB = accuracy_score(y_test, y_pred)
acc_NB

0.7597765363128491

In [33]:
# SVC
clf = SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc_SVC = accuracy_score(y_test, y_pred)
acc_SVC

0.5865921787709497

In [None]:
# En mi caso con el que he obtenido mejores resultados es sin duda el RandomForestClassifier 
#(Podria ser porque se me olvido eliminar el passengerId)

In [34]:
test = pd.read_csv("test.csv")
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [36]:
# Tenemos que hacer el mismo procesamiento dde datos con test que el que hicimos con el set original
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [37]:
test.Age = test.Age.fillna(test.Age.mean())
test.Fare = test.Fare.fillna(test.Fare.mean())
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

In [39]:
test = test.drop(["Name", "Ticket", "Cabin"], axis=1)
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [40]:
test.Age = (test.Age - np.mean(test.Age, axis=0)) / (np.std(test.Age, axis=0))
test.Fare = (test.Fare - np.mean(test.Fare, axis=0)) / (np.std(test.Fare, axis=0))

df.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,1,0,-0.592481,1,0,-0.502445,True,False,True,False,True
1,2,1,0.638789,1,0,0.786845,False,False,False,False,False
2,3,1,-0.284663,0,0,-0.488854,False,False,True,False,True
3,4,1,0.407926,1,0,0.42073,False,False,False,False,True
4,5,0,0.407926,0,0,-0.486337,True,False,True,False,True


In [41]:
test = pd.get_dummies(test, columns=['Sex', 'Pclass', 'Embarked'], drop_first=True)
test.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,892,0.334993,0,0,-0.498407,True,False,True,True,False
1,893,1.32553,1,0,-0.513274,False,False,True,False,True
2,894,2.514175,0,0,-0.465088,True,True,False,True,False
3,895,-0.25933,0,0,-0.483466,True,False,True,False,True
4,896,-0.655545,1,1,-0.418471,False,False,True,False,True


In [44]:
# Ahora ya tenemos el set de datos con el mismo formato qe el anterior, de modo que podemos probar los modelos que hemos entrenado

clf = KNeighborsClassifier()
# Colcemos a entrenar con los datos iniciales
clf.fit(X_train, y_train)

# Realizamos la prediccion
y_pred= clf.predict(test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [46]:

df_submission = pd.read_csv("gender_submission.csv")
df_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [47]:
# Metemos los datos que hemos obtenido:
df_submission["Survived"] = y_pred
df_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [48]:
# Ahora habria que guardar el csv actualizado y ya lo podemos subir a kaggle 
df_submission.to_csv("gender_submission.csv", index=False)