In [1]:
import pandas as pd
# Load the training and test CSV files, using the PassengerId column as the row index.
# NOTE: both paths are absolute and machine-specific.
TRAIN_CSV = "/home/manu/Documents/Data Science/Projects/Titanic/train.csv"
TEST_CSV = "/home/manu/Documents/Data Science/Projects/Titanic/test.csv"

X_full = pd.read_csv(TRAIN_CSV, index_col='PassengerId')
X_test_full = pd.read_csv(TEST_CSV, index_col='PassengerId')

In [2]:
from sklearn.model_selection import train_test_split
# Prepare DataFrames

# Drop rows with a missing target, then split the target off from the predictors.
# Both steps mutate X_full in place, so re-running this cell without reloading
# X_full fails because the 'Survived' column is already gone.
X_full.dropna(subset=['Survived'], axis=0, inplace=True)
y = X_full.Survived
X_full.drop(columns=['Survived'], inplace=True)

# Hold out 20% of the labelled rows for validation (fixed seed for repeatability)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full, y, train_size=0.8, test_size=0.2, random_state=0
)

# Sort the columns into two groups in a single pass:
#   - categorical: object dtype with fewer than 10 unique values
#     ("cardinality" = number of unique values; the cut-off is convenient but arbitrary)
#   - numerical:   int64 or float64 dtype
# Columns matching neither rule are left out of the model.
categorical_cols = []
numerical_cols = []
for cname in X_train_full.columns:
    column = X_train_full[cname]
    if column.dtype == "object" and column.nunique() < 10:
        categorical_cols.append(cname)
    if column.dtype in ['int64', 'float64']:
        numerical_cols.append(cname)

# Keep the selected columns only, categorical first, as independent copies
my_cols = categorical_cols + numerical_cols
X_train, X_valid, X_test = (
    frame[my_cols].copy() for frame in (X_train_full, X_valid_full, X_test_full)
)

In [3]:
#X_train.head()

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# Numerical columns: fill missing values with the column median
numerical_transformer = SimpleImputer(strategy='median')

# Categorical columns: fill missing values with the most frequent value, then one-hot
# encode. handle_unknown='ignore' is set so categories not seen during fit are ignored
# rather than raising at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its own transformer
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Gradient-boosted regressor; it is fitted on the Survived target, so its
# predictions are continuous scores that need thresholding afterwards.
model = XGBRegressor(learning_rate=0.1, n_estimators=1000)

# Chain preprocessing and the model so both are fitted/applied together
clf = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Fit on the training split, then score the validation split
clf.fit(X_train, y_train)
preds = clf.predict(X_valid)

In [9]:
# ADAPTING RESULTS
# The regressor's predictions are continuous scores, not 0/1 labels, so binarize
# them: scores >= 0.5 become 1 (survived), everything else becomes 0 (didn't survive).

# Wrap first and assign the index afterwards: the validation index is attached by
# position, and the cell still works when re-run with preds already a Series
# (passing index= to the constructor would reindex by label instead).
preds = pd.Series(preds)
preds.index = y_valid.index

# Vectorized threshold (replaces the element-by-element .iloc loop); the result is
# an integer 0/1 Series.
preds = (preds >= 0.5).astype(int)


In [10]:
# MODEL VALIDATION: compare the predicted labels against the held-out validation labels.
# Both Series carry the same index, so == compares them element by element.
Comparision = y_valid == preds

total_values = len(y_valid)
Correct_predictions = Comparision.sum()  # True counts as 1

success_pct = Correct_predictions * 100/total_values
print("Percentage of success predictions: ", success_pct, "%")


Percentage of success predictions:  82.68156424581005 %


In [11]:
# Inspect the training predictors; print() emits the plain-text (row-truncated) DataFrame repr
print (X_train)

                Sex Embarked  Pclass   Age  SibSp  Parch     Fare
PassengerId                                                      
141          female        C       3   NaN      0      2  15.2458
440            male        S       2  31.0      0      0  10.5000
818            male        C       2  31.0      1      1  37.0042
379            male        C       3  20.0      0      0   4.0125
492            male        S       3  21.0      0      0   7.2500
...             ...      ...     ...   ...    ...    ...      ...
836          female        C       1  39.0      1      1  83.1583
193          female        S       3  19.0      1      0   7.8542
630            male        Q       3   NaN      0      0   7.7333
560          female        S       3  36.0      1      0  17.4000
685            male        S       2  60.0      1      1  39.0000

[712 rows x 7 columns]


In [12]:
# Retraining model and generating predictions
# Retrain with all the labelled data available (train + validation) before
# predicting on the test set.

# NOTE: this rebinds X_train to the full set of labelled rows. Re-running the earlier
# fit cell after this one (without re-running the split cell) would therefore train
# on rows that are also in the validation split.
X_train = X_full[my_cols]

clf.fit(X_train, y)

preds = clf.predict(X_test)

# ADAPTING RESULTS
# The regressor's predictions are continuous scores, not 0/1 labels, so binarize
# them: scores >= 0.5 become 1 (survived), everything else becomes 0 (didn't survive).

# Wrap first and assign the index afterwards so the test index is attached by
# position, even if preds is already a Series when the cell is re-run.
preds = pd.Series(preds)
preds.index = X_test.index

# Vectorized threshold (replaces the element-by-element .iloc loop); the result is
# an integer 0/1 Series.
preds = (preds >= 0.5).astype(int)


In [13]:
# Formatting and saving result

# Single 'Survived' column keyed by the predictions' index, cast to int because
# that is the valid format for the CSV in the competition.
DF_preds = preds.to_frame(name='Survived').astype(int)

# NOTE: absolute, machine-specific output path
DF_preds.to_csv("/home/manu/Documents/Data Science/Projects/Titanic/result_pipeline.csv")