<a href="https://colab.research.google.com/github/Mateorovere/Spaceship-Titanic/blob/main/Spaceship_Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Task: predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Training data, indexed by PassengerId; it still contains the target column here.
X = pd.read_csv('train.csv', index_col='PassengerId')

# test.csv is loaded exactly like the training features, so it gets a
# feature-style name. NOTE(review): presumably this is the unlabeled
# competition set -- confirm. `y_test` is kept as an alias so that any cell
# still referring to the old name keeps working.
X_test = pd.read_csv('test.csv', index_col='PassengerId')
y_test = X_test

# Separate the target from the features.
y = X.Transported
# 'Name' is dropped on the assumption that a passenger's name carries no signal
# for this problem. Rebinding instead of inplace=True leaves no half-mutated
# frame behind.
X = X.drop(columns=['Name', 'Transported'])

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.85, test_size=0.15, random_state=0
)

In [6]:
# Group the feature columns by dtype; the groups drive the preprocessing below.
object_cols = [name for name, dtype in X.dtypes.items() if dtype == "object"]
numerical_cols = [name for name, dtype in X.dtypes.items() if dtype in ['int64', 'float64']]
# Object columns with fewer than 10 distinct values (low cardinality) are the
# ones that get one-hot encoded.
categorical_cols = [name for name in object_cols if X[name].nunique() < 10]

# Doing the preprocessing separately

In [7]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the low-cardinality categorical columns by hand.
# The encoder is left at its default sparse output and densified with
# `.toarray()`: the `sparse=False` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so neither spelling works on every version.
OH = OneHotEncoder(handle_unknown='ignore')
encoded_train = OH.fit_transform(X_train[categorical_cols]).toarray()
encoded_valid = OH.transform(X_valid[categorical_cols]).toarray()

# Give the encoded columns string names. The default integer labels (0, 1, ...)
# would mix with the string-named numeric columns after the concat below, and
# scikit-learn >= 1.2 estimators reject frames with mixed column-name types.
encoded_names = OH.get_feature_names_out(categorical_cols)

# Building the frames with the source index keeps rows aligned for the concat.
OH_X_train = pd.DataFrame(encoded_train, index=X_train.index, columns=encoded_names)
OH_X_valid = pd.DataFrame(encoded_valid, index=X_valid.index, columns=encoded_names)

# Remove every object column (the encoded ones are replaced by their dummies)
num_X_train = X_train.drop(columns=object_cols)
num_X_valid = X_valid.drop(columns=object_cols)

# Numeric features followed by the one-hot columns
OH_X_train = pd.concat([num_X_train, OH_X_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_X_valid], axis=1)
### This block could be erased because a pipeline is used later on, but it is useful to remember how it was done

In [8]:
from sklearn.impute import SimpleImputer

# Imputation: fill the remaining missing values with SimpleImputer's default strategy
my_imputer = SimpleImputer()

# The imputer returns a bare array, so the frames are rebuilt with both the
# column names and the row index of their inputs. Restoring only the columns
# would leave a fresh 0..n-1 index that no longer matches y_train / y_valid
# by label.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(OH_X_train),
    index=OH_X_train.index,
    columns=OH_X_train.columns,
)
imputed_X_valid = pd.DataFrame(
    my_imputer.transform(OH_X_valid),
    index=OH_X_valid.index,
    columns=OH_X_valid.columns,
)
### This block could be erased because a pipeline is used later on



In [9]:
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a random forest on the training split and score it on the validation split.

    Parameters
    ----------
    X_train, y_train : training features and target passed to ``fit``.
    X_valid, y_valid : validation features and target used for scoring.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the forest's predictions
    for ``X_valid``.

    NOTE(review): the stated task is a yes/no prediction, yet a regressor
    scored with MAE is used here -- confirm this is intentional.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

In [10]:
# MAE of the random forest on the manually encoded and imputed frames
score_dataset(
    X_train=imputed_X_train,
    X_valid=imputed_X_valid,
    y_train=y_train,
    y_valid=y_valid,
)



0.28918365089577797

# Versus doing the same preprocessing with a pipeline

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numeric columns: constant-value imputation (fill_value left at its default)
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most frequent value, then one-hot encode
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its own transformer
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Copies of the splits restricted to the columns the preprocessor handles
my_cols = categorical_cols + numerical_cols
X_train2, X_valid2 = (frame[my_cols].copy() for frame in (X_train, X_valid))

# Same estimator settings as score_dataset, so the two approaches are comparable
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Preprocessing and model bundled into one estimator
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])



In [12]:
# Fit on the training split, then predict the validation split; the pipeline
# applies its preprocessing step before the model in both calls.
preds = my_pipeline.fit(X_train, y_train).predict(X_valid)

# Hold-out error of the pipeline
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

MAE: 0.2859889761005351


# Cross-validation

In [15]:
# Use 5-fold cross-validation to get a more reliable error estimate than the single hold-out split
from sklearn.model_selection import cross_val_score
# The 'neg_mean_absolute_error' scorer reports MAE with its sign flipped
# (higher is better), so negate it to get ordinary per-fold MAE values.
neg_mae_per_fold = cross_val_score(
    my_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error'
)
scores = -neg_mae_per_fold
print(scores.mean())
# Note: cross-validation does not change the model itself. The mean over the
# folds is a more stable estimate of this pipeline's MAE than one hold-out
# split, so a slightly lower number here is not a model improvement.

0.2821823210209932


In [None]:
#Now with xgboost to see if it improves as it should

from xgboost import XGBRegressor

# Gradient-boosted trees trained on the manually encoded and imputed frames
my_model = XGBRegressor(n_estimators=700, learning_rate=0.015)
my_model.fit(imputed_X_train, y_train)

predictions = my_model.predict(imputed_X_valid)

# Calculate MAE. Arguments are passed as (y_true, y_pred), matching the
# documented signature and the other evaluation cells in this notebook.
mae = mean_absolute_error(y_valid, predictions)
mae
# Surprisingly, this did not go as expected: the MAE is higher than for the other alternatives



0.29860988