<a href="https://colab.research.google.com/github/Mateorovere/Spaceship-Titanic/blob/main/Spaceship_Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
#task is to predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X = pd.read_csv('train.csv',index_col='PassengerId')
y_test = pd.read_csv('test.csv',index_col='PassengerId')
y = X.Transported
X.drop(['Name','Transported'],axis=1,inplace=True) #i droped it because i would guess that the name has nothing to do with this problem

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.85, test_size=0.15, random_state=0)

In [39]:
object_cols = [cname for cname in X.columns if X[cname].dtype == "object"]
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]
categorical_cols = [cname for cname in X.columns if X[cname].nunique() < 10 and  X[cname].dtype == "object"] #with low cardinality

#Doing the preprocessing separated

In [None]:
from sklearn.preprocessing import OneHotEncoder
OH = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_X_train = pd.DataFrame(OH.fit_transform(X_train[categorical_cols]))
OH_X_valid = pd.DataFrame(OH.transform(X_valid[categorical_cols]))

OH_X_train.index = X_train.index
OH_X_valid.index = X_valid.index

num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

OH_X_train = pd.concat([num_X_train, OH_X_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_X_valid], axis=1)
### this block could be erased because i did a pipeline later on, but is usful to remember how it was done

In [None]:
from sklearn.impute import SimpleImputer

# Imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(OH_X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(OH_X_valid))

# Imputation removed column names; put them back
imputed_X_train.columns = OH_X_train.columns
imputed_X_valid.columns = OH_X_valid.columns
### this block could be erased because i did a pipeline later on

# Vs doing the pipeline

In [40]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train2 = X_train[my_cols].copy()
X_valid2 = X_valid[my_cols].copy()

model = LogisticRegression(max_iter=300)

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

In [41]:
from sklearn.metrics import accuracy_score
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model
accuracy = accuracy_score(y_valid, preds)
print('Accuracy', accuracy)

Accuracy 0.7760736196319018


#Cross validation

In [42]:
#seeing if cross validation improves the accuracy
from sklearn.model_selection import cross_val_score
scores = cross_val_score(my_pipeline, X, y,
                              cv=5,
                              scoring='f1')
print(scores.mean())

0.7907563181245333


In [43]:
#this last part is to submit to a kaggle competition
from  sklearn.model_selection import cross_val_predict

test_preds = my_pipeline.predict(y_test)


output = pd.DataFrame({'PassengerId': y_test.index,
                       'Transported': test_preds})
output.to_csv('submission.csv', index=False)

In [44]:
sub=pd.read_csv('submission.csv')
sub

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True
