In [1]:
! mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [2]:
!pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!cp kaggle.json ~/.kaggle/kaggle.json

In [4]:
!chmod 600 /root/.kaggle/kaggle.json

In [5]:
!kaggle competitions download -c spaceship-titanic

spaceship-titanic.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
!unzip spaceship-titanic.zip

Archive:  spaceship-titanic.zip
replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

# Load and do EDA

### Notes about the features
- **PassengerId** - People in a group are often family members, but not always.
- **Cabin** - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
- **RoomService, FoodCourt, ShoppingMall, Spa, VRDeck** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

MACHINE LEARNING TASK - Predict **Transported** (Binary Classification)

### TO DO
 - Fill in missing values in a bunch of the columns
 - Balance out the dataset if it is imbalanced
 - Drop high cardinality columns

In [None]:
!pip install pandas-profiling

In [None]:
! pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip 

In [None]:
import pandas as pd
from pandas_profiling import ProfileReport

In [None]:
df = pd.read_csv('train.csv')

In [None]:
df.head()

In [None]:

df.info()

In [None]:
profile = ProfileReport(df, title='Spachip Titanic')

In [None]:
profile.to_notebook_iframe()

In [None]:
def split_cabin(x):
  """Split a cabin value of the form deck/num/side into its pieces.

  The value is converted with ``str`` and split on '/'. When that yields
  fewer than three pieces, a list of three 'Missing' placeholders is
  returned instead of the pieces.
  """
  parts = str(x).split('/')
  return parts if len(parts) >= 3 else ['Missing'] * 3
    

In [None]:
def split_passenger(x):
    """Break a passenger id into its '_'-separated pieces.

    The value is converted with ``str`` first, so non-string input is
    accepted; the result is always a list of strings.
    """
    text = str(x)
    pieces = text.split('_')
    return pieces

In [None]:
# Create a preprocessing function to transform our dataset
def preprocessing(df):
    """Clean and feature-engineer a passenger frame in place.

    The frame passed in is modified directly and nothing is returned.

    Steps applied:
      * HomePlanet, CryoSleep, Destination, VIP: missing values become the
        extra category 'Missing' (rows are kept, not dropped).
      * Cabin: replaced by two columns, Deck (first '/'-separated piece) and
        Side (third piece). Values with fewer than three pieces, including
        missing ones, give 'Missing' for both.
      * PassengerGroup: the part of PassengerId before the first '_'.
        PassengerId itself is kept.
      * Age: missing values are filled with the mean Age of this frame.
      * RoomService, FoodCourt, ShoppingMall, Spa, VRDeck: missing values
        are filled with 0.
      * Name: dropped because of its high cardinality.

    Args:
        df: pandas DataFrame containing all of the columns named above.
    """
    # Every fill is written back with `df[col] = ...`. Calling
    # `df[col].fillna(..., inplace=True)` operates on a temporary Series when
    # pandas copy-on-write is active and would leave `df` unchanged.
    for column in ('HomePlanet', 'CryoSleep'):
        df[column] = df[column].fillna('Missing')

    # Cabin -> Deck / Side. `map(str)` turns missing values into a string with
    # no '/', so they fall into the "fewer than three pieces" case.
    cabin_parts = df['Cabin'].map(str).str.split('/')
    has_all_parts = cabin_parts.str.len() >= 3
    df['Deck'] = cabin_parts.str[0].where(has_all_parts, 'Missing')
    df['Side'] = cabin_parts.str[2].where(has_all_parts, 'Missing')
    df.drop(columns=['Cabin'], inplace=True)

    # Passenger group: the prefix of the passenger id
    df['PassengerGroup'] = df['PassengerId'].map(str).str.split('_').str[0]

    df['Destination'] = df['Destination'].fillna('Missing')

    # Age: impute with the mean of the frame being processed
    df['Age'] = df['Age'].fillna(df['Age'].mean())

    df['VIP'] = df['VIP'].fillna('Missing')

    # Monetary spending columns: a missing bill is treated as no spending
    for column in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
        df[column] = df[column].fillna(0)

    # Drop Name due to high cardinality
    df.drop(columns=['Name'], inplace=True)



In [None]:
abt = df.copy()

In [None]:
abt.head()

In [None]:
preprocessing(abt)

In [None]:
abt.info()

In [None]:
abt.head()

# MODELING
 - Feature and Target values - X, y
 - One hot encode any categorical features
 - Train, holdout split
 - Train with several algorithms

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as pyplot
import seaborn as sns

In [None]:
# Target: the label being predicted
y = abt['Transported']
# Features: everything except the label and the raw identifier, with the
# categorical columns one-hot encoded
X = pd.get_dummies(abt.drop(columns=['Transported', 'PassengerId']))

In [None]:
seed = 1234

In [None]:
#Create training and testing partitions
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

In [None]:
len(X.columns)

In [None]:
abt['HomePlanet'].unique()

In [None]:
sns.countplot(x='Transported', data=df)

# Setup ML Pipelines

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
# One scaler + classifier pipeline per algorithm, keyed by a short algorithm
# name; both classifiers share the notebook's seed.
pipelines = dict(
    rf=make_pipeline(StandardScaler(), RandomForestClassifier(random_state=seed)),
    gb=make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=seed)),
)

In [None]:
RandomForestClassifier().get_params()

In [None]:
GradientBoostingClassifier().get_params()

In [None]:
# Candidate hyper-parameters for the grid search, one entry per key in
# `pipelines`. Inner keys use the `<pipeline step>__<parameter>` form.
grid = {
    'rf': {
        'randomforestclassifier__n_estimators' : [100, 200, 300]
    },
    'gb': {
        'gradientboostingclassifier__n_estimators' : [100, 200, 300]
    }
}

In [None]:
pipelines['rf']

In [None]:
# Tuned models, stored under the same short names used in `pipelines`
fit_models = {}
for algo in pipelines:
    print(f'Training the {algo} model.')
    # 10-fold cross-validated search over this algorithm's candidate settings
    model = GridSearchCV(
        estimator=pipelines[algo],
        param_grid=grid[algo],
        cv=10,
        n_jobs=-1,
    )
    # fit() returns the fitted search object itself
    fit_models[algo] = model.fit(X_train, y_train)

Training the rf model.
Training the gb model.


# Evaluate Performance on Test Partition

  - Grab the testing data from the test.csv and evaluate on that

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
# Score every tuned model on the held-out split

for algo, model in fit_models.items():
    yhat = model.predict(X_test)
    # Same three metrics, computed in the same order: accuracy, precision, recall
    accuracy, precision, recall = (
        score(y_test, yhat)
        for score in (accuracy_score, precision_score, recall_score)
    )
    print(f' Metrics for {algo} accuracy: {accuracy}, recall: {recall}, precison: {precision}')
 
 #Previous results
 #Metrics for rf accuracy: 0.7910276073619632, recall: 0.7646604938271605, precison: 0.8050365556458164
 #Metrics for gb accuracy: 0.8075153374233128, recall: 0.8479938271604939, precison: 0.7827635327635327


# Save Best Model

In [None]:
import pickle

In [None]:
with open('gradienteboosted.pkl', 'wb') as f:
  pickle.dump(fit_models['gb'], f)

In [None]:
# Round-trip check: read the pickled grid-search object back from disk.
# NOTE: pickle.load can execute arbitrary code, so only load files that this
# notebook wrote itself.
with open('gradienteboosted.pkl', 'rb') as f:
  reloaded_model = pickle.load(f)

In [None]:
reloaded_model

# Predict on test data

In [None]:
# Read in the TestCSV Dataset
test_df = pd.read_csv('test.csv')
# Deep copy, so the raw frame (and its PassengerId column) stays untouched
abt_test = test_df.copy()
# Run through the processing pipeline (modifies abt_test in place)
preprocessing(abt_test)
# One hot encoding categorical variables
abt_test = pd.get_dummies(abt_test.drop('PassengerId', axis=1))
# Align with the training feature matrix: one-hot encoding the test set on its
# own yields a different column set (categories seen only in train are missing,
# categories seen only in test are extra). Reindexing keeps exactly the columns
# in X, in the same order, and fills dummies absent from the test set with 0.
abt_test = abt_test.reindex(columns=X.columns, fill_value=0)


In [None]:
abt_test

In [None]:
len(abt_test.columns)

In [None]:
len(X.columns)

In [None]:
yhat_test = fit_models['gb'].predict(abt_test)

In [None]:
# Build the submission column-wise so each column keeps its own dtype.
# The id header is spelled `PassengerId`, matching the column name in the data
# files; the previous `PassengerID` spelling did not match it.
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': yhat_test,
})

In [None]:
submission.head()

# Submit to Kaggle

In [None]:
submission.to_csv('KaggleSpaceShip_submission.csv', index = False)

In [None]:
!kaggle competitions submit -c spaceship-titanic -m "Initial gradient boosting model" -f "KaggleSpaceShip_submission.csv"