In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import cross_val_score


In [14]:
def preprocess_data(df):
    """
    Preprocesses the dataframe and returns the processed dataframe
    """
    df['Spending'] = df['Spa'] + df['VRDeck'] + df['FoodCourt']
    df.drop(['Spa', 'VRDeck', 'FoodCourt'], axis=1, inplace=True)
    df['Deck'] = df['Cabin'].apply(lambda x: str(x)[0])
    df['Side'] = df['Cabin'].apply(lambda x: str(x)[-1])
    df['Group'] = df['PassengerId'].apply(lambda x: str(x)[:4])
    df.drop(['Cabin'], axis=1, inplace=True)
    df.drop(['PassengerId'], axis=1, inplace=True)
    df.drop(['HomePlanet'], axis=1, inplace=True)
    df.drop(['Destination'], axis=1, inplace=True)
    df.drop(['Name'], axis=1, inplace=True)
    df.drop(['Group'], axis=1, inplace=True)
    # dummy variables
    df = pd.get_dummies(df, columns=['Deck', 'Side', 'CryoSleep', 'VIP'])
    # delete rows with missing values
    df.dropna(inplace=True)
    return df
def missing_values(df):
    # Calculate missing value and their percentage for each feature
    missing_percent = df.isnull().sum() * 100 / df.shape[0]
    df_missing_percent = pd.DataFrame(missing_percent).round(2)
    df_missing_percent = df_missing_percent.reset_index().rename(
                    columns={
                            'index':'Feature',
                            0:'Missing Percentage (%)'
                    }
                )
    df_missing_value = df.isnull().sum()
    df_missing_value = df_missing_value.reset_index().rename(
                    columns={
                            'index':'Feature',
                            0:'Missing Values'
                    }
                )

    Final = df_missing_value.merge(df_missing_percent, how = 'inner', left_on = 'Feature', right_on = 'Feature')
    Final = Final.sort_values(by = 'Missing Percentage (%)',ascending = False)
    return Final

Unnamed: 0,Feature,Missing Values,Missing Percentage (%)
8,FoodCourt,106,2.48
10,Spa,101,2.36
3,Cabin,100,2.34
9,ShoppingMall,98,2.29
12,Name,94,2.2
2,CryoSleep,93,2.17
6,VIP,93,2.17
4,Destination,92,2.15
5,Age,91,2.13
1,HomePlanet,87,2.03


In [16]:


numericals = list(train_df.select_dtypes(include=['int64', 'float64']).columns)

#sns.pairplot(train_df[numericals+['Transported']], hue='Transported').savefig('pairplot.pdf')
test_proc_df = preprocess_data(test_df.copy())
train_df = preprocess_data(train_df)

x_train = train_df.drop(['Transported'], axis=1, inplace=False)
y_train = train_df['Transported']


numericals = train_df.select_dtypes(include=['int64', 'float64']).columns
categoricals = train_df.select_dtypes(include=['object']).columns
df_num = train_df[numericals]
df_cat = train_df[categoricals]

KeyError: 'Spa'

In [None]:
train_df.head()

Unnamed: 0,Age,RoomService,ShoppingMall,Transported,Spending,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,...,Deck_G,Deck_T,Deck_n,Side_P,Side_S,Side_n,CryoSleep_False,CryoSleep_True,VIP_False,VIP_True
0,39.0,0.0,0.0,False,0.0,0,1,0,0,0,...,0,0,0,1,0,0,1,0,1,0
1,24.0,109.0,25.0,True,602.0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
2,58.0,43.0,0.0,False,10340.0,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
3,33.0,0.0,371.0,False,4805.0,1,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
4,16.0,303.0,151.0,True,637.0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0


In [None]:
# use xgboost for regression
from xgboost import XGBClassifier
from sklearn.svm import SVC
#import lgbm
from lightgbm import LGBMClassifier
# fit model
model = XGBClassifier(learning_rate=0.01, max_depth=4, n_estimators=1000)
model.fit(x_train, y_train)

# make predictions
predictions = model.predict(test_proc_df)

predictions = map(lambda x: True if x else False, predictions)

#set up submission dataframe
submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Transported": predictions
})

# save submission
submission.to_csv("submission.csv", index=False)

In [None]:
xgb_optimal = XGBClassifier(learning_rate=0.01, max_depth=4, n_estimators=1000)
svc_optimal = SVC(C=1.5, gamma='scale', kernel='rbf')
lgbm_optimal = LGBMClassifier(learning_rate=0.05, max_depth=8, n_estimators=100)

In [None]:
from sklearn.ensemble import VotingClassifier

# Create Hard Voting Classifier
Ensemble_HV = VotingClassifier(estimators= [('SVC', svc_optimal),
                                           ('XBG', xgb_optimal),
                                           ('LGBM', lgbm_optimal)],
                              voting = 'hard')

# Create Soft Voting Classifier
Ensemble_SV = VotingClassifier(estimators= [('SVC', svc_optimal),
                                           ('XBG', xgb_optimal),
                                           ('LGBM', lgbm_optimal)],
                              voting = 'soft')

# Return Accuracy Scores
cv_HV = cross_val_score(Ensemble_HV, x_train, y_train, scoring='accuracy')
cv_SV = cross_val_score(Ensemble_SV, x_train, y_train, scoring='accuracy')

print('Hard Voting Classifier:' , cv_HV.mean())
print('Soft Voting Classifier:' , cv_SV.mean())

Hard Voting Classifier: nan
Soft Voting Classifier: nan


5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\mwili\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\mwili\anaconda3\lib\site-packages\sklearn\ensemble\_voting.py", line 324, in fit
    return super().fit(X, transformed_y, sample_weight)
  File "c:\Users\mwili\anaconda3\lib\site-packages\sklearn\ensemble\_voting.py", line 74, in fit
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
  File "c:\Users\mwili\anaconda3\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch

In [None]:
def predict(model):
    model.fit(x_train, y_train)
    Y_pred = model.predict(test_proc_df)
    pred = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': Y_pred
})
    return pred

In [None]:
predict(svc_optimal).to_csv('submission_svc_optimal.csv', index=False)
predict(xgb_optimal).to_csv('submission_xgb_optimal.csv', index=False)
predict(lgbm_optimal).to_csv('submission_lgbm_optimal.csv', index=False)
predict(Ensemble_HV).to_csv('submission_Ensemble_HV.csv', index=False)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').