In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures,StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import os
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier

In [2]:
df=pd.read_csv('train.csv')
x_test=pd.read_csv('test.csv')

In [3]:
df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [4]:
df['Cabin']=df['Cabin'].str.split('/')
set(df['Cabin'].str.get(0))

{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', nan}

In [5]:
set(df['Cabin'].str.get(2))


{'P', 'S', nan}

In [6]:
#split Cabin
df['Cabin']=df['Cabin'].str.split('/')
df['Cabin_deck'] = df['PassengerId'].str.get(0).astype(int)
df['Cabin_num']=df['PassengerId'].str.get(1).astype(int)
df['Cabin_side']=df['PassengerId'].str.get(2).astype(int)

#split cabin test
x_test['Cabin']=x_test['Cabin'].str.split('/')
x_test['Cabin_deck'] = x_test['PassengerId'].str.get(0).astype(int)
x_test['Cabin_num']=x_test['PassengerId'].str.get(1).astype(int)
x_test['Cabin_side']=x_test['PassengerId'].str.get(2).astype(int)

In [7]:
#split id
df['PassengerId']=df['PassengerId'].str.split('_')
df['group'] = df['PassengerId'].str.get(0).astype(int)
df['passenger_num']=df['PassengerId'].str.get(1).astype(int)

#split id test
x_test['PassengerId']=x_test['PassengerId'].str.split('_')
x_test['group'] = x_test['PassengerId'].str.get(0).astype(int)
x_test['passenger_num']=x_test['PassengerId'].str.get(1).astype(int)

In [8]:
#fill missing data
imputer = SimpleImputer(strategy='most_frequent')

# Fit the imputer on your data (calculates the mean for imputation)
imputer.fit(df[['HomePlanet','Destination','Cabin_deck','Cabin_side']])
df[['HomePlanet','Destination','Cabin_deck','Cabin_side']] = imputer.transform(df[['HomePlanet','Destination','Cabin_deck','Cabin_side']])
x_test[['HomePlanet','Destination','Cabin_deck','Cabin_side']] = imputer.transform(x_test[['HomePlanet','Destination','Cabin_deck','Cabin_side']])



In [9]:
#new feature
df['total_spent']=df['FoodCourt']+df['RoomService']+df['ShoppingMall']+df['Spa']+df['VRDeck']

#turn to 0,1
df['VIP']=df['VIP'].apply(lambda val:1 if val==True else 0)
x_test['VIP']=x_test['VIP'].apply(lambda val:1 if val==True else 0)

df['CryoSleep']=df['CryoSleep'].apply(lambda val:1 if val==True else 0)
x_test['CryoSleep']=x_test['CryoSleep'].apply(lambda val:1 if val==True else 0)

df['Transported']=df['Transported'].apply(lambda val:1 if val==True else 0)

#fill the other missing data with 0
df['Age'] = df['Age'].fillna(df['Age'].mean())
x_test['Age'] = x_test['Age'].fillna(df['Age'].mean())

df=df.fillna(0)
x_test=x_test.fillna(0)


#dummies 
df = pd.get_dummies(df, columns=['HomePlanet','Destination','Cabin_deck','Cabin_side'], prefix=['HomePlanet','Destination','Cabin_deck','Cabin_side'],dtype=int)
x_test = pd.get_dummies(x_test, columns=['HomePlanet','Destination','Cabin_deck','Cabin_side'], prefix=['HomePlanet','Destination','Cabin_deck','Cabin_side'],dtype=int)

#y
y_train=df['Transported']


#drop columns
df=df.drop(['PassengerId','Cabin','Name','Transported'],axis=1)
x_test=x_test.drop(['PassengerId','Cabin','Name'],axis=1)


In [10]:
df.head(5)

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin_num,group,...,Cabin_side_0,Cabin_side_1,Cabin_side_2,Cabin_side_3,Cabin_side_4,Cabin_side_5,Cabin_side_6,Cabin_side_7,Cabin_side_8,Cabin_side_9
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1,...,1,0,0,0,0,0,0,0,0,0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,0,2,...,1,0,0,0,0,0,0,0,0,0
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,3,...,1,0,0,0,0,0,0,0,0,0
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,3,...,1,0,0,0,0,0,0,0,0,0
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,0,4,...,1,0,0,0,0,0,0,0,0,0


In [11]:
#split the data
x_train, x_cv, y_train, y_cv = train_test_split(df, y_train,test_size=0.20,random_state=32)

model=XGBClassifier()
model.fit(x_train,y_train)

pretrain=model.predict(x_train)

from sklearn.metrics import accuracy_score
acc=accuracy_score(pretrain,y_train)
print(acc)

0.9314063848144952


In [12]:
pretrain=model.predict(x_cv)

from sklearn.metrics import accuracy_score
acc=accuracy_score(pretrain,y_cv)
print(acc)

0.7837837837837838


In [13]:
# Initialize the XGBClassifier with parameters
model2 = XGBClassifier(
    max_depth=5,
    subsample=0.5,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1,
    learning_rate=0.1,
    n_estimators=1000,
    gamma=1
)
model2.fit(x_train,y_train)

pretrain=model2.predict(x_train)

from sklearn.metrics import accuracy_score
acc=accuracy_score(pretrain,y_train)
print(acc)

0.9620362381363244


In [14]:
pretrain=model2.predict(x_cv)

from sklearn.metrics import accuracy_score
acc=accuracy_score(pretrain,y_cv)
print(acc)

0.7768832662449684


In [15]:
df['total_spent']

0           0.0
1         736.0
2       10383.0
3        5176.0
4        1091.0
         ...   
8688     8536.0
8689        0.0
8690     1873.0
8691     4637.0
8692     4826.0
Name: total_spent, Length: 8693, dtype: float64

In [16]:
print(set(y_train))

{0, 1}


In [18]:
print(set(df['CryoSleep']))


{0, 1}
