In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures,StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import os
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier

In [2]:
df=pd.read_csv('train.csv')
x_test=pd.read_csv('test.csv')
ids=x_test['PassengerId']

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [4]:
numirecal_data=df.select_dtypes(exclude=['bool','object']).columns.to_list()

imuter = SimpleImputer(strategy='mean')
num_without_nulls = pd.DataFrame(imuter.fit_transform(df[numirecal_data]),columns=numirecal_data)
test_num_without_nulls = pd.DataFrame(imuter.transform(df[numirecal_data]),columns=numirecal_data)

df[numirecal_data] = num_without_nulls
x_test[numirecal_data] = test_num_without_nulls

In [5]:
catagorical_data=df.select_dtypes(include=['object']).columns.to_list()

imuter = SimpleImputer(strategy='most_frequent')
catagorical_without_null = pd.DataFrame(imuter.fit_transform(df[catagorical_data]),columns=catagorical_data)
df[catagorical_data] = catagorical_without_null

In [6]:
catagorical_data=x_test.select_dtypes(include=['object']).columns.to_list()

imuter = SimpleImputer(strategy='most_frequent')
catagorical_without_null = pd.DataFrame(imuter.fit_transform(df[catagorical_data]),columns=catagorical_data)
x_test[catagorical_data] = catagorical_without_null

In [7]:
df.isna().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
dtype: int64

In [8]:
#split Cabin
df['Cabin']=df['Cabin'].str.split('/')
df['Cabin_deck'] = df['PassengerId'].str.get(0).astype(int)
df['Cabin_num']=df['PassengerId'].str.get(1).astype(int)
df['Cabin_side']=df['PassengerId'].str.get(2).astype(int)

#split cabin test
x_test['Cabin']=x_test['Cabin'].str.split('/')
x_test['Cabin_deck'] = x_test['PassengerId'].str.get(0).astype(int)
x_test['Cabin_num']=x_test['PassengerId'].str.get(1).astype(int)
x_test['Cabin_side']=x_test['PassengerId'].str.get(2).astype(int)

In [9]:
#split id
df['PassengerId']=df['PassengerId'].str.split('_')
df['group'] = df['PassengerId'].str.get(0).astype(int)
df['passenger_num']=df['PassengerId'].str.get(1).astype(int)

#split id test
x_test['PassengerId']=x_test['PassengerId'].str.split('_')
x_test['group'] = x_test['PassengerId'].str.get(0).astype(int)
x_test['passenger_num']=x_test['PassengerId'].str.get(1).astype(int)

In [10]:
numirecal_data=df.select_dtypes(exclude=['bool','object']).columns.to_list()
st_scaler=StandardScaler()
df[numirecal_data]=st_scaler.fit_transform(df[numirecal_data])
x_test[numirecal_data]=st_scaler.transform(x_test[numirecal_data])

In [11]:
#label enconding
df_catagorical_data=df.select_dtypes(include=['object']).columns.to_list()
#test_catagorical_data=df.select_dtypes(include=['object']).columns.to_list()

lb=LabelEncoder()
df[df_catagorical_data]=lb.fit_transform(df_catagorical_data)
x_test[df_catagorical_data]=lb.transform(df_catagorical_data)


In [12]:
print(df['Transported'])

0       False
1        True
2       False
3       False
4        True
        ...  
8688    False
8689    False
8690     True
8691    False
8692     True
Name: Transported, Length: 8693, dtype: bool


In [13]:
#new feature
df['total_spent']=df['FoodCourt']+df['RoomService']+df['ShoppingMall']+df['Spa']+df['VRDeck']
x_test['total_spent']=x_test['FoodCourt']+x_test['RoomService']+x_test['ShoppingMall']+x_test['Spa']+x_test['VRDeck']

df['Transported']=df['Transported'].apply(lambda val:1 if val==True else 0)




'''#dummies 
df = pd.get_dummies(df, columns=['HomePlanet','Destination','Cabin_deck','Cabin_side'], prefix=['HomePlanet','Destination','Cabin_deck','Cabin_side'],dtype=int)
x_test = pd.get_dummies(x_test, columns=['HomePlanet','Destination','Cabin_deck','Cabin_side'], prefix=['HomePlanet','Destination','Cabin_deck','Cabin_side'],dtype=int)
'''


#y
y_train=df['Transported']


#drop columns
df=df.drop(['PassengerId','Cabin','Name','Transported'],axis=1)
x_test=x_test.drop(['PassengerId','Cabin','Name'],axis=1)


In [14]:
print(y_train)

0       0
1       1
2       0
3       0
4       1
       ..
8688    0
8689    0
8690    1
8691    0
8692    1
Name: Transported, Length: 8693, dtype: int64


In [15]:
#split the data
x_train, x_cv, y_train, y_cv = train_test_split(df, y_train,test_size=0.20,random_state=32)

model=XGBClassifier()
model.fit(x_train,y_train)

pretrain=model.predict(x_train)

from sklearn.metrics import accuracy_score
acc=accuracy_score(pretrain,y_train)
print(acc)

0.9134311187805579


In [16]:
pretrain=model.predict(x_cv)

from sklearn.metrics import accuracy_score
acc=accuracy_score(pretrain,y_cv)
print(acc)

0.7751581368602645


In [17]:
predict=model.predict(x_test)


In [18]:
'''y_pred=model.predict(x_test)

ids=list(ids)
submit=pd.DataFrame({'PassengerId':ids,'Survived':y_pred})
submit.to_csv('submit.csv',index=False)
'''

"y_pred=model.predict(x_test)\n\nids=list(ids)\nsubmit=pd.DataFrame({'PassengerId':ids,'Survived':y_pred})\nsubmit.to_csv('submit.csv',index=False)\n"

In [19]:
# Initialize the XGBClassifier with parameters
model2 = XGBClassifier(
    max_depth=4,
    subsample=0.6,
    colsample_bytree=0.9,
    reg_alpha=1,
    reg_lambda=2,
    learning_rate=0.05,
    n_estimators=1000,
    gamma=0
)
model2.fit(x_train,y_train)

pretrain=model2.predict(x_train)

from sklearn.metrics import accuracy_score
acc=accuracy_score(pretrain,y_train)
print(acc)

0.8697152717860225


In [None]:
pretrain=model2.predict(x_cv)


In [20]:


from sklearn.metrics import accuracy_score
acc=accuracy_score(pretrain,y_cv)
print(acc)

0.780333525014376


In [24]:
y_pred=model2.predict(x_test)
   
y_pred=y_pred.astype(bool)

In [26]:

ids=list(ids)
submit=pd.DataFrame({'PassengerId':ids,'Transported':y_pred})
submit.to_csv('submit4.csv',index=False)


In [25]:
print(y_pred)

[ True False False ...  True False False]
