In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, LabelEncoder, OneHotEncoder
from xgboost import XGBClassifier

In [2]:
# Read the labelled training data into `df` and the unlabelled test data into
# `x_test`; both CSV files are expected in the working directory.
df, x_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))


In [3]:
# Encode the boolean-like flags as 0/1 integers.
# Anything that is not equal to True (False, but also a missing value) becomes
# 0, so gaps in VIP / CryoSleep are treated as False here and are therefore
# NOT removed by the dropna() further down.
for frame in (df, x_test):
    for flag in ('VIP', 'CryoSleep'):
        frame[flag] = frame[flag].eq(True).astype('int64')

# The target column only exists in the training data.
df['Transported'] = df['Transported'].eq(True).astype('int64')

# Remove the columns that are not used as features, from both frames.
unused_cols = ['PassengerId', 'Cabin', 'Name']
df = df.drop(unused_cols, axis=1)
x_test = x_test.drop(unused_cols, axis=1)

# Discard training rows that still contain a missing value.
# x_test is deliberately left alone, so it keeps whatever gaps test.csv has.
df = df.dropna()

# One-hot encode the categorical columns.
categorical_cols = ['HomePlanet', 'Destination']
df = pd.get_dummies(df, columns=categorical_cols, prefix=categorical_cols, dtype=int)
x_test = pd.get_dummies(x_test, columns=categorical_cols, prefix=categorical_cols, dtype=int)

# Separate the target (y) from the features.
y_train = df['Transported']
df = df.drop('Transported', axis=1)

# get_dummies ran on each frame independently, so a category present in only
# one file would give the two frames different columns. Force x_test onto the
# training layout: same columns, same order, 0 for any dummy it never produced.
# This is a no-op when both files contain the same categories.
x_test = x_test.reindex(columns=df.columns, fill_value=0)

In [4]:

# Summary statistics of the 0/1-encoded VIP flag in the training features.
df['VIP'].describe()

count    7281.000000
mean        0.023486
std         0.151451
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: VIP, dtype: float64

In [5]:
# (rows, columns) of the training feature frame after preprocessing.
df.shape

(7281, 14)

In [6]:
# Hold out 20% of the preprocessed training rows for validation.
# NOTE: this rebinds y_train from the full target to its training portion, so
# the cell cannot be re-run on its own; re-run the preprocessing cell first.
x_train, x_cv, y_train, y_cv = train_test_split(
    df, y_train, test_size=0.20, random_state=32
)

# Baseline model: XGBoost with its default hyper-parameters.
model = XGBClassifier()
model.fit(x_train, y_train)

# Accuracy on the rows the model was fitted on (an optimistic figure).
# accuracy_score takes (y_true, y_pred), in that order.
pretrain = model.predict(x_train)
acc = accuracy_score(y_train, pretrain)
print(acc)

0.8925137362637363


In [7]:
# Accuracy of the baseline XGBoost model on the held-out validation rows.
# `pretrain` is reused here for the validation predictions.
pretrain = model.predict(x_cv)

# accuracy_score takes (y_true, y_pred), in that order.
acc = accuracy_score(y_cv, pretrain)
print(acc)

0.7735072065888813


In [8]:
# Second XGBoost model with hand-picked hyper-parameters: shallower trees,
# row/column subsampling, and many boosting rounds at a small learning rate.
model2 = XGBClassifier(
    max_depth=4,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1,
    learning_rate=0.05,
    n_estimators=1000,
    random_state=42,
)
model2.fit(x_train, y_train)

# Accuracy on the rows the model was fitted on (an optimistic figure).
# accuracy_score takes (y_true, y_pred), in that order.
pretrain = model2.predict(x_train)
acc = accuracy_score(y_train, pretrain)
print(acc)

0.8810096153846154


In [9]:
# Accuracy of the tuned XGBoost model (model2) on the held-out validation rows.
# `pretrain` is reused here for the validation predictions.
pretrain = model2.predict(x_cv)

# accuracy_score takes (y_true, y_pred), in that order.
acc = accuracy_score(y_cv, pretrain)
print(acc)

0.7741935483870968


In [10]:
# Soft-voting ensemble of three classifiers; everything it needs is imported
# in the notebook's first cell.

# Define individual models
# NOTE(review): the MLP is fitted on x_train as produced by the split above,
# i.e. before any StandardScaler is applied. scikit-learn recommends scaling
# inputs for MLPs; consider wrapping nn_model in a scaling Pipeline.
nn_model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
xgb_model = XGBClassifier(learning_rate=0.1, n_estimators=100, random_state=42)
gb_model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, random_state=42)

# Create an ensemble of models; voting='soft' averages the predicted class
# probabilities of the three members.
ensemble_model = VotingClassifier(
    estimators=[('nn', nn_model), ('xgb', xgb_model), ('gb', gb_model)],
    voting='soft',
)

# Train the ensemble model
ensemble_model.fit(x_train, y_train)

# Make predictions on the held-out validation rows
y_pred = ensemble_model.predict(x_cv)

# Calculate accuracy
accuracy = accuracy_score(y_cv, y_pred)
print("Ensemble model accuracy:", accuracy)


Ensemble model accuracy: 0.7886067261496225


In [11]:


# Feature preparation for the logistic-regression model that follows.
# Only the numeric columns listed here are kept; the 0/1 flags and one-hot
# columns are dropped from x_train / x_cv / x_test by this cell.
# NOTE: x_train, x_cv and x_test are rebound from DataFrames to NumPy arrays,
# so this cell cannot be re-run without re-running the split cell first.
NUMERIC_COLS = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Test rows are never dropped or imputed earlier in the notebook, so x_test may
# still contain missing values, which PolynomialFeatures rejects. Fill them
# with the training medians (computed before x_train is overwritten below).
train_medians = x_train[NUMERIC_COLS].median()
x_test_numeric = x_test[NUMERIC_COLS].fillna(train_medians)

# Standardise using statistics learned from the training split only.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train[NUMERIC_COLS])
x_cv = scaler.transform(x_cv[NUMERIC_COLS])
x_test = scaler.transform(x_test_numeric)

# Create polynomial features (degree 2) and apply the same expansion to every
# split, so that x_test has the column layout the fitted model expects.
poly = PolynomialFeatures(2, include_bias=False)
x_train = poly.fit_transform(x_train)
x_cv = poly.transform(x_cv)
x_test = poly.transform(x_test)


In [12]:
# Fit a logistic regression on the scaled, polynomial-expanded features.
# This rebinds `model`, which previously referred to the baseline XGBClassifier.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LogisticRegression(max_iter=1000).fit(x_train, y_train)
model

In [13]:
# Mean accuracy of the logistic regression: training split first, then the
# held-out validation split.
for features, labels in ((x_train, y_train), (x_cv, y_cv)):
    print(model.score(features, labels))

0.7884615384615384
0.7762525737817433
