In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
test_df['Transported'] = False

In [4]:
df = pd.concat([train_df, test_df], sort = False)

In [5]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [6]:
df.shape[0] == train_df.shape[0] + test_df.shape[0]

True

In [7]:
df.isna().sum()

PassengerId       0
HomePlanet      288
CryoSleep       310
Cabin           299
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Name            294
Transported       0
dtype: int64

In [8]:
df[['Deck', 'Num', 'Side']] = df['Cabin'].str.split('/', expand = True)
df = df.drop(columns = ['Cabin'])

In [9]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S


In [10]:
df['Deck'] = df['Deck'].fillna('U')

In [11]:
df['Num'] = df['Num'].fillna(-1)

In [12]:
df['Side'] = df['Side'].fillna('U')

In [13]:
df.isna().sum()

PassengerId       0
HomePlanet      288
CryoSleep       310
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Name            294
Transported       0
Deck              0
Num               0
Side              0
dtype: int64

In [14]:
df['Deck'].value_counts()

Deck
F    4239
G    3781
E    1323
B    1141
C    1102
D     720
A     354
U     299
T      11
Name: count, dtype: int64

In [15]:
df['Deck'] = df['Deck'].map({
    'F': 0,
    'G': 1,
    'E': 2,
    'B': 3,
    'C': 4,
    'D': 5,
    'A': 6,
    'U': 7,
    'T': 8,
})

In [16]:
df['Side'].value_counts()

Side
S    6381
P    6290
U     299
Name: count, dtype: int64

In [17]:
df['Side'] = df['Side'].map({
    'S': 0,
    'P': 1,
    'U': 2,
})

In [18]:
df.drop(['Name'], axis = 1, inplace = True)

In [20]:
df.select_dtypes(exclude=["object"]).columns.tolist()

['Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'Transported',
 'Deck',
 'Side']

In [21]:
impute_list = ['Age', 'VIP', 'Num', 'CryoSleep', 'Side', 'Deck', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [22]:
rest = list(set(df.columns) - set(impute_list))

In [23]:
df_rest = df[rest]

In [24]:
imp = KNNImputer(n_neighbors=5)

In [25]:
df_impute = imp.fit_transform(df[impute_list])

In [26]:
df_impute = pd.DataFrame(df_impute, columns=impute_list)

In [27]:
df = pd.concat([df_rest.reset_index(drop = True), df_impute.reset_index(drop = True)], axis = 1)

In [28]:
df.isna().sum()

HomePlanet      288
PassengerId       0
Destination     274
Transported       0
Age               0
VIP               0
Num               0
CryoSleep         0
Side              0
Deck              0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
dtype: int64

In [29]:
df['HomePlanet'] = df['HomePlanet'].fillna('U')
df['Destination'] = df['Destination'].fillna('U')

In [30]:
category_colls = ['HomePlanet', 'Destination']

In [31]:
for col in category_colls:
    df = pd.concat([df, pd.get_dummies(df[col], prefix = col)], axis = 1)

In [33]:
df.drop(columns=category_colls, inplace=True)

In [34]:
bill_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df['amt_spent'] = df[bill_cols].sum(axis = 1)
df['std_amt_spent'] = df[bill_cols].std(axis = 1)
df['mean_amt_spent'] = df[bill_cols].mean(axis = 1)

In [36]:
df.corr()

Unnamed: 0,PassengerId,Transported,Age,VIP,Num,CryoSleep,Side,Deck,RoomService,FoodCourt,...,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_U,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_U,amt_spent,std_amt_spent,mean_amt_spent
PassengerId,1.0,0.014628,-0.015205,0.011598,0.665212,-0.003554,0.003018,-0.00861,-0.001211,-0.010573,...,-0.003572,-0.018262,0.003564,-0.007143,0.015622,-0.00333,-0.000521,-0.008251,-0.011631,-0.008251
Transported,0.014628,1.0,-0.050519,-0.018644,-0.03524,0.324552,-0.068138,0.06279,-0.174742,0.034771,...,0.131977,0.005643,0.006403,0.083625,0.00076,-0.072731,-0.000554,-0.140428,-0.121155,-0.140428
Age,-0.015205,-0.050519,1.0,0.080972,-0.135749,-0.065953,-0.013037,0.172652,0.067299,0.122475,...,0.218642,0.024134,0.002416,0.022309,-0.032164,-0.00087,0.004758,0.180016,0.175425,0.180016
VIP,0.011598,-0.018644,0.080972,1.0,-0.091207,-0.08106,0.011414,0.146679,0.061817,0.122562,...,0.139645,0.045371,-0.00063,0.038614,-0.005918,-0.026696,-0.009969,0.165767,0.154094,0.165767
Num,0.665212,-0.03524,-0.135749,-0.091207,1.0,-0.039968,-0.042168,-0.60306,-0.010837,-0.175268,...,-0.4771,0.058,0.007981,-0.14499,0.105792,0.059594,0.001456,-0.20886,-0.197466,-0.20886
CryoSleep,-0.003554,0.324552,-0.065953,-0.08106,-0.039968,1.0,-0.00656,0.057726,-0.257396,-0.212602,...,0.102374,0.033578,0.000302,0.069066,0.087543,-0.108452,-0.01811,-0.385804,-0.388096,-0.385804
Side,0.003018,-0.068138,-0.013037,0.011414,-0.042168,-0.00656,1.0,0.162449,0.015047,-0.009081,...,-0.018517,0.006517,-0.010545,-0.011475,0.009466,0.005147,-0.003468,-0.001915,-0.00591,-0.001915
Deck,-0.00861,0.06279,0.172652,0.146679,-0.60306,0.057726,0.162449,1.0,0.031895,0.251862,...,0.668231,-0.116363,-0.00312,0.206567,-0.119478,-0.101595,-0.010686,0.298029,0.282676,0.298029
RoomService,-0.001211,-0.174742,0.067299,0.061817,-0.010837,-0.257396,0.015047,0.031895,1.0,-0.018722,...,-0.073776,0.253287,-0.004925,-0.023471,-0.060989,0.059838,-0.005584,0.224382,0.218389,0.224382
FoodCourt,-0.010573,0.034771,0.122475,0.122562,-0.175268,-0.212602,-0.009081,0.251862,-0.018722,1.0,...,0.363057,-0.127256,-0.012601,0.130845,-0.062059,-0.071583,-0.010677,0.745141,0.751886,0.745141


In [35]:
df.corr()['Transported'].sort_values(ascending=False)

Transported                  1.000000
CryoSleep                    0.324552
HomePlanet_Europa            0.131977
Destination_55 Cancri e      0.083625
Deck                         0.062790
FoodCourt                    0.034771
PassengerId                  0.014628
HomePlanet_U                 0.006403
HomePlanet_Mars              0.005643
ShoppingMall                 0.004163
Destination_PSO J318.5-22    0.000760
Destination_U               -0.000554
VIP                         -0.018644
Num                         -0.035240
Age                         -0.050519
Side                        -0.068138
Destination_TRAPPIST-1e     -0.072731
HomePlanet_Earth            -0.119644
std_amt_spent               -0.121155
mean_amt_spent              -0.140428
amt_spent                   -0.140428
VRDeck                      -0.142783
Spa                         -0.154836
RoomService                 -0.174742
Name: Transported, dtype: float64

In [38]:
df['3_high_cols'] = df['CryoSleep'] + df['HomePlanet_Europa'] + df['Destination_55 Cancri e']
df['3_low_cols'] = df['mean_amt_spent'] + df['amt_spent'] + df['HomePlanet_Earth']

In [39]:
df.corr()['Transported'].sort_values(ascending=False)

Transported                  1.000000
CryoSleep                    0.324552
3_high_cols                  0.284251
HomePlanet_Europa            0.131977
Destination_55 Cancri e      0.083625
Deck                         0.062790
FoodCourt                    0.034771
PassengerId                  0.014628
HomePlanet_U                 0.006403
HomePlanet_Mars              0.005643
ShoppingMall                 0.004163
Destination_PSO J318.5-22    0.000760
Destination_U               -0.000554
VIP                         -0.018644
Num                         -0.035240
Age                         -0.050519
Side                        -0.068138
Destination_TRAPPIST-1e     -0.072731
HomePlanet_Earth            -0.119644
std_amt_spent               -0.121155
mean_amt_spent              -0.140428
amt_spent                   -0.140428
3_low_cols                  -0.140452
VRDeck                      -0.142783
Spa                         -0.154836
RoomService                 -0.174742
Name: Transp

In [40]:
train_df, test_df = df[:train_df.shape[0]], df[train_df.shape[0]:]

In [41]:
test_df = test_df.drop(columns='Transported')

In [42]:
train_df.shape, test_df.shape

((8693, 26), (4277, 25))

In [62]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [55]:
X = train_df.drop(columns='Transported')
Y = train_df['Transported']

In [56]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [57]:
model1 = LogisticRegression()
model1.fit(X_train, Y_train)
pred = model1.predict(X_test)
accuracy_score(Y_test, pred)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.772857964347326

In [58]:
model2 = DecisionTreeClassifier()
model2.fit(X_train, Y_train)
pred = model2.predict(X_test)
accuracy_score(Y_test, pred)

0.7349051178838413

In [59]:
model3 = RandomForestClassifier()
model3.fit(X_train, Y_train)
pred = model3.predict(X_test)
accuracy_score(Y_test, pred)

0.7912593444508338

In [66]:
pred = model3.predict(test_df)

final = pd.DataFrame()
final['PassengerId'] = test_df['PassengerId']
final['Transported'] = pred1

final.to_csv('final.csv', index = False)