In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [5]:
train_data.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [7]:
train_data.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [8]:
def cleantrain(data): 
    #the passenger ID does not matter, nor does the name
    data.drop(["PassengerId", 'Name'], axis = 1, inplace = True)
    data = pd.get_dummies(data, columns= ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
                                                 , drop_first= True)
    data = pd.concat([data, pd.get_dummies(data['Cabin'].str[0], drop_first = True)], axis = 1)
    data = pd.concat([data, pd.get_dummies(data['Cabin'].str[-1], drop_first = True)], axis = 1)
    data.drop('Cabin', axis = 1, inplace= True)
    data['Age'].fillna(data['Age'].median(), inplace = True)
    data['RoomService'].fillna(data['RoomService'].median(), inplace = True)
    data['FoodCourt'].fillna(data['FoodCourt'].median(), inplace = True)
    data['ShoppingMall'].fillna(data['ShoppingMall'].median(), inplace = True)
    data['Spa'].fillna(data['Spa'].median(), inplace = True)
    data['VRDeck'].fillna(data['VRDeck'].median(), inplace = True)

    return data

In [9]:
def cleantest(data): 
    #the passenger ID does not matter, nor does the name
    data.drop(['Name'], axis = 1, inplace = True)
    data = pd.get_dummies(data, columns= ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
                                                 , drop_first= True)
    data = pd.concat([data, pd.get_dummies(data['Cabin'].str[0], drop_first = True)], axis = 1)
    data = pd.concat([data, pd.get_dummies(data['Cabin'].str[-1], drop_first = True)], axis = 1)
    data.drop('Cabin', axis = 1, inplace= True)
    data['Age'].fillna(data['Age'].median(), inplace = True)
    data['RoomService'].fillna(data['RoomService'].median(), inplace = True)
    data['FoodCourt'].fillna(data['FoodCourt'].median(), inplace = True)
    data['ShoppingMall'].fillna(data['ShoppingMall'].median(), inplace = True)
    data['Spa'].fillna(data['Spa'].median(), inplace = True)
    data['VRDeck'].fillna(data['VRDeck'].median(), inplace = True)

    return data

In [10]:
train = cleantrain(train_data)
test = cleantest(test_data)

In [11]:
train.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,...,Destination_TRAPPIST-1e,VIP_True,B,C,D,E,F,G,T,S
0,39.0,0.0,0.0,0.0,0.0,0.0,False,1,0,0,...,1,0,1,0,0,0,0,0,0,0
1,24.0,109.0,9.0,25.0,549.0,44.0,True,0,0,0,...,1,0,0,0,0,0,1,0,0,1
2,58.0,43.0,3576.0,0.0,6715.0,49.0,False,1,0,0,...,1,1,0,0,0,0,0,0,0,1
3,33.0,0.0,1283.0,371.0,3329.0,193.0,False,1,0,0,...,1,0,0,0,0,0,0,0,0,1
4,16.0,303.0,70.0,151.0,565.0,2.0,True,0,0,0,...,1,0,0,0,0,0,1,0,0,1


In [12]:
test.isnull().sum()

PassengerId                  0
Age                          0
RoomService                  0
FoodCourt                    0
ShoppingMall                 0
Spa                          0
VRDeck                       0
HomePlanet_Europa            0
HomePlanet_Mars              0
CryoSleep_True               0
Destination_PSO J318.5-22    0
Destination_TRAPPIST-1e      0
VIP_True                     0
B                            0
C                            0
D                            0
E                            0
F                            0
G                            0
T                            0
S                            0
dtype: int64

In [13]:
X = train.drop('Transported', axis = 1)
y = train['Transported']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=142)

In [14]:
X.shape, y.shape

((8693, 20), (8693,))

In [15]:
log = LogisticRegression()
# fitting data in our model
log.fit(X_train,y_train)

pred = log.predict(X_test)

# CLASSIFICATION REPORT

print(accuracy_score(y_test, pred))

0.7860839562967222


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [16]:
clf = RandomForestClassifier(n_estimators=50, max_depth =10, random_state=15)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.7993099482461185


In [17]:
subid = test.PassengerId
test.drop('PassengerId', inplace=True, axis=1)

clf.fit(X,y)
pred = clf.predict(test)
pred = pred.astype(bool)
output = pd.DataFrame({'PassengerId': subid,'Transported': pred.squeeze()})
output.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [18]:
output.shape

(4277, 2)