In [1]:
# Import needed packages

import statistics as stat
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

In [2]:
# Load the labelled training set and preview its first five rows.
train_data = pd.read_csv(filepath_or_buffer="train.csv")
train_data.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
# Load the unlabelled test set (no 'Transported' column) and preview it.
test_data = pd.read_csv(filepath_or_buffer="test.csv")
test_data.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [4]:
# Count how many training rows fall into each target class.
# Transported: False = 0, True = 1
train_data.Transported.value_counts()

Transported
True     4378
False    4315
Name: count, dtype: int64

The training data is almost balanced: 4,378 passengers were transported and 4,315 were not.

# Pre-Processing

In [5]:
# Encode the boolean target as 0/1. LabelBinarizer sorts the class labels,
# so False becomes 0 (negative) and True becomes 1 (positive), regardless of
# which class is more frequent.
lb = LabelBinarizer()
lb.fit(train_data['Transported'].values)
train_data['Transported'] = lb.transform(train_data['Transported'].values)


The pandas `.isnull()` method flags both `None` and `NaN` values as missing.

In [6]:
# Per-column missing-value counts, followed by the number of fully duplicated rows.
print(f'\nNull Values in Training \n{train_data.isnull().sum()}')
print(f'\nNull Values in Testing \n{test_data.isnull().sum()}')
print(f'\nDuplicated values in train {train_data.duplicated().sum()}')
print(f'\nDuplicated values in test {test_data.duplicated().sum()}')



Null Values in Training 
PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

Null Values in Testing 
PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

Duplicated values in train 0

Duplicated values in test 0


Every feature column other than `PassengerId` contains null values, in both the training and the test set.

# Dropping Unnecessary Columns

In [7]:
# Remove the identifier and name columns from both frames. The target column
# is removed from the training features as well.
train_data = train_data.drop(columns=['PassengerId', 'Name', 'Transported'])
test_data = test_data.drop(columns=['PassengerId', 'Name'])

# Treatment of missing data and duplicates

In [8]:
# Show dtypes and non-null counts of the remaining training columns.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
dtypes: float64(6), object(5)
memory usage: 747.2+ KB


In [9]:
# Show dtypes and non-null counts of the remaining test columns.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    4190 non-null   object 
 1   CryoSleep     4184 non-null   object 
 2   Cabin         4177 non-null   object 
 3   Destination   4185 non-null   object 
 4   Age           4186 non-null   float64
 5   VIP           4184 non-null   object 
 6   RoomService   4195 non-null   float64
 7   FoodCourt     4171 non-null   float64
 8   ShoppingMall  4179 non-null   float64
 9   Spa           4176 non-null   float64
 10  VRDeck        4197 non-null   float64
dtypes: float64(6), object(5)
memory usage: 367.7+ KB


# Treatment of Numerical Columns

In [10]:
# Numeric features are the columns that are neither object- nor bool-typed.
train_data_numerics_columns = list(train_data.select_dtypes(exclude=['object','bool']).columns)

# Fill every missing numeric value with the mean of its column.
imputer = SimpleImputer(strategy='mean')
train_numeric_filled = imputer.fit_transform(train_data[train_data_numerics_columns])
num_without_nulls_train_data = pd.DataFrame(train_numeric_filled, columns=train_data_numerics_columns)

# Sanity check: every count below should be zero.
num_without_nulls_train_data.isnull().sum()

Age             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

In [11]:
# Write the imputed numeric columns back into the training frame; the imputed
# frame was built without an explicit index, so rows line up on the default
# RangeIndex. Only the categorical columns should still report nulls.
train_data[train_data_numerics_columns] = num_without_nulls_train_data
train_data.isnull().sum()

HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age               0
VIP             203
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
dtype: int64

In [12]:
# Numeric features are the columns that are neither object- nor bool-typed.
test_data_numerics_columns = test_data.select_dtypes(exclude=['object','bool']).columns.tolist()

# Fill the test set's gaps with the TRAINING means rather than means computed
# from the test set itself, so both sets go through the same transformation
# and no test-set statistics are used. Mean-filling does not change a
# column's mean, so fitting on the (possibly already imputed) training
# columns gives the same statistics as the original training imputer.
imputer = SimpleImputer(strategy='mean')
imputer.fit(train_data[test_data_numerics_columns])
num_without_nulls_test_data = pd.DataFrame(
    imputer.transform(test_data[test_data_numerics_columns]),
    columns=test_data_numerics_columns,
)

# Sanity check: every count below should be zero.
num_without_nulls_test_data.isnull().sum()

Age             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

In [13]:
# Write the imputed numeric columns back into the test frame; the imputed
# frame was built without an explicit index, so rows line up on the default
# RangeIndex. Only the categorical columns should still report nulls.
test_data[test_data_numerics_columns] = num_without_nulls_test_data
test_data.isnull().sum()

HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age               0
VIP              93
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
dtype: int64

# Treatment of Categorical Columns

In [None]:
# Categorical features are the object- or bool-typed columns.
categorical_col_train_data = list(train_data.select_dtypes(include=['object','bool']).columns)

# Fill every missing categorical value with the most frequent value of its column.
imputer = SimpleImputer(strategy='most_frequent')
train_categorical_filled = imputer.fit_transform(train_data[categorical_col_train_data])
cat_without_nulls_train_data = pd.DataFrame(train_categorical_filled, columns=categorical_col_train_data)

# Sanity check: every count below should be zero.
cat_without_nulls_train_data.isnull().sum()

In [None]:
# Write the imputed categorical columns back into the training frame and
# re-check the per-column null counts.
train_data[categorical_col_train_data] = cat_without_nulls_train_data
train_data.isnull().sum()

In [None]:
# Categorical features are the object- or bool-typed columns.
categorical_col_test_data = test_data.select_dtypes(include=['object','bool']).columns.tolist()

# Fill the test set's gaps with the most frequent TRAINING value of each
# column rather than the test set's own mode, so both sets are imputed
# consistently and no test-set statistics are used. Filling with the mode
# does not change the mode, so fitting on the (possibly already imputed)
# training columns gives the same statistics as the original training imputer.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(train_data[categorical_col_test_data])
cat_without_nulls_test_data = pd.DataFrame(
    imputer.transform(test_data[categorical_col_test_data]),
    columns=categorical_col_test_data,
)

# Sanity check: every count below should be zero.
cat_without_nulls_test_data.isnull().sum()

In [None]:
# Write the imputed categorical columns back into the test frame and
# re-check the per-column null counts.
test_data[categorical_col_test_data] = cat_without_nulls_test_data
test_data.isnull().sum()

In [None]:
# Fraction of missing values per training column (the mean of the boolean
# null mask equals nulls / total rows).
for i, p in train_data.isnull().mean().items():
    print(f'Percentage of null values Train Data column {i} :',f"{p:.2%}")

In [None]:
# Fraction of missing values per test column (the mean of the boolean
# null mask equals nulls / total rows).
for i, p in test_data.isnull().mean().items():
    print(f'Percentage of null values Test Data column {i} :',f"{p:.2%}")

# Transforming Cabin to deck/num/side

In [None]:
# Break each '/'-separated Cabin string into its three parts and store them
# as the new Deck, Num and Side columns.
train_cabin_parts = train_data['Cabin'].str.split(pat='/', expand=True)
train_data[['Deck', 'Num','Side']] = train_cabin_parts
train_data.head()

In [None]:
# The raw Cabin string is redundant once Deck/Num/Side exist.
train_data = train_data.drop(columns=['Cabin'])

In [None]:
# Break each '/'-separated Cabin string into its three parts and store them
# as the new Deck, Num and Side columns.
test_cabin_parts = test_data['Cabin'].str.split(pat='/', expand=True)
test_data[['Deck', 'Num','Side']] = test_cabin_parts
test_data.head()

In [None]:
# The raw Cabin string is redundant once Deck/Num/Side exist.
test_data = test_data.drop(columns=['Cabin'])

Transforming Age to int and VIP to int

In [None]:
# Age was mean-imputed, so it can hold fractional values; astype(int)
# truncates them to whole years in both frames.
for frame in (train_data, test_data):
    frame['Age'] = frame['Age'].astype(int)

In [None]:
# VIP holds True/False values in an object-typed column (nulls were already
# imputed above). Cast explicitly to int so the column is guaranteed to be
# numeric 0/1: `replace` on an object column does not reliably change the
# dtype (its implicit downcasting is deprecated in pandas 2.2), which would
# leave VIP as object and cause it to be one-hot encoded later.
train_data['VIP'] = train_data['VIP'].astype(int)
test_data['VIP'] = test_data['VIP'].astype(int)

#  Solving mismatch in train and test set after categorical encoding

In [None]:
# Tag each row with its origin (1 = training set, 0 = test set) so the two
# frames can be separated again after being encoded together.
train_data = train_data.assign(train=1)
test_data = test_data.assign(train=0)

In [None]:
# Stack training rows on top of test rows so both get identical dummy columns.
combined = pd.concat(objs=[train_data, test_data], axis=0)

In [None]:
# 'Num' comes out of the Cabin string split as text. Left as an object
# column, get_dummies would create one indicator column per distinct cabin
# number; cast it to an integer so it is kept as a single numeric feature.
combined['Num'] = combined['Num'].astype(int)

# One-hot encode the remaining object-typed (categorical) columns.
combined = pd.get_dummies(combined)

In [None]:
# Preview the encoded frame to inspect the generated dummy columns.
combined.head()

In [None]:
# Separate the encoded rows back into training and test frames using the
# 'train' flag, dropping the flag afterwards. A non-inplace drop returns a new
# frame instead of mutating a boolean-mask slice of `combined`, which avoids
# pandas' SettingWithCopy pitfall.
train_data_dum = combined[combined['train'] == 1].drop(columns=['train'])
test_data_dum = combined[combined['train'] == 0].drop(columns=['train'])

In [None]:
# The 'Transported' column was dropped from train_data earlier, so the labels
# are read from the CSV again.
targets = pd.read_csv('train.csv')['Transported']

# Hold out 20% of the labelled rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    train_data_dum,
    targets,
    test_size=0.2,
    random_state=42,
    stratify=targets,
)

Passing `stratify=targets` to `train_test_split` keeps the class distribution consistent between the training and test splits.

In [None]:
# Print the relative class frequencies of both splits to confirm the stratification.
for split_name, split_labels in (('y_train', y_train), ('y_test', y_test)):
    print(f'{split_name} class distribution')
    print(split_labels.value_counts(normalize=True))

# Building the lightgbm model

In [None]:
# Train a LightGBM classifier with default hyperparameters on the training split.
# (`lgb` is imported in the notebook's top import cell.)
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out 20% split (X_test from train_test_split).
y_pred=clf.predict(X_test)

In [None]:
# Hold-out accuracy. scikit-learn's signature is accuracy_score(y_true, y_pred);
# compute it once and reuse the stored value in the report line.
# (`accuracy_score` is already imported in the notebook's top import cell.)
accuracy = accuracy_score(y_test, y_pred)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy))

In [None]:
# Predict on the encoded test-set rows (test_data_dum) for the submission file.
predictions = clf.predict(test_data_dum)

In [None]:
# Cast the predictions to bool so the 'Transported' column of the submission holds True/False.
predictions = predictions.astype(dtype=bool)

In [None]:
# PassengerId was dropped from test_data earlier, so it is read from the CSV
# again and paired row-by-row with the predictions.
passenger_ids = pd.read_csv("test.csv").PassengerId
output = pd.DataFrame({'PassengerId': passenger_ids, 'Transported': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")