In [1]:
# Import needed packages
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt


from sklearn.preprocessing import LabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from statistics import mean, stdev

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, make_scorer


import warnings
warnings.filterwarnings('ignore')

In [2]:
train_data = pd.read_csv("train.csv")
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
test_data = pd.read_csv("test.csv")
test_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [4]:
# class distribution
# transported: False = 0, True = 1
train_data['Transported'].value_counts()

Transported
True     4378
False    4315
Name: count, dtype: int64

the train data is almost balanced

# Pre-Processing

In [5]:
# by default majority class (not transported (False)) will be negative
lb = LabelBinarizer()
train_data['Transported'] = lb.fit_transform(train_data['Transported'].values)


The .isnull() method in Python searches for both None and NaN values

In [6]:
print('\nNull Values in Training \n{}'.format(train_data.isnull().sum()))
print('\nNull Values in Testing \n{}'.format(test_data.isnull().sum()))
print('\nDuplicated values in train {}'.format(train_data.duplicated().sum()))
print('\nDuplicated values in test {}'.format(test_data.duplicated().sum()))



Null Values in Training 
PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

Null Values in Testing 
PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

Duplicated values in train 0

Duplicated values in test 0


we have null values everywhere:

# Dropping Unecessary columns

In [7]:
#Removing less important features
train_data.drop(['PassengerId','Name','Transported'], axis=1, inplace = True)
test_data.drop(['PassengerId','Name'], axis=1, inplace = True)

# Treatment of missing data and duplicates

In [8]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
dtypes: float64(6), object(5)
memory usage: 747.2+ KB


In [9]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    4190 non-null   object 
 1   CryoSleep     4184 non-null   object 
 2   Cabin         4177 non-null   object 
 3   Destination   4185 non-null   object 
 4   Age           4186 non-null   float64
 5   VIP           4184 non-null   object 
 6   RoomService   4195 non-null   float64
 7   FoodCourt     4171 non-null   float64
 8   ShoppingMall  4179 non-null   float64
 9   Spa           4176 non-null   float64
 10  VRDeck        4197 non-null   float64
dtypes: float64(6), object(5)
memory usage: 367.7+ KB


# Treatement of Numericals columns

In [10]:
train_data_numerics_columns = train_data.select_dtypes(exclude=['object','bool']).columns.tolist()
imputer = SimpleImputer(strategy='mean')
num_without_nulls_train_data = pd.DataFrame(imputer.fit_transform(train_data[train_data_numerics_columns]),columns=train_data_numerics_columns)
num_without_nulls_train_data.isnull().sum()

Age             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

In [11]:
train_data[train_data_numerics_columns] = num_without_nulls_train_data
train_data.isnull().sum()

HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age               0
VIP             203
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
dtype: int64

In [12]:
test_data_numerics_columns = test_data.select_dtypes(exclude=['object','bool']).columns.tolist()
imputer = SimpleImputer(strategy='mean')
num_without_nulls_test_data = pd.DataFrame(imputer.fit_transform(test_data[test_data_numerics_columns]),columns=test_data_numerics_columns)
num_without_nulls_test_data.isnull().sum()

Age             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

In [13]:
test_data[test_data_numerics_columns] = num_without_nulls_test_data
test_data.isnull().sum()

HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age               0
VIP              93
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
dtype: int64

In [14]:
# Calculate the mode value of the 'VIP' column
mode_vip= train_data['VIP'].mode().iloc[0]
mode_vip_test= test_data['VIP'].mode().iloc[0]
# Fill missing values in the 'VIP' column with the mode
train_data['VIP'].fillna(mode_vip, inplace=True)
test_data['VIP'].fillna(mode_vip_test, inplace=True)

In [15]:
train_data['VIP'] = train_data['VIP'].replace({True: 1, False: 0})
test_data['VIP'] = test_data['VIP'].replace({True: 1, False: 0})

In [16]:
train_data['Age'] = train_data['Age'].astype(int)
test_data['Age'] = test_data['Age'].astype(int)

In [17]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8693 non-null   int32  
 5   VIP           8693 non-null   int64  
 6   RoomService   8693 non-null   float64
 7   FoodCourt     8693 non-null   float64
 8   ShoppingMall  8693 non-null   float64
 9   Spa           8693 non-null   float64
 10  VRDeck        8693 non-null   float64
dtypes: float64(5), int32(1), int64(1), object(4)
memory usage: 713.2+ KB


Filling cabin missing values

In [18]:
imputer = SimpleImputer(strategy='most_frequent')
train_data['Cabin'] = imputer.fit_transform(train_data['Cabin'].values.reshape(-1,1))[:,0]
test_data['Cabin'] = imputer.fit_transform(test_data['Cabin'].values.reshape(-1,1))[:,0]

In [19]:
train_data.isnull().sum()

HomePlanet      201
CryoSleep       217
Cabin             0
Destination     182
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
dtype: int64

In [20]:
test_data.isnull().sum()

HomePlanet      87
CryoSleep       93
Cabin            0
Destination     92
Age              0
VIP              0
RoomService      0
FoodCourt        0
ShoppingMall     0
Spa              0
VRDeck           0
dtype: int64

In [21]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8693 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8693 non-null   int32  
 5   VIP           8693 non-null   int64  
 6   RoomService   8693 non-null   float64
 7   FoodCourt     8693 non-null   float64
 8   ShoppingMall  8693 non-null   float64
 9   Spa           8693 non-null   float64
 10  VRDeck        8693 non-null   float64
dtypes: float64(5), int32(1), int64(1), object(4)
memory usage: 713.2+ KB


In [22]:
train_data.isnull().sum()

HomePlanet      201
CryoSleep       217
Cabin             0
Destination     182
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
dtype: int64

# Treatement of Categorical Columns

In [23]:
from missforest.missforest import MissForest
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier


mf = MissForest()
train_data_imputed = mf.fit_transform(train_data,categorical=['HomePlanet','CryoSleep','Destination','Cabin'])
test_data_imputed = mf.fit_transform(test_data,categorical=['HomePlanet','CryoSleep','Destination','Cabin'])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000374 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1616
[LightGBM] [Info] Number of data points in the train set: 8103, number of used features: 10
[LightGBM] [Info] Start training from score -1.380265
[LightGBM] [Info] Start training from score -0.612905
[LightGBM] [Info] Start training from score -1.576421
[LightGBM] [Info] Number of positive: 2914, number of negative: 5189
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000327 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1617
[LightGBM] [Info] Number of data points in the train set: 8103, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.359620 -> inits

In [24]:
train_data_imputed.isnull().sum()

HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

In [25]:
test_data_imputed.isnull().sum()

HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

# Transforming Cabin to deck/num/side

In [26]:
# split the Cabin column into three columns
train_data_imputed[['Deck', 'Num','Side']] = train_data_imputed['Cabin'].str.split('/', expand=True)
train_data_imputed.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Num,Side
0,Europa,False,B/0/P,TRAPPIST-1e,39,0,0.0,0.0,0.0,0.0,0.0,B,0,P
1,Earth,False,F/0/S,TRAPPIST-1e,24,0,109.0,9.0,25.0,549.0,44.0,F,0,S
2,Europa,False,A/0/S,TRAPPIST-1e,58,1,43.0,3576.0,0.0,6715.0,49.0,A,0,S
3,Europa,False,A/0/S,TRAPPIST-1e,33,0,0.0,1283.0,371.0,3329.0,193.0,A,0,S
4,Earth,False,F/1/S,TRAPPIST-1e,16,0,303.0,70.0,151.0,565.0,2.0,F,1,S


In [27]:
train_data_imputed.drop(['Cabin'],axis=1,inplace=True)

In [28]:
# split the Cabin column into three columns
test_data_imputed[['Deck', 'Num','Side']] = test_data_imputed['Cabin'].str.split('/', expand=True)
test_data_imputed.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Num,Side
0,Earth,True,G/3/S,TRAPPIST-1e,27,0,0.0,0.0,0.0,0.0,0.0,G,3,S
1,Earth,False,F/4/S,TRAPPIST-1e,19,0,0.0,9.0,0.0,2823.0,0.0,F,4,S
2,Europa,True,C/0/S,55 Cancri e,31,0,0.0,0.0,0.0,0.0,0.0,C,0,S
3,Europa,False,C/1/S,TRAPPIST-1e,38,0,0.0,6652.0,0.0,181.0,585.0,C,1,S
4,Earth,False,F/5/S,TRAPPIST-1e,20,0,10.0,0.0,635.0,0.0,0.0,F,5,S


In [29]:
test_data_imputed.drop(['Cabin'],axis=1,inplace=True)

Transforming VIP to int

In [30]:
train_data_imputed['VIP'] = train_data_imputed['VIP'].replace({True: 1, False: 0})
test_data_imputed['VIP'] = test_data_imputed['VIP'].replace({True: 1, False: 0})

#  Solving mismatch in train and test set after categorical encoding

In [31]:
train_data_imputed['train']=1
test_data_imputed['train']=0

In [32]:
combined = pd.concat([train_data_imputed,test_data_imputed])

In [33]:
combined = pd.get_dummies(combined)

In [34]:
combined.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,train,HomePlanet_Earth,...,Num_992,Num_993,Num_994,Num_995,Num_996,Num_997,Num_998,Num_999,Side_P,Side_S
0,False,39,0,0.0,0.0,0.0,0.0,0.0,1,False,...,False,False,False,False,False,False,False,False,True,False
1,False,24,0,109.0,9.0,25.0,549.0,44.0,1,True,...,False,False,False,False,False,False,False,False,False,True
2,False,58,1,43.0,3576.0,0.0,6715.0,49.0,1,False,...,False,False,False,False,False,False,False,False,False,True
3,False,33,0,0.0,1283.0,371.0,3329.0,193.0,1,False,...,False,False,False,False,False,False,False,False,False,True
4,False,16,0,303.0,70.0,151.0,565.0,2.0,1,True,...,False,False,False,False,False,False,False,False,False,True


In [35]:
train_data_dum = combined[combined['train'] == 1]
test_data_dum = combined[combined['train'] == 0]
train_data_dum.drop(['train'],axis=1,inplace=True)
test_data_dum.drop(['train'],axis=1,inplace=True)

# Stratified K-Fold Cross-Validation

In [36]:
X = train_data_dum
y = pd.read_csv('train.csv')['Transported']
X = np.asarray(X)
y = np.asarray(y)

In [38]:


# Define hyperparameters for LightGBM
params = {
    'objective': 'binary',  # specifies that it’s a binary classification task.
    'metric': 'auc',  # sets the evaluation metric to binary log loss.
    'boosting_type': 'gbdt',  #specifies the gradient boosting algorithm.
    "'num_iterations':10000,"  #the same as n_estimators
    'num_leaves': 31,         #sets the maximum number of leaves in each tree.
    'learning_rate': 0.05,    #determines the step size in gradient descent.
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1   #controls the fraction of features used for each tree.
}

# Number of folds for stratified cross-validation
num_folds = 5
 
# Initialize StratifiedKFold
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
 
# Initialize an empty list to store cross-validation scores
cv_scores = []
 
# Perform stratified k-fold cross-validation
for train_index, val_index in skf.split(X, y):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train)
    
    predictions = model.predict(X_val)
    accuracy = accuracy_score(y_val, predictions) 
    cv_scores.append(accuracy)


# Print the output.
print('List of possible accuracy:', cv_scores)
print('\nMaximum Accuracy That can be obtained from this model is:',
      max(cv_scores)*100, '%')
print('\nMinimum Accuracy:',
      min(cv_scores)*100, '%')
print('\nOverall Accuracy:',
      mean(cv_scores)*100, '%')
print('\nStandard Deviation is:', stdev(cv_scores))


[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000347 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1391
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] [Info] Start training from score 0.014380
[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000341 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1393
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 26
[LightGBM] [Info] [binary:

In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris

# Load the Iris dataset
data = load_iris()
X = data.data
y = data.target

# Define hyperparameters for LightGBM
params = {
	'objective': 'multiclass', # For multi-class classification
	'metric': 'multi_logloss', # Logarithmic loss for multiclass
	'boosting_type': 'gbdt',
	'num_class': 3, # Number of classes in Iris dataset
	'num_leaves': 31,
	'learning_rate': 0.05,
	'feature_fraction': 0.9
}

# Number of folds for stratified cross-validation
num_folds = 5

# Initialize StratifiedKFold
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

# Initialize an empty list to store cross-validation scores
cv_scores = []

# Perform stratified k-fold cross-validation
for train_index, val_index in skf.split(X, y):
	X_train, X_val = X[train_index], X[val_index]
	y_train, y_val = y[train_index], y[val_index]
	
	train_data = lgb.Dataset(X_train, label=y_train)
	val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
	
	# Train LightGBM model with early stopping
	model = lgb.train(params, train_data, num_boost_round=1000, valid_sets=[val_data])
	
	# Make predictions on the validation set
	val_pred = model.predict(X_val, num_iteration=model.best_iteration)
	
	print(val_pred)

# Calculate the mean and standard deviation of accuracy across folds
mean_accuracy = np.mean(cv_scores)
std_accuracy = np.std(cv_scores)

print(f'Mean Accuracy: {mean_accuracy:.4f}')
print(f'Std Accuracy: {std_accuracy:.4f}')


In [None]:
targets = pd.read_csv('train.csv')['Transported']
X_train, X_test, y_train, y_test = train_test_split(train_data_dum, targets,random_state=42,test_size=0.2,stratify=targets)

train_test_split with stratify=True results in consistent class distribution betwen training and test sets.

In [None]:
print('y_train class distribution')
print(y_train.value_counts(normalize=True))

print('y_test class distribution')
print(y_test.value_counts(normalize=True))

# Building the lightgbm model

In [None]:
import lightgbm as lgb
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)

In [None]:
# predict the results
y_pred=clf.predict(X_test)

In [None]:
# view accuracy
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_pred, y_test)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

In [None]:
predictions = clf.predict(test_data_dum)

In [None]:
predictions = predictions.astype(dtype=bool)

In [None]:
output = pd.DataFrame({'PassengerId': pd.read_csv("test.csv").PassengerId, 'Transported': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")