In [40]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, ClassifierMixin

In [2]:
# Load the training data; "train.csv" is a relative path, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv("train.csv")

In [3]:
# Display the freshly loaded frame.
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [4]:
# Count the missing values in every column.
df.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [5]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [6]:
# List the column labels.
df.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

# Fill missing values in Cabin

In [7]:
# Frequency of each distinct Cabin string.
df["Cabin"].value_counts()

G/734/S     8
G/109/P     7
B/201/P     7
G/1368/P    7
G/981/S     7
           ..
G/556/P     1
E/231/S     1
G/545/S     1
G/543/S     1
F/947/P     1
Name: Cabin, Length: 6560, dtype: int64

In [8]:
# Break the "deck/number/side" Cabin string into its three "/"-separated parts
# and store them as the new columns Deck, Number and Side.
cabin_parts = df["Cabin"].str.split("/", expand=True)
df[["Deck", "Number", "Side"]] = cabin_parts

In [9]:
# Split PassengerId on "_": the part before the underscore is stored as
# "Passenger", the part after it as "Group".
# NOTE(review): in the preview above, ids 0003_01 and 0003_02 share a cabin and
# a surname, so the prefix looks like the travelling-group id and the suffix
# like the member index - the two column names may be swapped. Confirm before
# relying on "Group" as a group key.
id_parts = df["PassengerId"].str.split("_", expand=True)
df[["Passenger", "Group"]] = id_parts

In [10]:
# Frequency of each Deck letter extracted from Cabin.
df["Deck"].value_counts()

F    2794
G    2559
E     876
B     779
C     747
D     478
A     256
T       5
Name: Deck, dtype: int64

In [11]:
# Number of rows whose Deck is missing.
df["Deck"].isnull().sum()

199

In [12]:
# Frequency of each Side value extracted from Cabin.
df["Side"].value_counts()

S    4288
P    4206
Name: Side, dtype: int64

In [13]:
# Number of rows whose Side is missing.
df["Side"].isnull().sum()    

199

In [14]:
# Non-null Deck count for each "Group" value, i.e. for each PassengerId suffix
# (the part after "_").
df.groupby("Group")["Deck"].count()

Group
01    6083
02    1377
03     551
04     225
05     127
06      75
07      43
08      13
Name: Deck, dtype: int64

In [15]:
# Fill missing Deck values with the most frequent Deck among rows that share
# the same "Group" value ("Group" is the PassengerId suffix created above).
# Series.mode() is empty when a group has no non-null Deck, so guard the lookup
# and leave those rows missing instead of raising.
deck_mode_by_group = df.groupby("Group")["Deck"].transform(
    lambda decks: decks.mode().iloc[0] if decks.notna().any() else np.nan
)
df["Deck"] = df["Deck"].fillna(deck_mode_by_group)

In [16]:
# Re-check the Deck frequencies after the fill.
df["Deck"].value_counts()

F    2963
G    2589
E     876
B     779
C     747
D     478
A     256
T       5
Name: Deck, dtype: int64

In [17]:
# Missing values per column, now including the derived columns.
df.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
Deck              0
Number          199
Side            199
Passenger         0
Group             0
dtype: int64

In [18]:
# Remove the raw identifier/text columns and the cabin number; the pieces
# extracted from PassengerId and Cabin (Passenger, Group, Deck, Side) stay.
# Re-running this cell on the already-trimmed frame raises KeyError.
df = df.drop(columns=["PassengerId", "Cabin", "Name", "Destination", "Number"])

In [19]:
# Display the frame after the column drop.
df

Unnamed: 0,HomePlanet,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Side,Passenger,Group
0,Europa,False,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,P,0001,01
1,Earth,False,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,S,0002,01
2,Europa,False,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,S,0003,01
3,Europa,False,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,S,0003,02
4,Earth,False,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,S,0004,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,A,P,9276,01
8689,Earth,True,18.0,False,0.0,0.0,0.0,0.0,0.0,False,G,S,9278,01
8690,Earth,False,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,G,S,9279,01
8691,Europa,False,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,E,S,9280,01


In [20]:
# Missing values per remaining column.
df.isnull().sum()

HomePlanet      201
CryoSleep       217
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
Deck              0
Side            199
Passenger         0
Group             0
dtype: int64

In [21]:
# Frequency of each HomePlanet value.
df["HomePlanet"].value_counts()

Earth     4602
Europa    2131
Mars      1759
Name: HomePlanet, dtype: int64

In [22]:
# Number of non-null FoodCourt entries within each Deck.
df.groupby("Deck")["FoodCourt"].count() 

Deck
A     254
B     766
C     724
D     469
E     862
F    2893
G    2537
T       5
Name: FoodCourt, dtype: int64

In [23]:
# Fill missing Side values with the most frequent Side among rows that share
# the same "Group" value ("Group" is the PassengerId suffix created above).
# Series.mode() is empty when a group has no non-null Side, so guard the lookup
# and leave those rows missing instead of raising.
side_mode_by_group = df.groupby("Group")["Side"].transform(
    lambda sides: sides.mode().iloc[0] if sides.notna().any() else np.nan
)
df["Side"] = df["Side"].fillna(side_mode_by_group)

In [24]:
# Re-check the Side frequencies after the fill.
df["Side"].value_counts()

S    4353
P    4340
Name: Side, dtype: int64

In [25]:
# Missing values per column after filling Side.
df.isnull().sum()

HomePlanet      201
CryoSleep       217
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
Deck              0
Side              0
Passenger         0
Group             0
dtype: int64

In [26]:
# Replace missing values in these columns with each column's most frequent
# value. The mode is computed on just these four columns rather than on the
# whole frame, which avoids tallying every unrelated column.
# iloc[0] picks the first mode when a column has ties.
mode_fill_cols = ["CryoSleep", "HomePlanet", "VIP", "Transported"]
df[mode_fill_cols] = df[mode_fill_cols].fillna(df[mode_fill_cols].mode().iloc[0])


In [27]:
# Convert the three True/False columns to integer 0/1.
flag_cols = ["CryoSleep", "VIP", "Transported"]
df[flag_cols] = df[flag_cols].astype(int)

In [28]:
# One-hot encode HomePlanet, append the indicator columns on the right of the
# frame, then remove the original text column.
# The prefix "is_" is joined to each value with get_dummies' default "_"
# separator, which is why the resulting names contain a double underscore.
one_hot = pd.get_dummies(df["HomePlanet"], prefix="is_")
df_ = df.join(one_hot)
df = df_.drop(columns="HomePlanet")

In [29]:
# Missing values per column after encoding HomePlanet.
df.isnull().sum()

CryoSleep         0
Age             179
VIP               0
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
Deck              0
Side              0
Passenger         0
Group             0
is__Earth         0
is__Europa        0
is__Mars          0
dtype: int64

In [30]:
# Same missing-value check as the preceding cell; nothing changed in between.
df.isnull().sum()

CryoSleep         0
Age             179
VIP               0
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
Deck              0
Side              0
Passenger         0
Group             0
is__Earth         0
is__Europa        0
is__Mars          0
dtype: int64

In [31]:
# Dtypes and non-null counts after the fills and the HomePlanet encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   CryoSleep     8693 non-null   int64  
 1   Age           8514 non-null   float64
 2   VIP           8693 non-null   int64  
 3   RoomService   8512 non-null   float64
 4   FoodCourt     8510 non-null   float64
 5   ShoppingMall  8485 non-null   float64
 6   Spa           8510 non-null   float64
 7   VRDeck        8505 non-null   float64
 8   Transported   8693 non-null   int64  
 9   Deck          8693 non-null   object 
 10  Side          8693 non-null   object 
 11  Passenger     8693 non-null   object 
 12  Group         8693 non-null   object 
 13  is__Earth     8693 non-null   uint8  
 14  is__Europa    8693 non-null   uint8  
 15  is__Mars      8693 non-null   uint8  
dtypes: float64(6), int64(3), object(4), uint8(3)
memory usage: 908.5+ KB


In [32]:
# Names of the int64/float64 columns. Per the info() listing above this takes
# in the 0/1 flags and the Transported target, but not the uint8 indicators.
num_cols = list(df.select_dtypes(include=["int64", "float64"]).columns)

In [33]:
# Impute the remaining gaps in the numeric columns with each column's median.
column_medians = df[num_cols].median()
df[num_cols] = df[num_cols].fillna(column_medians)

In [34]:
# Confirm that no missing values remain.
df.isnull().sum()

CryoSleep       0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
Deck            0
Side            0
Passenger       0
Group           0
is__Earth       0
is__Europa      0
is__Mars        0
dtype: int64

In [35]:
# One-hot encode Deck, append the indicator columns on the right of the frame,
# then remove the original letter column.
# The prefix "Deck_" is joined to each value with get_dummies' default "_"
# separator, which is why the resulting names contain a double underscore.
one_hot_Deck = pd.get_dummies(df["Deck"], prefix="Deck_")
df_ = df.join(one_hot_Deck)
df = df_.drop(columns="Deck")




In [36]:
# Display the frame after all fills and encodings.
df

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Side,...,is__Europa,is__Mars,Deck__A,Deck__B,Deck__C,Deck__D,Deck__E,Deck__F,Deck__G,Deck__T
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,P,...,1,0,0,1,0,0,0,0,0,0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,S,...,0,0,0,0,0,0,0,1,0,0
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,S,...,1,0,1,0,0,0,0,0,0,0
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,S,...,1,0,1,0,0,0,0,0,0,0
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,S,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0,41.0,1,0.0,6819.0,0.0,1643.0,74.0,0,P,...,1,0,1,0,0,0,0,0,0,0
8689,1,18.0,0,0.0,0.0,0.0,0.0,0.0,0,S,...,0,0,0,0,0,0,0,0,1,0
8690,0,26.0,0,0.0,0.0,1872.0,1.0,0.0,1,S,...,0,0,0,0,0,0,0,0,1,0
8691,0,32.0,0,0.0,1049.0,0.0,353.0,3235.0,0,S,...,1,0,0,0,0,0,1,0,0,0


# Logistic Regression

In [41]:
class LogisticReg(BaseEstimator, ClassifierMixin):
    """Binary logistic regression fitted with batch gradient descent.

    Parameters
    ----------
    alpha : float
        Step size that scales the gradient in every update.
    learning_rate : int
        Number of gradient-descent iterations to run. Despite its name this
        is the loop count, not the step size; the name is kept so existing
        callers keep working.

    Attributes
    ----------
    theta : numpy.ndarray or None
        Learned weights with the bias term stored last; ``None`` until
        ``fit`` has been called.
    cost_list : list of float
        Cross-entropy cost recorded at every iteration of the last ``fit``.
    lr_list : list of int
        Iteration index that goes with each entry of ``cost_list``.
    """

    # Bounds that keep np.exp and np.log finite on extreme inputs.
    _MAX_ABS_Z = 500.0
    _EPS = 1e-15

    def __init__(self, alpha, learning_rate):
        self.alpha = alpha
        self.learning_rate = learning_rate
        self.theta = None

    @staticmethod
    def _with_bias(X):
        """Return ``X`` as a float array with a trailing column of ones."""
        X = np.asarray(X, dtype=float)
        return np.c_[X, np.ones(X.shape[0])]

    @classmethod
    def _sigmoid(cls, z):
        """Logistic function; ``z`` is clipped first so exp() cannot overflow."""
        z = np.clip(z, -cls._MAX_ABS_Z, cls._MAX_ABS_Z)
        return 1.0 / (1.0 + np.exp(-z))

    def fit(self, X, y):
        """Learn ``theta`` by minimising the mean cross-entropy.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric feature matrix.
        y : array-like of shape (n_samples,)
            Binary targets encoded as 0/1.

        Returns
        -------
        self : LogisticReg
            The fitted estimator.

        Raises
        ------
        ValueError
            If ``X`` has no rows or ``X`` and ``y`` disagree on the number
            of samples.
        """
        X = self._with_bias(X)
        y = np.asarray(y, dtype=float).ravel()
        m = X.shape[0]
        if m == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != m:
            raise ValueError(
                "X and y have inconsistent numbers of samples: "
                f"{m} != {y.shape[0]}"
            )

        self.cost_list = []
        self.lr_list = []
        self.theta = np.zeros(X.shape[1])

        for i in range(self.learning_rate):
            h_x = self._sigmoid(np.dot(X, self.theta))

            # Keep probabilities strictly inside (0, 1) so log() stays finite.
            h_safe = np.clip(h_x, self._EPS, 1.0 - self._EPS)
            cost = (-1 / m) * np.sum(
                y * np.log(h_safe) + (1 - y) * np.log(1 - h_safe)
            )

            grad = (1 / m) * np.dot(X.T, h_x - y)
            self.theta -= self.alpha * grad

            self.cost_list.append(cost)
            self.lr_list.append(i)

        # Return only after every iteration has run.
        return self

    def predict(self, X):
        """Predict 0/1 labels by thresholding the sigmoid output at 0.5.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric feature matrix with the same columns used in ``fit``.

        Returns
        -------
        numpy.ndarray of int, shape (n_samples,)
            1 where the predicted probability is at least 0.5, else 0.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.theta is None:
            raise RuntimeError("LogisticReg must be fitted before calling predict")

        y_pred = self._sigmoid(np.dot(self._with_bias(X), self.theta))
        return (y_pred >= 0.5).astype(int)
        
        
    