In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler


In [2]:
df = pd.read_csv("train.csv")

In [3]:
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [5]:
class prep():
    
    def fit(self, df):
        df[["Deck", "Number", "Side"]] = df["Cabin"].str.split("/", expand = True)       
        df[["Passenger", "Group"]] = df["PassengerId"].str.split("_", expand = True)                
        
        df["Side"] = df["Side"].fillna(df.groupby("Group")["Side"].transform(lambda x: x.mode()[0]))
        df["Deck"] = df["Deck"].fillna(df.groupby("Group")["Deck"].transform(lambda x: x.mode()[0]))
        df[["CryoSleep", "HomePlanet", "VIP"]] = df[["CryoSleep", "HomePlanet", "VIP"]].fillna(df.mode().iloc[0])
        df[["CryoSleep", "VIP"]] = df[["CryoSleep", "VIP"]].astype(int)

        
        one_hot = pd.get_dummies(df["HomePlanet"], prefix = "is_")
        df_ = pd.concat([df, one_hot], axis = 1)
        df = df_.drop("HomePlanet", axis = 1)
        
        one_hot_Deck = pd.get_dummies(df["Deck"], prefix = "Deck_")
        df_ = pd.concat([df, one_hot_Deck], axis = 1)
        df = df_.drop("Deck", axis = 1)
        
        num_cols = df.select_dtypes(include =["int64", "float64"]).columns.tolist()
        df[num_cols] = df[num_cols].fillna(df[num_cols].median())
        
        
        df["Side"] = (df["Side"] == 'P').astype(int)
        df.drop(["PassengerId", "Cabin", "Name", "Destination", "Number","Group","Passenger"], axis = 1, inplace = True)
        
        
        return df
    
    
    
    

# Logistic Regression

In [6]:
class LogisticReg(BaseEstimator, ClassifierMixin):
    
    def __init__(self, alpha = 0.01, learning_rate = 1000):
        self.alpha = alpha
        self.learning_rate = learning_rate
        self.theta = None
        
    def fit(self, X, y):
        
        self.cost_list = []
        self.lr_list = []
        m = X.shape[0]
        X = np.c_[X, np.ones(X.shape[0])]
        self.theta = np.zeros(X.shape[1])
        
        for i in range(self.learning_rate):
        
            z = np.dot(X, self.theta)
            
            h_x = 1 / (1 + np.exp(-z))
            
            epsilon = 1e-10  # Small constant to prevent log(0)
            h_x = np.clip(h_x, epsilon, 1 - epsilon)


            cost = (-1/m)*np.sum(( y* np.log(h_x)) + ((1-y)*np.log(1-h_x)))

            grad = (1/m)*np.dot(X.T, (h_x - y))

            self.theta -= self.alpha* grad
            
            self.cost_list.append(cost)
            self.lr_list.append(i)
            
        return self 
            
        
    def predict(self, X):
            
        X = np.c_[X, np.ones(X.shape[0])]
        z = np.dot(X, self.theta)
        y_pred = 1/(1+np.exp(-z))
            
        return (y_pred >= 0.5)
        
        
    def get_cost_list(self):
        return self.cost_list
        
    def get_lr_list(self):
        return self.lr_list
        
        
    

In [7]:
def GridSearch(pipeline, X, y, param_grid, cv = 5):
    best_score = -np.inf
    results = []
    best_params = None
    
    kf = KFold(n_splits = cv, shuffle = True, random_state = 42)
    
    for alpha in param_grid["alpha"]:
        for learning_rate in param_grid["learning_rate"]:
            scores = []
            
            for train_index, val_index in kf.split(X):
                
                X_train, X_val = X.iloc[train_index], X.iloc[val_index]
                y_train, y_val = y.iloc[train_index], y.iloc[val_index]
                
                
            
                pipeline.set_params(

                    model__alpha = alpha,
                    model__learning_rate = learning_rate               
                )

                pipeline.fit(X_train, y_train)

                y_pred = pipeline.predict(X_val)
                scores.append(np.mean(y_pred == y_val))

            mean_score = np.mean(scores)
            results.append((alpha, learning_rate, mean_score))

            if mean_score > best_score:
                best_score = mean_score
                best_params = {
                    "alpha" : alpha,
                    "learning_rate" : learning_rate
                }
                
    return best_score, results, best_params
            
    
    
    
    

In [9]:
param_grid = {
    "alpha": np.arange(0.001, 0.1, 0.05).tolist(),  # Learning rate step size
    "learning_rate": np.arange(500, 6000, 500).tolist()   # Number of iterations
}

In [10]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ("model", LogisticReg(alpha = 0.01, learning_rate = 1000))
])

In [11]:
Prep = prep()
df = Prep.fit(df)

In [12]:
X = df.drop("Transported", axis = 1)
y = df["Transported"]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size = 0.8, random_state = 42)

In [14]:
pipeline.fit(X_train, y_train)

In [15]:
Accuracy, results, best_params = GridSearch(pipeline, X, y, param_grid, cv = 5)

In [16]:
best_params

{'alpha': 0.051000000000000004, 'learning_rate': 4500}

In [17]:
results

[(0.001, 500, 0.7499153316820971),
 (0.001, 1000, 0.7522160335788131),
 (0.001, 1500, 0.7535964017784648),
 (0.001, 2000, 0.7550917786037636),
 (0.001, 2500, 0.7557817641846729),
 (0.001, 3000, 0.7567022302276813),
 (0.001, 3500, 0.7587729810460756),
 (0.001, 4000, 0.7601534154186995),
 (0.001, 4500, 0.7615335850994349),
 (0.001, 5000, 0.7619938842939112),
 (0.001, 5500, 0.7630294251355388),
 (0.051000000000000004, 500, 0.7795940420502768),
 (0.051000000000000004, 1000, 0.7856908888419796),
 (0.051000000000000004, 1500, 0.7871860671483618),
 (0.051000000000000004, 2000, 0.7874163490915442),
 (0.051000000000000004, 2500, 0.787416547610461),
 (0.051000000000000004, 3000, 0.7883368151345528),
 (0.051000000000000004, 3500, 0.7896021085355855),
 (0.051000000000000004, 4000, 0.7913275687851502),
 (0.051000000000000004, 4500, 0.7919029427782458),
 (0.051000000000000004, 5000, 0.7913278334770389),
 (0.051000000000000004, 5500, 0.7919029427782458)]

In [17]:
pipeline.set_params(model__alpha = best_params["n_estimators"], model__learning_rate = best_params["iterations"])

pipeline.fit(X_train, y_train)

KeyError: 'n_estimators'

In [None]:
y_pred_test = pipeline.predict(X_test)

In [None]:
accuracy_train = np.mean(y_pred_test == y_test)
accuracy_train

In [None]:

df_test = pd.read_csv("test.csv")

In [None]:
df_test = Prep.fit(df_test)

In [None]:
df_test

In [None]:
y_final_test = pipeline.predict(df_test)

In [None]:
y_final_test

In [None]:
test_df = pd.read_csv("test.csv")

In [None]:
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': np.where(y_final_test, 'True', 'False')
})
submission.to_csv('submission2.csv', index=False)

In [None]:
model = pipeline.named_steps['model']    
import matplotlib.pyplot as plt    
plt.plot(model.get_lr_list(), model.get_cost_list())     
plt.xlabel("Iterations")    
plt.ylabel("Cost")   
plt.title("Cost vs Iterations")   
plt.show()     