In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin

In [3]:
# Read the raw dataset from the ".csv.gz" file located in the working directory.
df = pd.read_csv(filepath_or_buffer="smoking_driking_dataset_Ver01.csv.gz")

In [4]:
# Show the loaded frame; the notebook renders a truncated preview (first/last rows, elided columns).
df

Unnamed: 0,sex,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,...,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,SMK_stat_type_cd,DRK_YN
0,Male,35,170,75,90.0,1.0,1.0,1.0,1.0,120.0,...,126.0,92.0,17.1,1.0,1.0,21.0,35.0,40.0,1.0,Y
1,Male,30,180,80,89.0,0.9,1.2,1.0,1.0,130.0,...,148.0,121.0,15.8,1.0,0.9,20.0,36.0,27.0,3.0,N
2,Male,40,165,75,91.0,1.2,1.5,1.0,1.0,120.0,...,74.0,104.0,15.8,1.0,0.9,47.0,32.0,68.0,1.0,N
3,Male,50,175,80,91.0,1.5,1.2,1.0,1.0,145.0,...,104.0,106.0,17.6,1.0,1.1,29.0,34.0,18.0,1.0,N
4,Male,50,165,60,80.0,1.0,1.2,1.0,1.0,138.0,...,117.0,104.0,13.8,1.0,0.8,19.0,12.0,25.0,1.0,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991341,Male,45,175,80,92.1,1.5,1.5,1.0,1.0,114.0,...,125.0,132.0,15.0,1.0,1.0,26.0,36.0,27.0,1.0,N
991342,Male,35,170,75,86.0,1.0,1.5,1.0,1.0,119.0,...,84.0,45.0,15.8,1.0,1.1,14.0,17.0,15.0,1.0,N
991343,Female,40,155,50,68.0,1.0,0.7,1.0,1.0,110.0,...,77.0,157.0,14.3,1.0,0.8,30.0,27.0,17.0,3.0,Y
991344,Male,25,175,60,72.0,1.5,1.0,1.0,1.0,119.0,...,73.0,53.0,14.5,1.0,0.8,21.0,14.0,17.0,1.0,N


# Data Preparation
## Check for Missing Values

In [5]:
# Per-column count of missing entries (isna is the same check as isnull).
df.isna().sum()

sex                 0
age                 0
height              0
weight              0
waistline           0
sight_left          0
sight_right         0
hear_left           0
hear_right          0
SBP                 0
DBP                 0
BLDS                0
tot_chole           0
HDL_chole           0
LDL_chole           0
triglyceride        0
hemoglobin          0
urine_protein       0
serum_creatinine    0
SGOT_AST            0
SGOT_ALT            0
gamma_GTP           0
SMK_stat_type_cd    0
DRK_YN              0
dtype: int64

# Data Processing

## Converting categorical columns into 0 and 1

In [7]:
# Peek at the first five rows to see which columns still hold string categories.
df.head(n=5)

Unnamed: 0,sex,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,...,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,SMK_stat_type_cd,DRK_YN
0,Male,35,170,75,90.0,1.0,1.0,1.0,1.0,120.0,...,126.0,92.0,17.1,1.0,1.0,21.0,35.0,40.0,1.0,Y
1,Male,30,180,80,89.0,0.9,1.2,1.0,1.0,130.0,...,148.0,121.0,15.8,1.0,0.9,20.0,36.0,27.0,3.0,N
2,Male,40,165,75,91.0,1.2,1.5,1.0,1.0,120.0,...,74.0,104.0,15.8,1.0,0.9,47.0,32.0,68.0,1.0,N
3,Male,50,175,80,91.0,1.5,1.2,1.0,1.0,145.0,...,104.0,106.0,17.6,1.0,1.1,29.0,34.0,18.0,1.0,N
4,Male,50,165,60,80.0,1.0,1.2,1.0,1.0,138.0,...,117.0,104.0,13.8,1.0,0.8,19.0,12.0,25.0,1.0,N


In [13]:
# Encode the two string-valued columns as 0/1 integers ("Male" -> 1, "Y" -> 1).
# The dtype guard makes this cell safe to re-run: without it, a second execution would
# compare the already-encoded integers against "Male"/"Y" and silently zero out both
# columns (including the target).
for column, positive_label in (("sex", "Male"), ("DRK_YN", "Y")):
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = (df[column] == positive_label).astype(int)

In [18]:
# Separate the drinking label (target) from the remaining predictor columns.
y = df["DRK_YN"]
X = df.drop(columns=["DRK_YN"])

In [21]:
# Shuffled 80/20 train/test split; the fixed random_state keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

In [85]:
class Logistic(BaseEstimator, ClassifierMixin):
    """Binary logistic regression trained with full-batch gradient descent.

    A column of ones is appended to the feature matrix, so the last entry of
    ``theta`` acts as the intercept. Labels are expected to be 0/1.

    Parameters
    ----------
    learning_rate : int, default=1000
        Number of gradient-descent iterations. Despite its name this is NOT the
        step size (that is ``alpha``); the name is kept so existing callers and
        ``get_params``/``set_params`` keep working.
    alpha : float, default=0.01
        Step size applied to the gradient on every iteration.

    Attributes set by ``fit``
    -------------------------
    theta : ndarray of shape (n_features + 1,)
        Learned weights; the final element multiplies the appended ones column.
    cost_list : list of float
        Cross-entropy cost measured at each iteration, before that iteration's update.
    ite_list : list of int
        Iteration indices matching ``cost_list``.
    """

    def __init__(self, learning_rate = 1000, alpha = 0.01):
        self.learning_rate = learning_rate
        self.alpha = alpha

    @staticmethod
    def _sigmoid(z):
        """Logistic function; z is clipped so np.exp cannot overflow for large |z|."""
        # exp(500) is still finite in float64, and sigmoid is already saturated there.
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    @staticmethod
    def _with_bias(X):
        """Return X as a float array with a trailing column of ones appended."""
        X = np.asarray(X, dtype=float)
        return np.c_[X, np.ones(X.shape[0])]

    def fit(self, X, y):
        """Fit ``theta`` by running ``learning_rate`` gradient-descent steps.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) holding 0/1 labels

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If X and y do not contain the same number of samples.
        """
        X = self._with_bias(X)
        # Flatten to 1-D so a column-vector y cannot broadcast (h_x - y) into an (n, n) matrix.
        y = np.asarray(y, dtype=float).ravel()
        m = X.shape[0]
        if y.shape[0] != m:
            raise ValueError(
                "X and y have inconsistent numbers of samples: %d != %d" % (m, y.shape[0])
            )

        self.cost_list = []
        self.ite_list = []
        self.theta = np.zeros(X.shape[1])
        epsilon = 1e-15  # keeps log() finite when a probability saturates at 0 or 1

        for i in range(self.learning_rate):
            h_x = self._sigmoid(np.dot(X, self.theta))

            # Mean binary cross-entropy at the current theta (recorded before the update).
            J_theta = (-1/m)*np.sum((y * np.log(h_x + epsilon)) + ((1-y)* np.log(1-h_x + epsilon)))

            grad_theta = (1/m)*np.dot(X.T, (h_x - y))
            self.theta -= self.alpha * grad_theta

            self.ite_list.append(i)
            self.cost_list.append(J_theta)

        return self

    def predict(self, X):
        """Predict 0/1 labels using a 0.5 probability threshold.

        NOTE: unlike the usual scikit-learn convention, this returns a tuple
        ``(labels, X_with_bias)`` where ``X_with_bias`` is the input with the ones
        column appended. The tuple is kept for backward compatibility with callers
        that unpack two values; helpers that expect a bare label array (for example
        the inherited ``score``) will not work with it.
        """
        X = self._with_bias(X)
        h_x = self._sigmoid(np.dot(X, self.theta))

        return ((h_x >= 0.5)).astype(int), X
              

In [86]:
# Standardise the features, then fit the custom gradient-descent classifier.
# Logistic's `learning_rate` is used as its iteration count and `alpha` as the step size.
pipeline = Pipeline(steps=[
    ("standardising", StandardScaler()),
    ("mode", Logistic(learning_rate=1000, alpha=0.01)),
])

In [87]:
# Fit the scaler and the classifier on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [88]:
# Logistic.predict returns a tuple: (0/1 labels, the matrix it scored with a ones column appended),
# so both values are unpacked here; re_X is the scaled training data plus that extra column.
Y_pred_train, re_X = pipeline.predict(X_train)

In [83]:
# Training accuracy: share of training rows whose predicted label equals the true label.
(Y_pred_train == y_train).mean()

0.7181758116498292