In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression


In [2]:
data = pd.read_csv('pima-indians-diabetes.csv')
df = pd.DataFrame(data=data.values, columns=['Pregnancies', 'Glucose', 'Blood_Pressure', 'Skin_Thickness', 'Insulin', 'Bmi', 'Diabetes_Pedigree', 'Age', 'Prediction'])
df.head()


Unnamed: 0,Pregnancies,Glucose,Blood_Pressure,Skin_Thickness,Insulin,Bmi,Diabetes_Pedigree,Age,Prediction
0,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,0.0
1,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,1.0
2,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
3,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1.0
4,5.0,116.0,74.0,0.0,0.0,25.6,0.201,30.0,0.0


In [3]:
def clean_outliers(col) :
    
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    
    IqR = Q3 - Q1 
    
    lower_bound = Q1 - 1.5 * IqR 
    upper_bound = Q3 + 1.5 * IqR
    
    return df[col].clip(lower=lower_bound, upper=upper_bound)

x = df.columns

for i in x :
    df[i] = clean_outliers(i)
     

In [53]:
df.shape

(767, 9)

In [4]:
y = df['Prediction'].values.reshape(-1, 1)

X = df.drop(columns='Prediction')
X = np.hstack((X, np.ones((X.shape[0],1))))
X

array([[1.00e+00, 8.50e+01, 6.60e+01, ..., 3.51e-01, 3.10e+01, 1.00e+00],
       [8.00e+00, 1.83e+02, 6.40e+01, ..., 6.72e-01, 3.20e+01, 1.00e+00],
       [1.00e+00, 8.90e+01, 6.60e+01, ..., 1.67e-01, 2.10e+01, 1.00e+00],
       ...,
       [5.00e+00, 1.21e+02, 7.20e+01, ..., 2.45e-01, 3.00e+01, 1.00e+00],
       [1.00e+00, 1.26e+02, 6.00e+01, ..., 3.49e-01, 4.70e+01, 1.00e+00],
       [1.00e+00, 9.30e+01, 7.00e+01, ..., 3.15e-01, 2.30e+01, 1.00e+00]],
      shape=(767, 9))

In [5]:
sc = StandardScaler()
X = sc.fit_transform(X)

In [6]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
def segmoind(z) :
    return (1 / (1 + np.exp(-z)))


def model(x, theta) :
    fc = x.dot(theta)
    return segmoind(fc)


def fonction_obj(X,y, theta):
    m = len(y)
    y_pred = model(X,theta)
    cost = -(1/m)*np.sum(y*np.log(y_pred)+(1-y)*np.log(1-y_pred))
    return cost

def gradient_opt(X,y, theta, lr=0.01, iter=20000):
    m = len(y)
    cost_list = []
    for i in range(iter):
        y_pred = model(X, theta)
        gradient = (1/m) * X.T.dot(y_pred - y)
        cost = fonction_obj(X,y, theta)
        theta -= lr * gradient
        cost_list.append(cost)
       
            
    return theta, cost_list


In [22]:
theta_initial = np.random.randn(x_train.shape[1],1)
theta_opti, cost_list = gradient_opt(X,y, theta_initial, lr=0.1, iter=25000)

In [23]:
ypredection = model(x_test, theta_opti)
y0 = ypredection[ypredection>=0.45]
len(y0)

80

In [24]:
y_pred = model(x_test,theta_opti)
y_pred1 = (y_pred>=0.45)
performance = accuracy_score(y_test,y_pred1)
print(f"Sccuracy score : {performance*100:.4f}")

Sccuracy score : 77.2727


In [25]:
mse = fonction_obj(X, y, theta_opti)
mse

np.float64(0.5248554008023796)

In [21]:
import pickle

V = {
    'theta_opti': theta_opti, 
    'sc': sc
}

# Enregistrer dans un fichier .pkl
file_name = 'classification 2.pkl'
with open(file_name, 'wb') as file:
    pickle.dump(V, file)
    

print("Objet enregistré avec succès !")

Objet enregistré avec succès !
