In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pt

In [2]:
# Load the diabetes dataset from a CSV file in the current working directory.
diabetes_data = pd.read_csv('diabetes.csv')

In [3]:
# Preview the first rows to check the column names and value formats.
diabetes_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# (rows, columns) of the loaded frame.
diabetes_data.shape

(768, 9)

In [5]:
# Column dtypes and non-null counts.
diabetes_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [6]:
# Count missing (NaN) values per column.
# NOTE(review): zeros in columns such as Insulin or SkinThickness may be
# placeholders for missing measurements; this check does not detect those.
diabetes_data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
# Feature matrix: every column of the dataset except the 'Outcome' label.
x = diabetes_data.drop(labels='Outcome', axis='columns')
print(x)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [8]:
# Target vector: the 'Outcome' label column (0/1 values).
y = diabetes_data.loc[:, 'Outcome']
print(y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


In [9]:
from sklearn.model_selection import train_test_split

# Split features and labels into training and test sets; random_state pins
# the shuffle so the same rows land in each set on every run.
# NOTE(review): test_size=0.03 holds out only 24 of 768 rows, so accuracy
# measured on this split is a very noisy estimate. Confirm this is not a
# typo for 0.3, and consider stratify=y to preserve the class balance.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.03,
    random_state=0,
)

In [10]:
# (rows, feature columns) of the training features.
x_train.shape

(744, 8)

In [11]:
# (rows, feature columns) of the held-out test features.
x_test.shape

(24, 8)

In [12]:
# Number of training labels; should match the row count of x_train.
y_train.shape

(744,)

In [13]:
# Number of test labels; should match the row count of x_test.
y_test.shape

(24,)

In [14]:
# Display the held-out test features (the index keeps the original row ids).
x_test

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
661,1,199,76,43,0,42.9,1.394,22
122,2,107,74,30,100,33.6,0.404,23
113,4,76,62,0,0,34.0,0.391,25
14,5,166,72,19,175,25.8,0.587,51
529,0,111,65,0,0,24.6,0.66,31
103,1,81,72,18,40,26.6,0.283,24
338,9,152,78,34,171,34.2,0.893,33
588,3,176,86,27,156,33.3,1.154,52
395,2,127,58,24,275,27.7,1.6,25
204,6,103,72,32,190,37.7,0.324,55


In [15]:
# Display the true labels for the held-out test rows.
y_test

661    1
122    0
113    0
14     1
529    0
103    0
338    1
588    1
395    0
204    0
31     1
546    1
278    0
593    0
737    0
202    0
175    1
55     0
479    0
365    0
417    1
577    1
172    0
352    0
Name: Outcome, dtype: int64

In [16]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with library-default hyperparameters; random_state is fixed
# so that repeated fits on the same data build the same forest.
classifier = RandomForestClassifier(random_state=42)

In [21]:
# Train the random forest on the training split.
classifier.fit(x_train, y_train)

# Predict labels for the held-out test split.
y_pred = classifier.predict(x_test)

# Accuracy = fraction of test rows whose predicted label equals the true one.
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Run parameters for experiment tracking: the model family plus the
# estimator's actual hyperparameters (n_estimators, max_depth, random_state,
# ...). Accuracy is a result, not an input, so it is deliberately kept out of
# this dict and should be recorded as a metric instead.
params = {
    "model": "RandomForestClassifier",
    **classifier.get_params(),
}


Accuracy: 0.96


In [31]:
import mlflow

In [32]:
# Point the MLflow client at a tracking server on this machine (port 5000).
# The server must already be running for the logging calls below to succeed.
mlflow.set_tracking_uri('http://localhost:5000')
from mlflow.models import infer_signature

In [33]:
# Select the experiment that this run is recorded under (MLflow creates it
# if it does not exist yet).
mlflow.set_experiment("Diabetes Prediction")

# Start an MLflow run; the context manager ends the run on exit.
with mlflow.start_run(run_name="Random Forest Classifier"):
    # Log the run parameters
    mlflow.log_params(params)

    # Log the test-set accuracy as a metric
    mlflow.log_metric("accuracy", accuracy)

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Diabetes Prediction RFC", "Basic RFC model for diabetes prediction")

    # Infer the model signature (input column schema -> prediction schema)
    signature = infer_signature(x_train, classifier.predict(x_train))

    # Log the model and register it as a new version of "Diabetes-Prediction".
    # The input example is a handful of rows only: passing the whole training
    # frame would store every training record inside the model artifact.
    model_info = mlflow.sklearn.log_model(
        sk_model=classifier,
        artifact_path="diabetes_model",
        signature=signature,
        input_example=x_train.head(5),
        registered_model_name="Diabetes-Prediction",
    )



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'Diabetes-Prediction' already exists. Creating a new version of this model...
2025/05/14 00:50:46 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Diabetes-Prediction, version 2


🏃 View run Random Forest Classifier at: http://localhost:5000/#/experiments/497620571649005844/runs/c4cfee3ba9fa4bc8a0fda488e64551f6
🧪 View experiment at: http://localhost:5000/#/experiments/497620571649005844


Created version '2' of model 'Diabetes-Prediction'.


In [34]:
# Reload the logged model through MLflow's flavor-agnostic pyfunc interface,
# using the URI returned when the model was logged.
loaded_model = mlflow.pyfunc.load_model(model_uri=model_info.model_uri)

# Score the held-out test features with the reloaded model.
predictions = loaded_model.predict(data=x_test)

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

In [35]:
# Hand-written single-row inputs, in the same column order as the training
# features.
sample_1 = [[2, 85, 66, 29, 0, 26.6, 0.351, 31]]  # Likely Non-Diabetic (Healthy)
sample_2 = [[3, 125, 70, 30, 96, 28.5, 0.350, 35]]  # Borderline
sample_3 = [[6, 148, 72, 35, 0, 33.6, 0.627, 50]]  # Likely Diabetic

# Column names the model expects, in training order.
feature_names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# Wrap the chosen sample in a one-row DataFrame so it matches the model input.
# Only sample_3 is scored here; swap in sample_1 or sample_2 to try the others.
sample_df = pd.DataFrame(data=sample_3, columns=feature_names)

# Score the sample with the reloaded model.
prediction = loaded_model.predict(sample_df)

# A truthy first prediction means the positive (diabetic) class.
if prediction[0]:
    label = 'Yes'
else:
    label = 'No'
print("Prediction:", label)


Prediction: Yes
