# Dataset load-in


In [None]:
## load csv file from cloud drive
from google.colab import drive
drive.mount('/content/drive/')
data_path = "/content/drive/MyDrive/Colab Notebooks/"  # this is your drive

Mounted at /content/drive/


In [None]:
import pandas as pd
df = pd.read_csv(data_path +"cleaned_dataset2.csv")
df.head()

Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
0,502,17975,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,0
1,735,34221,1,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,0
2,420,47975,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,0
3,680,87656,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,0
4,504,34223,1,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,0


# Preprocess the dataset for model training

In [None]:
# split target and features
X = df.drop(columns=['CLASS','ID','No_Pation'])
y = df['CLASS']


X.head()


Unnamed: 0,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI
0,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0
1,1,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0
2,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0
3,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0
4,1,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0


In [None]:
y.head()

Unnamed: 0,CLASS
0,0
1,0
2,0
3,0
4,0


In [None]:
from sklearn.model_selection import train_test_split

# divide dataset into trainingpart and testing part
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Multinomial Logistic Regression model apply

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Standard Scale the Dataset
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# create Multinomial Logistic Regression Model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=2000)

# training model
model.fit(X_train_scaled, y_train)

feature_names = X_train.columns



# Evaluation of the performance

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# predict the result of the test dataset
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.86
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        21
           1       0.86      1.00      0.93       173
           2       0.00      0.00      0.00         6

    accuracy                           0.86       200
   macro avg       0.29      0.33      0.31       200
weighted avg       0.75      0.86      0.80       200

[[  0  21   0]
 [  0 173   0]
 [  0   6   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Add weight factor to the model

As we can see that, the performance of current multinomial logistic regression model on 0 type("Non Diabetic")and 2 type("Probably Diabetic") is too poor, so back into our data preprocessing, we can find out that there are three columns, HbA1c, BMI and AGE, are the top 3 important features in this data set. So I decided to add weight on them, see if it well help the model performs better.

In [None]:

weight_factor = 2.0
X_train_weighted = X_train.copy()
X_test_weighted = X_test.copy()

X_train_weighted[['HbA1c', 'BMI', 'AGE']] *= weight_factor
X_test_weighted[['HbA1c', 'BMI', 'AGE']] *= weight_factor

# Standard scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_weighted)
X_test_scaled = scaler.transform(X_test_weighted)

# create multunomial logistic regression model and add the max_iter into 2000
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=2000)

model.fit(X_train_scaled, y_train)


y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.94
              precision    recall  f1-score   support

           0       0.75      0.86      0.80        21
           1       0.98      0.97      0.97       173
           2       0.50      0.33      0.40         6

    accuracy                           0.94       200
   macro avg       0.74      0.72      0.72       200
weighted avg       0.94      0.94      0.94       200





Possible reason that the accuracy of P sort of classification is significantly lower than other type of classification is that the data of P tpye is too small (53/1000), may resign higher weight to it later.

In [None]:
import joblib

model_path = 'MultinomialLogisticRegression_data2.pkl'
joblib.dump(model, model_path)
print(f"File saved as {model_path}")

File saved as MultinomialLogisticRegression_data2.pkl
