In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [2]:
new_csv_path = '/content/CleanedNewData.csv'

new_csv_path
df = pd.read_csv(new_csv_path)

In [3]:
# Capitalize all column names
df.columns = [column.upper() for column in df.columns]

# Verify the updated column names to ensure they are all capitalized
print(df.columns)


Index(['MMSE', 'AGE', 'SMOKING', 'ALCOHOL', 'DM', 'INSULIN', 'HYPERLIPIDEMIA',
       'KAH', 'HYPOTHYROIDISM', 'ASTIM', 'KOAH', 'OP', 'HT', 'CST',
       'GAIT_SPEED', 'GRIP_STRENGTH', 'SARCOPENIA', 'GENDER', 'ACTIVE',
       'WEIGHT-STATUS'],
      dtype='object')


In [4]:
df.head()

Unnamed: 0,MMSE,AGE,SMOKING,ALCOHOL,DM,INSULIN,HYPERLIPIDEMIA,KAH,HYPOTHYROIDISM,ASTIM,KOAH,OP,HT,CST,GAIT_SPEED,GRIP_STRENGTH,SARCOPENIA,GENDER,ACTIVE,WEIGHT-STATUS
0,,64,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,,0,8.1,1.28,28.0,0.0,F,2.0,Overweight
1,,53,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,,0,8.0,1.47,16.0,0.0,F,,Healthy Weight
2,24.0,56,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,,1,11.7,0.8,23.0,0.0,F,1.0,Obesity
3,,58,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,,1,18.0,1.34,23.0,0.0,F,1.0,Overweight
4,30.0,55,1.0,0.0,0,0.0,0.0,0.0,0.0,,0.0,1.0,0,10.9,1.09,21.0,0.0,F,,Overweight


In [5]:
df.shape

(1303, 20)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   MMSE            812 non-null    float64
 1   AGE             1303 non-null   int64  
 2   SMOKING         1302 non-null   float64
 3   ALCOHOL         1229 non-null   float64
 4   DM              1303 non-null   int64  
 5   INSULIN         1297 non-null   float64
 6   HYPERLIPIDEMIA  1301 non-null   float64
 7   KAH             1261 non-null   float64
 8   HYPOTHYROIDISM  1271 non-null   float64
 9   ASTIM           1180 non-null   float64
 10  KOAH            1225 non-null   float64
 11  OP              355 non-null    float64
 12  HT              1303 non-null   int64  
 13  CST             1303 non-null   float64
 14  GAIT_SPEED      1300 non-null   float64
 15  GRIP_STRENGTH   1303 non-null   float64
 16  SARCOPENIA      1302 non-null   float64
 17  GENDER          1303 non-null   o

In [7]:
df.describe()

Unnamed: 0,MMSE,AGE,SMOKING,ALCOHOL,DM,INSULIN,HYPERLIPIDEMIA,KAH,HYPOTHYROIDISM,ASTIM,KOAH,OP,HT,CST,GAIT_SPEED,GRIP_STRENGTH,SARCOPENIA,ACTIVE
count,812.0,1303.0,1302.0,1229.0,1303.0,1297.0,1301.0,1261.0,1271.0,1180.0,1225.0,355.0,1303.0,1303.0,1300.0,1303.0,1302.0,840.0
mean,27.519704,61.348427,0.321813,0.087063,0.303147,0.06091,0.204458,0.126883,0.130606,0.055085,0.014694,0.408451,0.634689,11.236761,0.988775,26.947045,0.18894,1.057143
std,2.767765,9.724303,0.467351,0.296126,0.459795,0.239257,0.40346,0.332974,0.337101,0.228242,0.120374,0.492241,0.481702,4.449019,0.281349,9.291133,0.391611,0.232254
min,13.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.1,0.25,7.0,0.0,1.0
25%,27.0,53.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.8,0.78,20.0,0.0,1.0
50%,28.0,61.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,10.5,0.98,25.0,0.0,1.0
75%,30.0,68.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,12.6,1.17,32.0,0.0,1.0
max,30.0,92.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,50.0,2.41,77.0,1.0,2.0


In [8]:
# Splitting the data based on Gender
data_male = df[df['GENDER'] == 'M']
data_female = df[df['GENDER'] == 'F']

# Checking the sizes of the datasets to ensure proper split
data_male.shape, data_female.shape


((379, 20), (924, 20))

In [9]:
# Split the dataset by gender
data_male = df[df['GENDER'] == 'M']
data_female = df[df['GENDER'] == 'F']

num_cols = ['AGE', 'ACTIVE', 'HT', 'MMSE', 'SMOKING', 'ALCOHOL', 'DM', 'INSULIN', 'HYPERLIPIDEMIA', 'KAH', 'HYPOTHYROIDISM', 'ASTIM', 'KOAH', 'OP', 'CST', 'GAIT_SPEED', 'GRIP_STRENGTH']

cat_cols = ['WEIGHT-STATUS']

# Create preprocessing pipelines
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, num_cols),
    ('cat', categorical_pipeline, cat_cols)
])

# Create a full pipeline with the classifier
from sklearn.linear_model import LogisticRegression

full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])


# Removing rows where the target variable 'Sarcopenia' is NaN for both male and female datasets
data_male = data_male.dropna(subset=['SARCOPENIA'])
data_female = data_female.dropna(subset=['SARCOPENIA'])

# Split the cleaned data
X_male = data_male.drop('SARCOPENIA', axis=1)
y_male = data_male['SARCOPENIA']
X_female = data_female.drop('SARCOPENIA', axis=1)
y_female = data_female['SARCOPENIA']

# Apply preprocessing again if needed
X_male_preprocessed = preprocessor.fit_transform(X_male)
X_female_preprocessed = preprocessor.fit_transform(X_female)

# Apply the preprocessing
X_male_preprocessed = preprocessor.fit_transform(X_male)
X_female_preprocessed = preprocessor.fit_transform(X_female)


In [10]:
# Example of splitting data into training and test sets
X_train_male, X_test_male, y_train_male, y_test_male = train_test_split(X_male_preprocessed, y_male, test_size=0.2, random_state=42)
X_train_female, X_test_female, y_train_female, y_test_female = train_test_split(X_female_preprocessed, y_female, test_size=0.2, random_state=42)


In [11]:
# Fit Model 1: Logistic Regression
lr = LogisticRegression()
lr.fit(X_train_male, y_train_male)  # Train on the entire training dataset
probs_logreg_test = lr.predict_proba(X_test_male)[:, 1]  # Predict probabilities on the test set

# Define thresholds for moving to the next model
threshold_high = 0.8
threshold_low = 0.2

# Filter cases based on thresholds in the test set
intermediate_indices_test = (probs_logreg_test > threshold_low) & (probs_logreg_test < threshold_high)
X_intermediate_test_male = X_test_male[intermediate_indices_test]
y_intermediate_test_male = y_test_male[intermediate_indices_test]

# Fit Model 2: Random Forest on filtered intermediate test data
# Ensure to train Random Forest on the corresponding filtered training set if this step is needed
rf = RandomForestClassifier()
rf.fit(X_train_male, y_train_male)  # Typically, you'd want a filtered version or all data depending on approach
probs_rf_test = rf.predict_proba(X_intermediate_test_male)[:, 1]  # Predict on the filtered intermediate test data

# Define updated thresholds for the next filtering
threshold_rf_high = 0.75
threshold_rf_low = 0.25
final_indices_test = (probs_rf_test > threshold_rf_low) & (probs_rf_test < threshold_rf_high)
X_final_test_male = X_intermediate_test_male[final_indices_test]
y_final_test_male = y_intermediate_test_male[final_indices_test]

# Fit Model 3: Gradient Boosting on final filtered test data
gr = GradientBoostingClassifier()
gr.fit(X_train_male, y_train_male)  # Similar consideration for training set
final_probs_gb = gr.predict_proba(X_final_test_male)[:, 1]  # Final probabilities on the test set

# Evaluate the final model
print("Final model evaluation on male test data:")
print(classification_report(y_final_test_male, (final_probs_gb > 0.5).astype(int)))


Final model evaluation on male test data:
              precision    recall  f1-score   support

         0.0       0.20      0.50      0.29         2
         1.0       0.86      0.60      0.71        10

    accuracy                           0.58        12
   macro avg       0.53      0.55      0.50        12
weighted avg       0.75      0.58      0.64        12



In [12]:
# Fit Model 1: Logistic Regression
lr = LogisticRegression()
lr.fit(X_train_female, y_train_female)  # Train on the entire training dataset
probs_logreg_test = lr.predict_proba(X_test_female)[:, 1]  # Predict probabilities on the test set

# Define thresholds for moving to the next model
threshold_high = 0.8
threshold_low = 0.2

# Filter cases based on thresholds in the test set
intermediate_indices_test = (probs_logreg_test > threshold_low) & (probs_logreg_test < threshold_high)
X_intermediate_test_female = X_test_female[intermediate_indices_test]
y_intermediate_test_female = y_test_female[intermediate_indices_test]

# Fit Model 2: Random Forest on filtered intermediate test data
# Ensure to train Random Forest on the corresponding filtered training set if this step is needed
rf = RandomForestClassifier()
rf.fit(X_train_female, y_train_female)  # Typically, you'd want a filtered version or all data depending on approach
probs_rf_female = rf.predict_proba(X_intermediate_test_female)[:, 1]  # Predict on the filtered intermediate test data

# Update thresholds for more precision
threshold_rf_high_female = 0.75
threshold_rf_low_female = 0.25
final_indices_female = (probs_rf_female > threshold_rf_low_female) & (probs_rf_female < threshold_rf_high_female)
X_final_test_female = X_intermediate_test_female[final_indices_female]
y_final_test_female = y_intermediate_test_female[final_indices_female]


# Fit Model 3: Gradient Boosting on final filtered test data
gb = GradientBoostingClassifier()
gb.fit(X_train_female, y_train_female)  # Similar consideration for training set
final_probs_gb = gb.predict_proba(X_final_test_female)[:, 1]  # Final probabilities on the test set

# Evaluate the final model
print("Final model evaluation on female test data:")
print(classification_report(y_final_test_female, (final_probs_gb > 0.5).astype(int)))


Final model evaluation on female test data:
              precision    recall  f1-score   support

         0.0       0.36      0.31      0.33        13
         1.0       0.50      0.56      0.53        16

    accuracy                           0.45        29
   macro avg       0.43      0.44      0.43        29
weighted avg       0.44      0.45      0.44        29



In [13]:
import joblib

# Assuming you have three trained models: logreg, random_forest, and gradient_boosting
models = {
    'logistic_regression': lr,
    'random_forest': rf,
    'gradient_boosting': gb
}

# Save all models to a single file
joblib.dump(models, 'three_tiered_models.joblib')

['three_tiered_models.joblib']

In [14]:
from IPython.display import FileLink

# Provide a link to download the saved model file
FileLink(r'three_tiered_models.joblib')


In [19]:
from flask import Flask, request, render_template, jsonify
import joblib
import sklearn

app = Flask(__name__)

# Load your model
models = joblib.load('/content/three_tiered_models.joblib')

# Define a route for the default page
@app.route('/')
def index():
    return render_template('index.html')  # Render a user form

# Define a route to handle form submission
@app.route('/predict', methods=['POST'])
def predict():
    # Extract features from the form
    input_features = [float(request.form['Age']), float(request.form['Weight Status']),  float(request.form['MMSE']), float(request.form['Smoking']),
                       float(request.form['Alcohol']), float(request.form['DM']), float(request.form['Insulin']), float(request.form['KAH']), float(request.form['Hypothyroidism']),
                       float(request.form['CST']), float(request.form['Gait Speed']), float(request.form['Astim']), float(request.form['KOAH']), float(request.form['Active']),
                         float(request.form['Gender']), float(request.form['Hyperlipidemia']), float(request.form['OP']), float(request.form['HT'])]  # Adjust according to your features
    # Process through the tiered models
    # Assume input_features is properly formatted for the model
    result = process_through_models(input_features)

    # You can choose to return a template or simply a response
    return jsonify({'prediction': result})

def process_through_models(features):
    # First model
    prob1 = models['logistic_regression'].predict_proba([features])[0, 1]
    if prob1 < 0.2:
        return 'Low risk of Sarcopenia'
    elif prob1 > 0.8:
        return 'High risk of Sarcopenia'

    # Second model
    prob2 = models['random_forest'].predict_proba([features])[0, 1]
    if prob2 < 0.25:
        return 'Low risk of Sarcopenia'
    elif prob2 > 0.75:
        return 'High risk of Sarcopenia'

    # Third model
    prob3 = models['gradient_boosting'].predict_proba([features])[0, 1]
    if prob3 < 0.5:
        return 'Low risk of Sarcopenia'
    else:
        return 'High risk of Sarcopenia'

if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat
