In [1]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [16]:
df = pd.read_csv('../data/Pregnant and Expecting refined.csv')
df.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,heart_rate,Complication
0,18,21.55,73,123,0,6.9,70,0
1,18,33.99,82,130,0,7.5,86,0
2,18,33.37,78,124,1,6.9,76,0
3,18,22.93,87,120,0,6.9,76,0
4,18,25.18,86,131,1,6.7,76,0


In [4]:
import joblib
rf_model = joblib.load('../models/preeclampsia_joblib')

In [17]:
df.drop(columns=["Complication", 'heart_rate'], inplace=True)

In [18]:
df.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels
0,18,21.55,73,123,0,6.9
1,18,33.99,82,130,0,7.5
2,18,33.37,78,124,1,6.9
3,18,22.93,87,120,0,6.9
4,18,25.18,86,131,1,6.7


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415 entries, 0 to 414
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       415 non-null    int64  
 1   bmi                       415 non-null    float64
 2   diabp                     415 non-null    int64  
 3   sysbp                     415 non-null    int64  
 4   fam_history_hypertension  415 non-null    int64  
 5   glucose_levels            415 non-null    float64
dtypes: float64(2), int64(4)
memory usage: 19.6 KB


In [8]:
# Define a function to classify complications based on symptom count
def classify_complications(row, threshold=3):
    # Count the number of symptoms that indicate 'High risk'
    symptom_count = sum([row['age'] > 35,
                         row['age'] < 20,
                         row['bmi'] > 35, 
                         row['diabp'] >= 90, 
                         row['sysbp'] >= 140, 
                         row['fam_history_hypertension'] == 1,
                         row['glucose_levels'] >= 11.1,
                        ])
    
    # Classify as 'At Risk' if symptom count is >= threshold
    if symptom_count >= threshold or row['diabp'] >= 90 or row['sysbp'] >= 140:
        return 'High'
    return 'Low'

# Apply the function to each row to create the 'Complications' column
# df['Risk Level'] = df.apply(classify_complications, axis=1)

In [21]:
df.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
0,18,21.55,73,123,0,6.9,Low
1,18,33.99,82,130,0,7.5,Low
2,18,33.37,78,124,1,6.9,Low
3,18,22.93,87,120,0,6.9,Low
4,18,25.18,86,131,1,6.7,Low


In [22]:
df['Risk Level'].value_counts()

Risk Level
Low     403
High     12
Name: count, dtype: int64

In [23]:
df[df['Risk Level'] == 'High']

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
399,34,26.94,90,130,1,9.0,High
400,35,32.5,92,140,1,8.0,High
401,40,34.2,95,145,1,7.5,High
402,30,31.0,91,138,1,7.8,High
403,29,28.5,90,135,0,6.9,High
404,37,36.8,94,150,1,7.6,High
405,34,33.7,92,139,1,8.1,High
407,31,30.5,90,136,1,7.7,High
408,38,35.2,95,148,1,8.0,High
410,36,32.9,93,142,1,7.9,High


In [41]:
from sklearn.preprocessing import LabelEncoder

In [24]:
label_encoder = LabelEncoder()
df['Risk Level'] = label_encoder.fit_transform(df['Risk Level'])

In [25]:
df.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
0,18,21.55,73,123,0,6.9,1
1,18,33.99,82,130,0,7.5,1
2,18,33.37,78,124,1,6.9,1
3,18,22.93,87,120,0,6.9,1
4,18,25.18,86,131,1,6.7,1


From the above label encoding in Risk Level column;0 represents high, 1 represents Low

In [72]:
X = df.drop(columns=['Risk Level'])

In [73]:
X.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels
0,18,21.55,73,123,0,6.9
1,18,33.99,82,130,0,7.5
2,18,33.37,78,124,1,6.9
3,18,22.93,87,120,0,6.9
4,18,25.18,86,131,1,6.7


In [46]:
y = df['Risk Level']

In [47]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: Risk Level, dtype: int64

In [48]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X, y)

In [26]:
test_data1 = {
    'age': [17, 40, 23, 35, 19],
    'bmi': [36.2, 30.3, 29.4, 33.2, 34.1],
    'diabp': [84, 90, 85, 84, 86],
    'sysbp': [133, 135, 142, 134, 145],
    'fam_history_hypertension': [0, 1, 0, 1, 1],
    'glucose_levels': [11.2, 9.9, 10.9, 11.1, 11.3],
    'Risk Level': [0, 0, 0, 1, 0]
}
df_test1 = pd.DataFrame(test_data1)
df_test1.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
0,17,36.2,84,133,0,11.2,0
1,40,30.3,90,135,1,9.9,0
2,23,29.4,85,142,0,10.9,0
3,35,33.2,84,134,1,11.1,1
4,19,34.1,86,145,1,11.3,0


In [27]:
df_test1['Risk Level Test'] = df_test1.apply(classify_complications, axis=1)

In [28]:
df_test1.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level,Risk Level Test
0,17,36.2,84,133,0,11.2,0,High
1,40,30.3,90,135,1,9.9,0,High
2,23,29.4,85,142,0,10.9,0,High
3,35,33.2,84,134,1,11.1,1,Low
4,19,34.1,86,145,1,11.3,0,High


In [55]:
X_test = df_test.drop(columns=['Risk Level', 'Risk Level Test'])
X_test.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels
0,17,36.2,84,133,0,11.2
1,40,30.3,90,135,1,9.9
2,23,29.4,85,142,0,10.9
3,35,33.2,84,134,1,11.1
4,19,34.1,86,145,1,11.3


In [56]:
y_test = df_test['Risk Level']
y_test.head()

0    0
1    0
2    0
3    1
4    0
Name: Risk Level, dtype: int64

In [57]:
rf_model.score(X_test, y_test)

0.4

In [58]:
rf_model.predict(X_test)

array([1, 0, 1, 1, 1])

In [29]:
df_test1.drop(columns=['Risk Level Test'], inplace=True)

In [30]:
df_test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       5 non-null      int64  
 1   bmi                       5 non-null      float64
 2   diabp                     5 non-null      int64  
 3   sysbp                     5 non-null      int64  
 4   fam_history_hypertension  5 non-null      int64  
 5   glucose_levels            5 non-null      float64
 6   Risk Level                5 non-null      int64  
dtypes: float64(2), int64(5)
memory usage: 412.0 bytes


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415 entries, 0 to 414
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       415 non-null    int64  
 1   bmi                       415 non-null    float64
 2   diabp                     415 non-null    int64  
 3   sysbp                     415 non-null    int64  
 4   fam_history_hypertension  415 non-null    int64  
 5   glucose_levels            415 non-null    float64
 6   Risk Level                415 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 22.8 KB


In [65]:
df.drop(columns=['Risk Level Test'], inplace=True)

In [32]:
df.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
0,18,21.55,73,123,0,6.9,1
1,18,33.99,82,130,0,7.5,1
2,18,33.37,78,124,1,6.9,1
3,18,22.93,87,120,0,6.9,1
4,18,25.18,86,131,1,6.7,1


In [68]:
df_final = pd.concat([df_test, df])

In [69]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 420 entries, 0 to 414
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       420 non-null    int64  
 1   bmi                       420 non-null    float64
 2   diabp                     420 non-null    int64  
 3   sysbp                     420 non-null    int64  
 4   fam_history_hypertension  420 non-null    int64  
 5   glucose_levels            420 non-null    float64
 6   Risk Level                420 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 26.2 KB


In [70]:
df_final.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
0,17,36.2,84,133,0,11.2,0
1,40,30.3,90,135,1,9.9,0
2,23,29.4,85,142,0,10.9,0
3,35,33.2,84,134,1,11.1,1
4,19,34.1,86,145,1,11.3,0


In [74]:
X = df_final.drop(columns=['Risk Level'])

In [75]:
X.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels
0,17,36.2,84,133,0,11.2
1,40,30.3,90,135,1,9.9
2,23,29.4,85,142,0,10.9
3,35,33.2,84,134,1,11.1
4,19,34.1,86,145,1,11.3


In [76]:
y= df_final['Risk Level']

In [77]:
y.head()

0    0
1    0
2    0
3    1
4    0
Name: Risk Level, dtype: int64

In [78]:
rf_model.fit(X, y)

In [79]:
test_data = {
    'age': [18, 39, 27, 35, 17],
    'bmi': [37.7, 32.4, 28.9, 33.2, 32.3],
    'diabp': [82, 94, 85, 82, 88],
    'sysbp': [123, 138, 142, 136, 147],
    'fam_history_hypertension': [0, 1, 0, 1, 1],
    'glucose_levels': [12.0, 8.6, 10.9, 11.4, 11.1],
    'Risk Level': [0, 0, 0, 1, 0]
}
df_test = pd.DataFrame(test_data)
df_test.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
0,18,37.7,82,123,0,12.0,0
1,39,32.4,94,138,1,8.6,0
2,27,28.9,85,142,0,10.9,0
3,35,33.2,82,136,1,11.4,1
4,17,32.3,88,147,1,11.1,0


In [80]:
X_test = df_test.drop(columns=['Risk Level'])
X_test.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels
0,18,37.7,82,123,0,12.0
1,39,32.4,94,138,1,8.6
2,27,28.9,85,142,0,10.9
3,35,33.2,82,136,1,11.4
4,17,32.3,88,147,1,11.1


In [81]:
y_test = df_test['Risk Level']
y_test.head()

0    0
1    0
2    0
3    1
4    0
Name: Risk Level, dtype: int64

In [82]:
rf_model.score(X_test, y_test)

0.8

In [95]:
import joblib
joblib.dump(rf_model, "pre-eclampsia_joblib_v2")

['pre-eclampsia_joblib_v2']

In [83]:
rf_model.predict(X_test)

array([1, 0, 0, 1, 0])

In [84]:
df_test['Risk Level Test'] = df_test.apply(classify_complications, axis=1)
df_test.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level,Risk Level Test
0,18,37.7,82,123,0,12.0,0,High
1,39,32.4,94,138,1,8.6,0,High
2,27,28.9,85,142,0,10.9,0,High
3,35,33.2,82,136,1,11.4,1,Low
4,17,32.3,88,147,1,11.1,0,High


In [85]:
df_test.drop(columns=['Risk Level Test'], inplace=True)

# Using Logistic Regression

In [86]:
from sklearn.linear_model import LogisticRegression

In [88]:
l_model = LogisticRegression(max_iter=500)
l_model.fit(X, y)

In [89]:
l_model.score(X_test, y_test)

0.6

In [90]:
from sklearn.preprocessing import StandardScaler

In [91]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

In [93]:
l_model.fit(X_train_scaled, y)

In [94]:
l_model.score(X_test_scaled, y_test)

0.6

In [96]:
rf_model.fit(X_train_scaled, y)

In [97]:
rf_model.score(X_test_scaled, y_test)

0.8

In [3]:
import joblib
rf_model = joblib.load('../models/pre-eclampsia_joblib_v2')

In [7]:
test_data = {
    'age': [19, 39, 17, 36, 25, 18],
    'bmi':[36.2,38.2, 38.94, 34.3, 38.2, 36.4],
    'diabp': [80, 76, 89, 84, 87, 86],
    'sysbp': [134, 131, 139, 128, 131, 133],
    'fam_history_hypertension': [1, 1, 0, 1, 1, 0],
    'glucose_levels': [11.3, 10.2, 11.1, 11.5, 11.2, 11.6]
}
df_test = pd.DataFrame(test_data)
df_test

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels
0,19,36.2,80,134,1,11.3
1,39,38.2,76,131,1,10.2
2,17,38.94,89,139,0,11.1
3,36,34.3,84,128,1,11.5
4,25,38.2,87,131,1,11.2
5,18,36.4,86,133,0,11.6


In [8]:
rf_model.predict(df_test)

array([1, 0, 0, 1, 1, 1])

In [10]:
df_test['Risk Level'] = df_test.apply(classify_complications, axis=1)

<bound method NDFrame.head of    age    bmi  diabp  sysbp  fam_history_hypertension  glucose_levels  \
0   19  36.20     80    134                         1            11.3   
1   39  38.20     76    131                         1            10.2   
2   17  38.94     89    139                         0            11.1   
3   36  34.30     84    128                         1            11.5   
4   25  38.20     87    131                         1            11.2   
5   18  36.40     86    133                         0            11.6   

  Risk Level  
0       High  
1       High  
2       High  
3       High  
4       High  
5       High  >

In [11]:
df_test.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
0,19,36.2,80,134,1,11.3,High
1,39,38.2,76,131,1,10.2,High
2,17,38.94,89,139,0,11.1,High
3,36,34.3,84,128,1,11.5,High
4,25,38.2,87,131,1,11.2,High


In [12]:
label_encoder = LabelEncoder()
df_test['Risk Level'] = label_encoder.fit_transform(df_test['Risk Level'])

In [13]:
df_test.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
0,19,36.2,80,134,1,11.3,0
1,39,38.2,76,131,1,10.2,0
2,17,38.94,89,139,0,11.1,0
3,36,34.3,84,128,1,11.5,0
4,25,38.2,87,131,1,11.2,0


In [15]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       6 non-null      int64  
 1   bmi                       6 non-null      float64
 2   diabp                     6 non-null      int64  
 3   sysbp                     6 non-null      int64  
 4   fam_history_hypertension  6 non-null      int64  
 5   glucose_levels            6 non-null      float64
 6   Risk Level                6 non-null      int64  
dtypes: float64(2), int64(5)
memory usage: 468.0 bytes


In [33]:
df_final = pd.concat([df_test, df_test1, df])

In [34]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 426 entries, 0 to 414
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       426 non-null    int64  
 1   bmi                       426 non-null    float64
 2   diabp                     426 non-null    int64  
 3   sysbp                     426 non-null    int64  
 4   fam_history_hypertension  426 non-null    int64  
 5   glucose_levels            426 non-null    float64
 6   Risk Level                426 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 26.6 KB


In [36]:
df_final.head(10)

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
0,19,36.2,80,134,1,11.3,0
1,39,38.2,76,131,1,10.2,0
2,17,38.94,89,139,0,11.1,0
3,36,34.3,84,128,1,11.5,0
4,25,38.2,87,131,1,11.2,0
5,18,36.4,86,133,0,11.6,0
0,17,36.2,84,133,0,11.2,0
1,40,30.3,90,135,1,9.9,0
2,23,29.4,85,142,0,10.9,0
3,35,33.2,84,134,1,11.1,1


In [37]:
X = df_final.drop(columns=['Risk Level'])

In [38]:
X.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels
0,19,36.2,80,134,1,11.3
1,39,38.2,76,131,1,10.2
2,17,38.94,89,139,0,11.1
3,36,34.3,84,128,1,11.5
4,25,38.2,87,131,1,11.2


In [39]:
y = df_final['Risk Level']

In [40]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Risk Level, dtype: int64

In [41]:
rf_model_v2 = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_v2.fit(X, y)

In [42]:
test_data4 = {
    'age': [18, 39, 27, 35, 17],
    'bmi': [37.7, 32.4, 28.9, 33.2, 32.3],
    'diabp': [82, 94, 85, 82, 88],
    'sysbp': [123, 138, 142, 136, 147],
    'fam_history_hypertension': [0, 1, 0, 1, 1],
    'glucose_levels': [12.0, 8.6, 10.9, 11.4, 11.1],
    'Risk Level': [0, 0, 0, 1, 0]
}
df_test4 = pd.DataFrame(test_data4)
df_test4.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
0,18,37.7,82,123,0,12.0,0
1,39,32.4,94,138,1,8.6,0
2,27,28.9,85,142,0,10.9,0
3,35,33.2,82,136,1,11.4,1
4,17,32.3,88,147,1,11.1,0


In [43]:
X_test = df_test4.drop(columns=['Risk Level'])
X_test.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels
0,18,37.7,82,123,0,12.0
1,39,32.4,94,138,1,8.6
2,27,28.9,85,142,0,10.9
3,35,33.2,82,136,1,11.4
4,17,32.3,88,147,1,11.1


In [44]:
y_test = df_test4['Risk Level']
y_test.head()

0    0
1    0
2    0
3    1
4    0
Name: Risk Level, dtype: int64

In [45]:
rf_model_v2.score(X_test, y_test)

1.0

In [46]:
rf_model_v2.predict(X_test)

array([0, 0, 0, 1, 0])

In [52]:
# Set a seed for reproducibility
np.random.seed(42)

# Generate random test data
test_data5 = {
    'age': np.random.randint(16, 45, size=25),  # Age between 16 and 45
    'bmi': np.round(np.random.uniform(25, 40, size=25), 1),  # BMI between 25 and 40
    'diabp': np.random.randint(75, 100, size=25),  # Diastolic BP between 75 and 100
    'sysbp': np.random.randint(120, 150, size=25),  # Systolic BP between 120 and 150
    'fam_history_hypertension': np.random.choice([0, 1], size=25),  # Family history (0 or 1)
    'glucose_levels': np.round(np.random.uniform(7.0, 13.0, size=25), 1)  # Glucose levels between 7.0 and 13.0
}

# Create DataFrame
df_test5 = pd.DataFrame(test_data5)

# Display first few rows
df_test5.tail(10)

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels
0,22,39.1,76,133,0,12.5
1,35,25.0,94,135,0,12.1
2,44,39.9,89,134,0,9.7
3,30,34.3,81,127,0,7.6
4,26,34.2,86,133,0,9.2
5,23,25.1,82,142,0,11.0
6,44,25.3,89,147,1,11.0
7,36,32.9,77,144,0,10.5
8,22,31.0,88,149,1,8.6
9,41,25.7,91,127,1,10.4


In [53]:
df_test5.tail()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels
20,37,25.2,92,147,0,8.2
21,36,39.1,86,147,1,12.4
22,17,33.4,76,132,0,9.9
23,39,30.8,84,128,1,10.4
24,27,25.2,78,148,0,11.2


In [54]:
rf_model_v2.predict(df_test5)

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0])

In [55]:
df_test5['Risk Level'] = df_test5.apply(classify_complications, axis=1)

In [56]:
df_test5.head(10)

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
0,22,39.1,76,133,0,12.5,Low
1,35,25.0,94,135,0,12.1,High
2,44,39.9,89,134,0,9.7,Low
3,30,34.3,81,127,0,7.6,Low
4,26,34.2,86,133,0,9.2,Low
5,23,25.1,82,142,0,11.0,High
6,44,25.3,89,147,1,11.0,High
7,36,32.9,77,144,0,10.5,High
8,22,31.0,88,149,1,8.6,High
9,41,25.7,91,127,1,10.4,High


In [57]:
df_test5['Risk Level'] = label_encoder.fit_transform(df_test5['Risk Level'])

In [64]:
df_test5.tail()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
20,37,25.2,92,147,0,8.2,0
21,36,39.1,86,147,1,12.4,0
22,17,33.4,76,132,0,9.9,1
23,39,30.8,84,128,1,10.4,1
24,27,25.2,78,148,0,11.2,0


In [59]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 426 entries, 0 to 414
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       426 non-null    int64  
 1   bmi                       426 non-null    float64
 2   diabp                     426 non-null    int64  
 3   sysbp                     426 non-null    int64  
 4   fam_history_hypertension  426 non-null    int64  
 5   glucose_levels            426 non-null    float64
 6   Risk Level                426 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 26.6 KB


In [60]:
df_final_v2 = pd.concat([df_final, df_test5])

In [62]:
df_final_v2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 451 entries, 0 to 24
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       451 non-null    int64  
 1   bmi                       451 non-null    float64
 2   diabp                     451 non-null    int64  
 3   sysbp                     451 non-null    int64  
 4   fam_history_hypertension  451 non-null    int64  
 5   glucose_levels            451 non-null    float64
 6   Risk Level                451 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 28.2 KB


In [63]:
df_final_v2.tail()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
20,37,25.2,92,147,0,8.2,0
21,36,39.1,86,147,1,12.4,0
22,17,33.4,76,132,0,9.9,1
23,39,30.8,84,128,1,10.4,1
24,27,25.2,78,148,0,11.2,0


In [65]:
X = df_final_v2.drop(columns=['Risk Level'])

In [66]:
X.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels
0,19,36.2,80,134,1,11.3
1,39,38.2,76,131,1,10.2
2,17,38.94,89,139,0,11.1
3,36,34.3,84,128,1,11.5
4,25,38.2,87,131,1,11.2


In [68]:
y = df_final_v2['Risk Level']

In [69]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Risk Level, dtype: int64

In [70]:
rf_model_v2.fit(X,y)

In [71]:
# Set a seed for reproducibility
np.random.seed(42)

# Generate random test data
test_data6 = {
    'age': np.random.randint(16, 45, size=10),  # Age between 16 and 45
    'bmi': np.round(np.random.uniform(25, 40, size=10), 1),  # BMI between 25 and 40
    'diabp': np.random.randint(75, 100, size=10),  # Diastolic BP between 75 and 100
    'sysbp': np.random.randint(120, 150, size=10),  # Systolic BP between 120 and 150
    'fam_history_hypertension': np.random.choice([0, 1], size=10),  # Family history (0 or 1)
    'glucose_levels': np.round(np.random.uniform(7.0, 13.0, size=10), 1)  # Glucose levels between 7.0 and 13.0
}

# Create DataFrame
df_test6 = pd.DataFrame(test_data6)

# Display first few rows
df_test6.head(10)

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels
0,22,27.3,95,149,0,10.4
1,35,25.9,75,149,0,9.3
2,44,38.0,86,134,0,7.1
3,30,34.0,96,149,0,8.4
4,26,35.6,86,138,0,8.4
5,23,25.3,99,131,0,11.1
6,44,39.5,91,142,1,10.7
7,36,37.5,84,139,1,12.0
8,22,28.2,90,144,0,8.0
9,41,27.7,89,122,1,9.3


In [72]:
df_test6['Risk Level'] = df_test6.apply(classify_complications, axis=1)

In [73]:
df_test6.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
0,22,27.3,95,149,0,10.4,High
1,35,25.9,75,149,0,9.3,High
2,44,38.0,86,134,0,7.1,Low
3,30,34.0,96,149,0,8.4,High
4,26,35.6,86,138,0,8.4,Low


In [74]:
df_test6['Risk Level'] = label_encoder.fit_transform(df_test6['Risk Level'])

In [79]:
df_test6.head(10)

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
0,22,27.3,95,149,0,10.4,0
1,35,25.9,75,149,0,9.3,0
2,44,38.0,86,134,0,7.1,1
3,30,34.0,96,149,0,8.4,0
4,26,35.6,86,138,0,8.4,1
5,23,25.3,99,131,0,11.1,0
6,44,39.5,91,142,1,10.7,0
7,36,37.5,84,139,1,12.0,0
8,22,28.2,90,144,0,8.0,0
9,41,27.7,89,122,1,9.3,1


In [76]:
X_test = df_test6.drop(columns=['Risk Level'])
y_test = df_test6['Risk Level']

In [77]:
rf_model_v2.score(X_test, y_test)

0.9

In [78]:
rf_model.predict(X_test)

array([0, 0, 1, 0, 1, 0, 0, 1, 0, 1])

In [90]:
# Set a seed for reproducibility
np.random.seed(42)

# Generate random test data
test_data7 = {
    'age': np.random.randint(16, 45, size=10),  # Age between 16 and 45
    'bmi': np.round(np.random.uniform(25, 40, size=10), 1),  # BMI between 25 and 40
    'diabp': np.random.randint(75, 86, size=10),  # Diastolic BP between 75 and 100
    'sysbp': np.random.randint(120, 136, size=10),  # Systolic BP between 120 and 150
    'fam_history_hypertension': np.random.choice([0, 1], size=10),  # Family history (0 or 1)
    'glucose_levels': np.round(np.random.uniform(7.0, 13.0, size=10), 1)  # Glucose levels between 7.0 and 13.0
}

# Create DataFrame
df_test7 = pd.DataFrame(test_data6)

# Display first few rows
df_test7.head(10)

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels
0,22,27.3,95,149,0,10.4
1,35,25.9,75,149,0,9.3
2,44,38.0,86,134,0,7.1
3,30,34.0,96,149,0,8.4
4,26,35.6,86,138,0,8.4
5,23,25.3,99,131,0,11.1
6,44,39.5,91,142,1,10.7
7,36,37.5,84,139,1,12.0
8,22,28.2,90,144,0,8.0
9,41,27.7,89,122,1,9.3


In [81]:
df_test7['Risk Level'] = df_test7.apply(classify_complications, axis=1)

In [85]:
df_test7.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       10 non-null     int32  
 1   bmi                       10 non-null     float64
 2   diabp                     10 non-null     int32  
 3   sysbp                     10 non-null     int32  
 4   fam_history_hypertension  10 non-null     int64  
 5   glucose_levels            10 non-null     float64
dtypes: float64(2), int32(3), int64(1)
memory usage: 492.0 bytes


In [92]:
df_final_v2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 451 entries, 0 to 24
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       451 non-null    int64  
 1   bmi                       451 non-null    float64
 2   diabp                     451 non-null    int64  
 3   sysbp                     451 non-null    int64  
 4   fam_history_hypertension  451 non-null    int64  
 5   glucose_levels            451 non-null    float64
 6   Risk Level                451 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 28.2 KB


In [93]:
df_final_v2.to_csv("../data/preeclampsia_90.csv", index=False)

In [None]:
joblib.dump(rf_model, "../models/preeclampsia_v3"

In [20]:
np.random.seed(42)

# Generate random test data
test_data8 = {
    'age': np.random.randint(16, 45, size=30),  # Age between 16 and 45
    'bmi': np.round(np.random.uniform(35, 40, size=30), 1),  # BMI between 25 and 40
    'diabp': np.random.randint(75, 86, size=30),  # Diastolic BP between 75 and 100
    'sysbp': np.random.randint(120, 136, size=30),  # Systolic BP between 120 and 150
    'fam_history_hypertension': np.random.choice([0, 1], size=30),  # Family history (0 or 1)
    'glucose_levels': np.round(np.random.uniform(7.0, 13.0, size=30), 1)  # Glucose levels between 7.0 and 13.0
}

# Create DataFrame
df_test8 = pd.DataFrame(test_data8)

# Display first few rows
df_test8.head(10)

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels
0,22,36.5,76,134,1,10.4
1,35,37.6,80,130,0,11.2
2,44,37.2,80,122,0,7.8
3,30,36.5,84,120,0,10.6
4,26,38.1,78,127,0,10.2
5,23,35.7,80,122,0,8.2
6,44,36.5,76,122,0,12.7
7,36,36.8,84,120,1,10.6
8,22,37.3,76,130,0,11.2
9,41,38.9,84,124,0,12.3


In [5]:
rf_model_v3 = joblib.load("../models/preeclampsia_joblib_v3")

In [23]:
rf_model_v3.predict(df_test8)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0])

In [24]:
df_test8['Risk Level'] = df_test8.apply(classify_complications, axis=1)

In [27]:
df_test8.head(13)

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
0,22,36.5,76,134,1,10.4,Low
1,35,37.6,80,130,0,11.2,Low
2,44,37.2,80,122,0,7.8,Low
3,30,36.5,84,120,0,10.6,Low
4,26,38.1,78,127,0,10.2,Low
5,23,35.7,80,122,0,8.2,Low
6,44,36.5,76,122,0,12.7,High
7,36,36.8,84,120,1,10.6,High
8,22,37.3,76,130,0,11.2,Low
9,41,38.9,84,124,0,12.3,High


In [26]:
df_test8['Risk Level'].value_counts()

Risk Level
Low     19
High    11
Name: count, dtype: int64

In [28]:
label_encoder = LabelEncoder()

In [29]:
df_test8['Risk Level'] = label_encoder.fit_transform(df_test8['Risk Level'])

In [30]:
df_test8.head(10)

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
0,22,36.5,76,134,1,10.4,1
1,35,37.6,80,130,0,11.2,1
2,44,37.2,80,122,0,7.8,1
3,30,36.5,84,120,0,10.6,1
4,26,38.1,78,127,0,10.2,1
5,23,35.7,80,122,0,8.2,1
6,44,36.5,76,122,0,12.7,0
7,36,36.8,84,120,1,10.6,0
8,22,37.3,76,130,0,11.2,1
9,41,38.9,84,124,0,12.3,0


In [31]:
df1 = pd.read_csv("../data/preeclampsia_90.csv")

In [32]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451 entries, 0 to 450
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       451 non-null    int64  
 1   bmi                       451 non-null    float64
 2   diabp                     451 non-null    int64  
 3   sysbp                     451 non-null    int64  
 4   fam_history_hypertension  451 non-null    int64  
 5   glucose_levels            451 non-null    float64
 6   Risk Level                451 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 24.8 KB


In [33]:
df_test8.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       30 non-null     int32  
 1   bmi                       30 non-null     float64
 2   diabp                     30 non-null     int32  
 3   sysbp                     30 non-null     int32  
 4   fam_history_hypertension  30 non-null     int64  
 5   glucose_levels            30 non-null     float64
 6   Risk Level                30 non-null     int64  
dtypes: float64(2), int32(3), int64(2)
memory usage: 1.4 KB


In [36]:
df1.tail(10)

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
441,36,39.7,80,140,1,8.5,0
442,19,32.0,96,143,0,7.2,0
443,23,37.9,84,145,0,11.3,0
444,39,35.2,78,144,0,7.7,0
445,18,31.8,96,147,0,9.6,0
446,37,25.2,92,147,0,8.2,0
447,36,39.1,86,147,1,12.4,0
448,17,33.4,76,132,0,9.9,1
449,39,30.8,84,128,1,10.4,1
450,27,25.2,78,148,0,11.2,0


In [37]:
df = pd.concat([df_test8, df1])

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 481 entries, 0 to 450
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       481 non-null    int64  
 1   bmi                       481 non-null    float64
 2   diabp                     481 non-null    int64  
 3   sysbp                     481 non-null    int64  
 4   fam_history_hypertension  481 non-null    int64  
 5   glucose_levels            481 non-null    float64
 6   Risk Level                481 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 30.1 KB


In [39]:
df.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
0,22,36.5,76,134,1,10.4,1
1,35,37.6,80,130,0,11.2,1
2,44,37.2,80,122,0,7.8,1
3,30,36.5,84,120,0,10.6,1
4,26,38.1,78,127,0,10.2,1


In [40]:
X = df.drop(columns=['Risk Level'])

In [41]:
y = df['Risk Level']

In [42]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X, y)

In [43]:
np.random.seed(42)

# Generate random test data
test_data9 = {
    'age': np.random.randint(16, 45, size=20),  # Age between 16 and 45
    'bmi': np.round(np.random.uniform(35, 40, size=20), 1),  # BMI between 25 and 40
    'diabp': np.random.randint(75, 86, size=20),  # Diastolic BP between 75 and 100
    'sysbp': np.random.randint(120, 136, size=20),  # Systolic BP between 120 and 150
    'fam_history_hypertension': np.random.choice([0, 1], size=20),  # Family history (0 or 1)
    'glucose_levels': np.round(np.random.uniform(9.0, 13.0, size=20), 1)  # Glucose levels between 7.0 and 13.0
}

# Create DataFrame
df_test9 = pd.DataFrame(test_data9)

# Display first few rows
df_test9.head(10)

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels
0,22,35.1,77,121,0,11.4
1,35,39.8,81,127,1,10.1
2,44,39.2,79,123,0,10.2
3,30,36.1,83,121,1,9.7
4,26,35.9,81,133,1,9.1
5,23,35.9,76,125,0,10.7
6,44,36.5,78,125,1,10.6
7,36,37.6,83,129,0,10.2
8,22,37.2,76,123,1,9.1
9,41,36.5,84,125,1,9.8


In [44]:
df_test9['Risk Level'] = df_test9.apply(classify_complications, axis=1)
df_test9.head()

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
0,22,35.1,77,121,0,11.4,Low
1,35,39.8,81,127,1,10.1,Low
2,44,39.2,79,123,0,10.2,Low
3,30,36.1,83,121,1,9.7,Low
4,26,35.9,81,133,1,9.1,Low


In [45]:
df_test9['Risk Level'].value_counts()

Risk Level
Low     12
High     8
Name: count, dtype: int64

In [46]:
df_test9['Risk Level'] = label_encoder.fit_transform(df_test9['Risk Level'])

In [53]:
df_test9.tail(10)

Unnamed: 0,age,bmi,diabp,sysbp,fam_history_hypertension,glucose_levels,Risk Level
10,34,38.1,83,132,0,11.8,1
11,38,35.7,84,134,1,12.2,0
12,26,36.5,79,121,0,11.4,1
13,26,36.8,76,129,1,12.7,0
14,39,37.3,78,131,0,11.6,0
15,36,38.9,81,121,0,12.7,0
16,19,36.0,82,129,1,12.4,0
17,23,37.6,77,133,1,10.8,1
18,39,38.0,75,123,0,9.4,1
19,18,35.2,78,133,1,10.5,0


In [48]:
X_test = df_test9.drop(columns=['Risk Level'])

In [49]:
y_test = df_test9['Risk Level']
y_test.head()

0    1
1    1
2    1
3    1
4    1
Name: Risk Level, dtype: int64

In [50]:
rf_model.score(X_test, y_test)

0.85

In [51]:
rf_model.predict(X_test)

array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1])

In [54]:
joblib.dump(rf_model, "preeclampsia_joblib_v4")

['preeclampsia_joblib_v4']

In [55]:
df.to_csv("../data/preeclampsia85.csv", index=False)