In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

In [5]:
# Load your dataset
data = pd.read_csv(r"C:\Users\DELL 7470\Desktop\PREG COMPLICATION\Maternal Risk Prediction\Maternal Health Risk Data Set.csv")
data.head()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,high risk
1,35,140,90,13.0,98.0,70,high risk
2,29,90,70,8.0,100.0,80,high risk
3,30,140,85,7.0,98.0,70,high risk
4,35,120,60,6.1,98.0,76,low risk


In [6]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1014 entries, 0 to 1013
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          1014 non-null   int64  
 1   SystolicBP   1014 non-null   int64  
 2   DiastolicBP  1014 non-null   int64  
 3   BS           1014 non-null   float64
 4   BodyTemp     1014 non-null   float64
 5   HeartRate    1014 non-null   int64  
 6   RiskLevel    1014 non-null   object 
dtypes: float64(2), int64(4), object(1)
memory usage: 55.6+ KB


In [9]:
print(data.shape)

(1014, 7)


In [27]:
# Acall the target variable
X = data.drop(columns='RiskLevel')
y = data['RiskLevel']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode categorical variables
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

# Align the train and test sets to have the same columns
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)


# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [33]:
from sklearn.preprocessing import PolynomialFeatures
# Generate polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
poly_features = poly.fit_transform(X_train_scaled)

# Convert the polynomial features to a DataFrame
poly_features_df = pd.DataFrame(poly_features, columns=poly.get_feature_names_out(X_train.columns))

# Combine with the original dataset
enhanced_data = pd.concat([X_train.reset_index(drop=True), poly_features_df], axis=1)

# Print the first few rows of the enhanced dataset
print(enhanced_data.head())

   Age  SystolicBP  DiastolicBP    BS  BodyTemp  HeartRate       Age  \
0   30         120           80   9.0     101.0         76  0.015869   
1   23         130           70   6.9      98.0         70 -0.507898   
2   32         120           90   6.9      98.0         70  0.165517   
3   31         120           60   6.1      98.0         76  0.090693   
4   35         100           60  15.0      98.0         80  0.389988   

   SystolicBP  DiastolicBP        BS  ...  DiastolicBP^2  DiastolicBP BS  \
0    0.359605     0.260182  0.068678  ...       0.067695        0.017869   
1    0.900380    -0.465180 -0.563601  ...       0.216392        0.262176   
2    0.359605     0.985544 -0.563601  ...       0.971297       -0.555454   
3    0.359605    -1.190541 -0.804469  ...       1.417389        0.957754   
4   -0.721944    -1.190541  1.875189  ...       1.417389       -2.232490   

   DiastolicBP BodyTemp  DiastolicBP HeartRate      BS^2  BS BodyTemp  \
0              0.430011              

In [29]:
# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

In [30]:
# Convert numpy arrays to pandas DataFrames
X_train_resampled = pd.DataFrame(X_resampled, columns=X.columns)
y_train_resampled = pd.Series(y_resampled, name='RiskLevel')

In [31]:
# Combine resampled training data with original test data
X_combined = pd.concat([X_train_resampled, pd.DataFrame(X_test_scaled, columns=X.columns)], ignore_index=True)
y_combined = pd.concat([y_train_resampled, y_test.reset_index(drop=True)], ignore_index=True)

combined_dataset = pd.concat([X_combined, y_combined], axis=1)

In [20]:
# Check the shape of the combined dataset
print(f'Original dataset shape: {X.shape}')
print(f'Resampled dataset shape: {X_resampled.shape}')
print(f'Combined dataset shape: {X_combined.shape}, {y_combined.shape}')

Original dataset shape: (1014, 6)
Resampled dataset shape: (978, 6)
Combined dataset shape: (1181, 6), (1181,)


In [32]:
print(combined_dataset.head())

        Age  SystolicBP  DiastolicBP        BS  BodyTemp  HeartRate  RiskLevel
0  0.015869    0.359605     0.260182  0.068678  1.652729   0.195418   mid risk
1 -0.507898    0.900380    -0.465180 -0.563601 -0.500350  -0.531017   mid risk
2  0.165517    0.359605     0.985544 -0.563601 -0.500350  -0.531017   mid risk
3  0.090693    0.359605    -1.190541 -0.804469 -0.500350   0.195418   mid risk
4  0.389988   -0.721944    -1.190541  1.875189 -0.500350   0.679707  high risk


In [22]:
print(combined_dataset.shape)

(1181, 7)


In [17]:
print(X_combined.head())
print(y_combined.head())

        Age  SystolicBP  DiastolicBP        BS  BodyTemp  HeartRate
0  0.015869    0.359605     0.260182  0.068678  1.652729   0.195418
1 -0.507898    0.900380    -0.465180 -0.563601 -0.500350  -0.531017
2  0.165517    0.359605     0.985544 -0.563601 -0.500350  -0.531017
3  0.090693    0.359605    -1.190541 -0.804469 -0.500350   0.195418
4  0.389988   -0.721944    -1.190541  1.875189 -0.500350   0.679707
0     mid risk
1     mid risk
2     mid risk
3     mid risk
4    high risk
Name: RiskLevel, dtype: object


In [23]:
combined_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1181 entries, 0 to 1180
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          1181 non-null   float64
 1   SystolicBP   1181 non-null   float64
 2   DiastolicBP  1181 non-null   float64
 3   BS           1181 non-null   float64
 4   BodyTemp     1181 non-null   float64
 5   HeartRate    1181 non-null   float64
 6   RiskLevel    1181 non-null   object 
dtypes: float64(6), object(1)
memory usage: 64.7+ KB


In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

In [25]:
# Load your dataset
data = pd.read_csv(r"C:\Users\DELL 7470\Desktop\PREG COMPLICATION\Maternal Risk Prediction\Maternal Health Risk Data Set.csv")

data.head()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,high risk
1,35,140,90,13.0,98.0,70,high risk
2,29,90,70,8.0,100.0,80,high risk
3,30,140,85,7.0,98.0,70,high risk
4,35,120,60,6.1,98.0,76,low risk
