## Import Libraries and data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from scipy import stats
from tqdm import tqdm

from imblearn.over_sampling import SMOTE, ADASYN

from utils import cap_values, categorize_value, calculate_risk_score

In [3]:
# Read the dataset from `df_cleaned.csv` in the working directory, then print
# its schema (columns, non-null counts, dtypes) as a quick sanity check.
df = pd.read_csv(filepath_or_buffer='df_cleaned.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  MentalHealth    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [4]:
# Load the categorical and numeric column-name lists from the `dumps` directory.
# Forward slashes are used so the paths resolve on Windows, Linux and macOS
# (a backslash is only a path separator on Windows).
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# this project produced itself.
with open('./dumps/categorical_columns.pkl', 'rb') as f:
    categorical_cols = pickle.load(f)

with open('./dumps/numeric_columns.pkl', 'rb') as f:
    numeric_cols = pickle.load(f)

In [5]:
# Cast every categorical column to string, one column at a time, so that
# integer-coded categories (FastingBS in the schema above) become object dtype.
for cat_col in categorical_cols:
    df[cat_col] = df[cat_col].astype(str)

# Re-print the schema to confirm the dtype change.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    object 
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  MentalHealth    918 non-null    int64  
dtypes: float64(1), int64(5), object(6)
memory usage: 86.2+ KB


### Split the data

In [7]:
# Target: the `MentalHealth` column. Features: every other column.
y = df['MentalHealth']
X = df.drop(columns=['MentalHealth'])

In [8]:
# Stratified 85/15 split with a fixed seed. Each of the four parts gets a fresh
# 0..n-1 index so later column-wise concatenations align row by row.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
)

print('train data shape', X_train.shape, y_train.shape)
print('test data shape', X_test.shape, y_test.shape)

train data shape (780, 11) (780,)
test data shape (138, 11) (138,)


In [9]:
# Persist the raw (not yet preprocessed) test split as a [features, target] pair.
# Forward slashes keep the path valid on Windows, Linux and macOS; the original
# backslash form only resolves to a nested directory on Windows.
with open('./data_for_model/test/test_data.pkl', 'wb') as f:
    pickle.dump([X_test, y_test], f)

## Preprocessing

### Numeric variables

In [12]:
# Replace each numeric training column with the result of `cap_values`.
# NOTE(review): `cap_values` lives in utils; presumably it clips outliers
# per feature, confirm against its implementation.
X_train = X_train.assign(
    **{num_col: cap_values(X_train[num_col], num_col) for num_col in numeric_cols}
)

### FE features

In [14]:
# Derive a categorical bin for every cholesterol reading via `categorize_value`
# (the training data yields the labels 'zero', 'low', 'medium', 'high').
X_train = X_train.assign(
    New_Cholesterol_Bin=X_train['Cholesterol'].map(categorize_value)
)

In [15]:
# Compute a per-row score by passing each training row to `calculate_risk_score`.
X_train = X_train.assign(
    New_Risk_Score=X_train.apply(calculate_risk_score, axis='columns')
)

In [16]:
# Names of the engineered features, split by type.
num_FE_features = ['New_Risk_Score']
cat_FE_features = ['New_Cholesterol_Bin']

# Persist both lists under `dumps`. Forward slashes keep the paths portable;
# the original backslash form only works on Windows.
with open('./dumps/num_FE_features.pkl', 'wb') as f:
    pickle.dump(num_FE_features, f)

with open('./dumps/cat_FE_features.pkl', 'wb') as f:
    pickle.dump(cat_FE_features, f)

### Categorical variables

In [18]:
# List the distinct values of every categorical column (original and engineered)
# to decide how each one should be encoded.
for cat_name in categorical_cols + cat_FE_features:
    distinct_values = X_train[cat_name].unique()
    print(cat_name, ":", distinct_values)

Sex : ['m' 'f']
ChestPainType : ['asy' 'nap' 'ata' 'ta']
FastingBS : ['1' '0']
RestingECG : ['lvh' 'normal' 'st']
ExerciseAngina : ['y' 'n']
ST_Slope : ['flat' 'up' 'down']
New_Cholesterol_Bin : ['high' 'medium' 'zero' 'low']


In [19]:
# Two-level columns are encoded with a 0/1 mapping; columns with more levels
# are one-hot encoded.
mapping_features = ['Sex', 'FastingBS', 'ExerciseAngina']
one_hot_features = ['ChestPainType', 'RestingECG', 'ST_Slope', 'New_Cholesterol_Bin']

# Persist both lists under `dumps`. Forward slashes keep the paths portable;
# the original backslash form only works on Windows.
with open('./dumps/mapping_features.pkl', 'wb') as f:
    pickle.dump(mapping_features, f)

with open('./dumps/one_hot_features.pkl', 'wb') as f:
    pickle.dump(one_hot_features, f)

**Mapping**

In [21]:
# One shared lookup table for all binary columns: the keys are the raw string
# levels seen in Sex, FastingBS and ExerciseAngina, the values their 0/1 codes.
mapping = {
    'm' : 1, 'f' : 0,
    '0' : 0, '1' : 1,
    'n' : 0, 'y' : 1
}

# Persist the table under `dumps`. Forward slashes keep the path portable;
# the original backslash form only works on Windows.
with open('./dumps/mapping.pkl', 'wb') as f:
    pickle.dump(mapping, f)

In [22]:
# Encode each binary column through the shared lookup table, then cast to int.
# A level missing from `mapping` becomes NaN and makes the int cast raise.
for binary_col in tqdm(mapping_features):
    coded = X_train[binary_col].map(mapping)
    X_train[binary_col] = coded.astype(int)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<?, ?it/s]


**One-hot encoding**

In [24]:
# One-hot encode the multi-level categorical features into dense 0/1 integer columns.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the current name first and fall back for older releases.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 only accepts `sparse`
    encoder = OneHotEncoder(sparse=False)

encoded_values = encoder.fit_transform(X_train[one_hot_features])

# Generated column names, e.g. '<feature>_<level>'; computed once and reused.
one_hot_encoder_feature_names = encoder.get_feature_names_out().tolist()

ohe_features_encoded = pd.DataFrame(encoded_values, columns=one_hot_encoder_feature_names)
ohe_features_encoded = ohe_features_encoded.astype(int)

# Persist the fitted encoder and its output column names under `dumps`.
# Forward slashes keep the paths portable; the original backslash form only
# works on Windows.
with open('./dumps/one_hot_encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)

with open('./dumps/one_hot_encoder_feature_names.pkl', 'wb') as f:
    pickle.dump(one_hot_encoder_feature_names, f)

### Final Features

In [26]:
# Assemble the model matrix: numeric + engineered numeric + binary-mapped
# columns, followed by the one-hot columns (rows align on the shared 0..n-1 index).
X_train = pd.concat(
    objs=[
        X_train[numeric_cols + num_FE_features + mapping_features],
        ohe_features_encoded,
    ],
    axis='columns',
)
X_train.shape, y_train.shape

((780, 23), (780,))

Imbalanced data

In [28]:
# The unbalanced, unscaled training set. These names are aliases for
# X_train / y_train, not copies.
X_tr_imb, y_tr_imb = X_train, y_train

# Persist as a [features, target] pair. Forward slashes keep the path portable;
# the original backslash form only works on Windows.
with open('./data_for_model/train/0_imb_data.pkl', 'wb') as f:
    pickle.dump([X_tr_imb, y_tr_imb], f)

In [29]:
# Record the final column order of the training matrix.
final_features = X_train.columns.tolist()

# Persist the list under `dumps`. Forward slashes keep the path portable;
# the original backslash form only works on Windows.
with open('./dumps/final_features.pkl', 'wb') as f:
    pickle.dump(final_features, f)

## Balancing the data

SMOTE

In [32]:
# Oversample the training set with SMOTE (fixed seed for reproducibility).
smote = SMOTE(random_state=42)
X_tr_smote, y_tr_smote = smote.fit_resample(X=X_train, y=y_train)

print('shape of X_resampled and y_resampled:', X_tr_smote.shape, y_tr_smote.shape)

# Class counts after resampling; the two classes should now be the same size.
y_tr_smote.value_counts()

shape of X_resampled and y_resampled: (864, 23) (864,)


MentalHealth
1    432
0    432
Name: count, dtype: int64

In [33]:
# Persist the SMOTE-balanced, unscaled training set as a [features, target] pair.
# Forward slashes keep the path portable; the original backslash form only
# works on Windows.
with open('./data_for_model/train/1_smote_data.pkl', 'wb') as f:
    pickle.dump([X_tr_smote, y_tr_smote], f)

## Scaling

In [35]:
# Columns to scale: the original numeric columns followed by the engineered ones.
num_cols = [*numeric_cols, *num_FE_features]

Imbalanced scaled data

In [37]:
# Standardise the numeric columns of the unbalanced training set; the binary
# and one-hot columns are passed through unchanged.
scaler = StandardScaler()

num_std = pd.DataFrame(scaler.fit_transform(X_train[num_cols]), columns=num_cols)

# Rows align on the shared 0..n-1 index; the column order matches X_train.
X_tr_std = pd.concat(
    [num_std, X_train[mapping_features], X_train[one_hot_encoder_feature_names]],
    axis=1,
)
y_tr_std = y_tr_imb

# Persist the fitted scaler and the scaled data. Forward slashes keep the paths
# portable; the original backslash form only works on Windows.
with open('./dumps/2_standardscaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

with open('./data_for_model/train/2_std_data.pkl', 'wb') as f:
    pickle.dump([X_tr_std, y_tr_std], f)

Balanced scaled data

In [39]:
# Fit a separate scaler on the SMOTE-balanced training set and standardise its
# numeric columns; the binary and one-hot columns are passed through unchanged.
scaler = StandardScaler()

num_std = pd.DataFrame(scaler.fit_transform(X_tr_smote[num_cols]), columns=num_cols)

X_tr_smote_std = pd.concat(
    [num_std, X_tr_smote[mapping_features], X_tr_smote[one_hot_encoder_feature_names]],
    axis=1,
)

# Persist the fitted scaler and the scaled data. Forward slashes keep the paths
# portable; the original backslash form only works on Windows.
with open('./dumps/3_smote_standardscaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

with open('./data_for_model/train/3_smote_std_data.pkl', 'wb') as f:
    pickle.dump([X_tr_smote_std, y_tr_smote], f)

## Final Feature Sets

In [41]:
# Bundle the four training variants (raw, SMOTE, scaled, SMOTE + scaled) in one
# dict, keyed by a label naming the [features, target] pair it holds.
train_feature_sets = {
    '[X_tr_imb, y_tr_imb]' : [X_tr_imb, y_tr_imb],
    '[X_tr_smote, y_tr_smote]' : [X_tr_smote, y_tr_smote],
    '[X_tr_std, y_tr_std]': [X_tr_std, y_tr_std],
    '[X_tr_smote_std, y_tr_smote]': [X_tr_smote_std, y_tr_smote]
}

# Persist the bundle under `dumps`. Forward slashes keep the path portable;
# the original backslash form only works on Windows.
with open('./dumps/train_feature_sets.pkl', 'wb') as f:
    pickle.dump(train_feature_sets, f)