In [98]:
import numpy as np
import pandas as pd

In [99]:
df = pd.read_excel('DataSet/FinalData.xlsx')
df.shape

(102, 161)

In [100]:
num_col = df._get_numeric_data().columns.values
num_col

array(['No. of ANC visits', 'Measured_height',
       'Weight in 1st trimester (kg)', 'BMI in 1st trimester',
       'Weight in kg', 'SFH/Fundal height', 'FHS (beats /min)',
       'RBS (mg/dl)', 'Glyco HB', 'Hb', 'TC', 'Platelets', 'TSH',
       'Weight in kg.1', 'SFH/Fundal height.1', 'FHS', 'Weight in kg.2',
       'SFH/Fundal height.2', 'FHS.1', 'Birth weight (grams)', 'POG_y',
       'ACCELERATIONS', 'BASELINE', 'BASELINE_MIN', 'BASELINE_MAX',
       'MINUTES_LOW', 'MINUTES_HIGH', 'AVERAGE_VAR_BPM', 'AVERAGE_VAR_MS',
       'STV_BPM', 'STV_MS', 'DECELS', 'LATE', 'SEV_VAR', 'MOD_VAR',
       'PROLONGED', 'DECEL20_100', 'DECEL100', 'SIGLOSS', 'FMP_COUNT',
       'FMP_AVG_DURATION', 'FMP_FEATURE_DURATION', 'FMP_FEATURE_PERCENT',
       'TOCO_CONTRACTIONS', 'TOCO_CTR_FREQ', 'TOCO_DURATION',
       'TOCO_INTENSITY', 'TOCO_REST_TONE', 'TOCO_RELAXATION'],
      dtype=object)

Select columns having strings

In [101]:
str_col = df
for st in num_col:
  str_col = str_col.drop(columns=[st])

In [102]:
str_col = str_col.drop(columns=['Preterm','Sub. No.','Total weight gain in pregnancy','APGAR score at 1 min','APGAR score at 5 mins'])
str_col.shape

(102, 107)

In [103]:
str_col = str_col.replace(np.nan,'ND')

Embedding the strings

In [104]:
import torch
from transformers import AutoTokenizer, AutoModel

In [105]:
# Load the pre-trained DistilBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModel.from_pretrained('distilbert-base-uncased')

In [106]:
emb_data = pd.DataFrame()
embeddings = []
col = ''
for column in str_col:
  for values in str_col[column]:
    input_ids = torch.tensor(tokenizer.encode(values)).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids)
        embedding = outputs[0][:, 0, :].numpy().T
        embeddings.append(embedding)
  embed_arr = np.array(embeddings)
  embed_arr = embed_arr.reshape(-1,768)
  fea_emb = pd.DataFrame(embed_arr)
  emb_data = pd.merge(emb_data,fea_emb,left_index=True,right_index=True, how='outer',suffixes=(f'_{col}', ''))
  col = column
  embeddings.clear()

In [107]:
emb_data.shape

(102, 82176)

Apply TSNE

In [108]:
from sklearn.manifold import TSNE

emb_data.columns = emb_data.columns.astype(str)
tsne = TSNE(n_components=2, perplexity=5, random_state=42)
reduced_embeddings = tsne.fit_transform(emb_data)

In [109]:
reduced_embeddings.shape

(102, 2)

Merge reduced embeddings and numeric colmuns

In [110]:
n_col = df._get_numeric_data()
e_col = pd.DataFrame(reduced_embeddings)
X = pd.merge(n_col,e_col,left_index=True,right_index=True, how='outer')
y = df['Preterm']

In [111]:
X.columns = X.columns.astype(str)
y = y.replace('No',1)
y = y.replace('Yes',0)

In [112]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer = imputer.fit(X)
X1 = imputer.transform(X)

In [113]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size = 0.2, random_state = 30)

In [114]:
from sklearn.preprocessing import StandardScaler

sc_x = StandardScaler()
X_train = sc_x.fit_transform(X_train)
X_test = sc_x.transform(X_test)

In [115]:
from lazypredict.Supervised import LazyClassifier

# Initialize the Lazypredict library
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)

# Fit all classification algorithms on training dataset
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

# Calculate performance of all models on test dataset
model_dictionary = clf.provide_models(X_train,X_test,y_train,y_test)
models

100%|██████████| 29/29 [00:00<00:00, 37.33it/s]

[LightGBM] [Info] Number of positive: 71, number of negative: 10
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1010
[LightGBM] [Info] Number of data points in the train set: 81, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.876543 -> initscore=1.960095
[LightGBM] [Info] Start training from score 1.960095





Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RidgeClassifier,0.86,0.78,0.78,0.87,0.01
NearestCentroid,0.81,0.75,0.75,0.83,0.01
AdaBoostClassifier,0.71,0.69,0.69,0.75,0.13
Perceptron,0.71,0.69,0.69,0.75,0.01
XGBClassifier,0.9,0.67,0.67,0.88,0.03
LGBMClassifier,0.9,0.67,0.67,0.88,0.02
RidgeClassifierCV,0.86,0.64,0.64,0.84,0.01
LinearDiscriminantAnalysis,0.81,0.61,0.61,0.81,0.01
BernoulliNB,0.81,0.61,0.61,0.81,0.01
LogisticRegression,0.76,0.58,0.58,0.78,0.01
