### Import Required Libraries

In [None]:
import pandas as pd
import numpy as np

# For encoding categorical variables
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# For scaling features 
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# For dimensionality reduction 
from sklearn.decomposition import PCA

# For handling missing data 
from sklearn.impute import SimpleImputer

In [3]:
# Load the processed trials CSV into the working DataFrame
merged_trials_path = '../data/processed/merged_cleaned_trials.csv'
df = pd.read_csv(merged_trials_path)

In [4]:
# Check data types: list every column's dtype to see which columns are still
# stored as object (text) and which are already numeric or boolean.
df.dtypes

NCT Number                          object
study_title                         object
study_status                         int64
brief_summary                       object
conditions                          object
interventions                       object
primary_outcome_measures            object
secondary_outcome_measures          object
sponsor                             object
enrollment                           int64
study_type                          object
study_design                        object
last_update_posted                  object
locations                           object
study_duration_days                float64
sex_all                               bool
sex_female                            bool
sex_male                              bool
has_child                            int64
has_adult                            int64
has_older_adult                      int64
phase1                               int64
phase2                               int64
phase3     

In [5]:
# Preview up to ten distinct values from every object-dtype column
object_columns = df.select_dtypes(include='object').columns
for column_name in object_columns:
    sample_values = df[column_name].unique()[:10]
    print(f"\nColumn: {column_name}")
    print(sample_values)


Column: NCT Number
['NCT06767735' 'NCT06767748' 'NCT06767761' 'NCT06305286' 'NCT06727721'
 'NCT06946628' 'NCT06762314' 'NCT06619301' 'NCT06847178' 'NCT06430125']

Column: study_title
['A Phase III Clinical Study to Assess the Efficacy and Safety of GZR4 in Insulin-naive Subjects with Type 2 Diabetes Mellitus (T2DM)'
 'A Phase III Clinical Study to Assess the Efficacy and Safety of GZR4 in Subjects With Type 2 Diabetes Mellitus Treated With Basal Insulin'
 'A Phase III Clinical Study to Assess the Efficacy and Safety of GZR4 in Type 2 Diabetes Mellitus Subjects (T2DM) Treated with Basal + Prandial Insulin'
 'Safety, Tolerability, and Efficacy of Immunomodulation With A Monoclonal Antibody Against CD40L in Combination With Transplanted Islet Cells in Adults With Brittle Type 1 Diabetes Mellitus (T1D)'
 'Safety and Efficacy of OCN19-overexpressed Human Umbilical Cord-derived Mesenchymal Stem Cells in the Treatment of Refractory Type 2 Diabetes Mellitus'
 'New Triple Therapy in Newly Diag

In [6]:
# Drop the trial identifier, title, summary and last-update columns
columns_to_remove = ['NCT Number', 'study_title', 'brief_summary', 'last_update_posted']
df = df.drop(columns=columns_to_remove)

In [7]:
# Missing-value count per column, largest first
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

study_status                       0
has_older_adult                    0
missing_primary_completion_date    0
missing_start_date                 0
funder_unknown                     0
funder_other_gov                   0
funder_other                       0
funder_nih                         0
funder_network                     0
funder_industry                    0
funder_indiv                       0
funder_fed                         0
phase3                             0
phase2                             0
phase1                             0
has_adult                          0
conditions                         0
has_child                          0
sex_male                           0
sex_female                         0
sex_all                            0
study_duration_days                0
locations                          0
study_design                       0
study_type                         0
enrollment                         0
sponsor                            0
s

In [8]:
# Integer-encode the categorical columns. A separate LabelEncoder is fitted per
# column and kept in `label_encoders`, so each column's code -> label mapping
# stays available (via `label_encoders[col].classes_` or `.inverse_transform`)
# instead of being lost when a single encoder is refit on the next column.
# Note: LabelEncoder numbers the classes in sorted order, so the resulting
# integers do not express any meaningful ranking of the categories.
label_encoders = {}
for col in ['study_type', 'study_design', 'sponsor']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [9]:
# A cell only displays its last expression, so the distinct count is printed
# explicitly instead of being computed and silently discarded.
print(f"Distinct sponsor values: {df['sponsor'].nunique()}")

# Frequency of each sponsor value, most common first
df['sponsor'].value_counts()

sponsor
617     170
1098    109
1739    106
1242     93
1158     85
       ... 
691       1
1826      1
1854      1
1425      1
236       1
Name: count, Length: 2218, dtype: int64

In [11]:
# Reduce cardinality: keep only the most frequent sponsors and pool the rest.
TOP_N_SPONSORS = 10
top_sponsors = df['sponsor'].value_counts().nlargest(TOP_N_SPONSORS).index

# Cast to string first so the kept values and the 'Other' label share one
# dtype, then replace every sponsor outside the top group in a single
# vectorized step (instead of a row-by-row Python lambda).
is_top_sponsor = df['sponsor'].isin(top_sponsors)
df['sponsor'] = df['sponsor'].astype(str).where(is_top_sponsor, 'Other')

# Encode the remaining categories (top sponsors + 'Other') as integers.
# LabelEncoder is already imported in the notebook's first cell; the import is
# repeated here only so this cell can be run on its own.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['sponsor'] = le.fit_transform(df['sponsor'])

In [12]:
# A cell only displays its last expression, so the distinct count is printed
# explicitly instead of being computed and silently discarded.
print(f"Distinct sponsor values: {df['sponsor'].nunique()}")

# Frequency of each sponsor value, most common first
df['sponsor'].value_counts()

sponsor
9    5880
8     170
0     109
5     106
3      93
1      85
7      66
4      65
2      54
6      48
Name: count, dtype: int64

In [13]:
# Count the rows for each study_status value to see how evenly the two
# values (0 and 1) are represented.
df['study_status'].value_counts()

study_status
1    5147
0    1529
Name: count, dtype: int64

In [15]:
# Re-check the dtypes after encoding: sponsor, study_type and study_design
# should now be integer columns; the remaining object columns are free text.
df.dtypes

study_status                         int64
conditions                          object
interventions                       object
primary_outcome_measures            object
secondary_outcome_measures          object
sponsor                              int64
enrollment                           int64
study_type                           int64
study_design                         int64
locations                           object
study_duration_days                float64
sex_all                               bool
sex_female                            bool
sex_male                              bool
has_child                            int64
has_adult                            int64
has_older_adult                      int64
phase1                               int64
phase2                               int64
phase3                               int64
funder_fed                            bool
funder_indiv                          bool
funder_industry                       bool
funder_netw

In [None]:
# Dropping the high-cardinality text columns that are not used right now.
# The copy is taken first so `df` itself is left untouched.
unused_text_columns = [
    'conditions',
    'interventions',
    'primary_outcome_measures',
    'secondary_outcome_measures',
    'locations',
]
df_model = df.copy().drop(columns=unused_text_columns)

In [18]:
# Write the model-ready frame to disk, leaving out the row index
model_output_path = '../data/processed/df_model.csv'
df_model.to_csv(model_output_path, index=False)

In [17]:
# Confirm that no object-dtype columns remain in the frame used for modelling.
df_model.dtypes

study_status                         int64
sponsor                              int64
enrollment                           int64
study_type                           int64
study_design                         int64
study_duration_days                float64
sex_all                               bool
sex_female                            bool
sex_male                              bool
has_child                            int64
has_adult                            int64
has_older_adult                      int64
phase1                               int64
phase2                               int64
phase3                               int64
funder_fed                            bool
funder_indiv                          bool
funder_industry                       bool
funder_network                        bool
funder_nih                            bool
funder_other                          bool
funder_other_gov                      bool
funder_unknown                        bool
missing_sta