In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nest-competition/676e54b2807db_usecase_2_test_gt_removed.csv
/kaggle/input/nest-competition/X_train_text2.npy
/kaggle/input/nest-competition/eligibilities.txt
/kaggle/input/nest-competition/facilities_cleaned.csv
/kaggle/input/nest-competition/text_features.npy
/kaggle/input/nest-competition/y_train.npy
/kaggle/input/nest-competition/eligibilities_cleaned.csv
/kaggle/input/nest-competition/y_train.pkl
/kaggle/input/nest-competition/y_test.npy
/kaggle/input/nest-competition/drop_withdrawals_cleaned.csv
/kaggle/input/nest-competition/X_train_combined (1).pkl
/kaggle/input/nest-competition/X_test_text.npy
/kaggle/input/nest-competition/X_test_num.npy
/kaggle/input/nest-competition/X_train_num.npy
/kaggle/input/nest-competition/facilities.txt
/kaggle/input/nest-competition/tfidf.npy
/kaggle/input/nest-competition/usecase_2_.csv
/kaggle/input/nest-competition/X_test_combined (1).pkl
/kaggle/input/nest-competition/y_test.pkl
/kaggle/input/nest-competition/drop_withdrawals.txt


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
tqdm.pandas()
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import gc

In [3]:
df = pd.read_csv('/kaggle/input/nest-competition/usecase_2_.csv')
# /kaggle/input/nest-competition/usecase_2_.csv
df = df.rename(columns={"NCT Number": "nct_id"})
facilities_cleaned = pd.read_csv('/kaggle/input/nest-competition/facilities_cleaned.csv')
eligibilities_cleaned = pd.read_csv('/kaggle/input/nest-competition/eligibilities_cleaned.csv')
drop_withdrawals_cleaned = pd.read_csv('/kaggle/input/nest-competition/drop_withdrawals_cleaned.csv')

## Preprocessing usecase_2 (main file)

In [4]:
df.drop_duplicates(keep="first", inplace=True)

merged_df = df.merge(facilities_cleaned, on='nct_id', how='left', suffixes=('', '_facilities'))
merged_df = merged_df.merge(eligibilities_cleaned, on='nct_id', how='left', suffixes=('', '_eligibilities'))
merged_df = merged_df.merge(drop_withdrawals_cleaned, on='nct_id', how='left', suffixes=('', '_withdrawals'))

columns_to_fill = ['total_dropouts', 'unique_reasons', 'unique_periods']
merged_df[columns_to_fill] = merged_df[columns_to_fill].fillna(0)

categories = ['ADULT', 'OLDER_ADULT', 'CHILD']

for category in categories:
    merged_df[category] = merged_df['Age'].str.contains(category, case=False, na=False).astype(int)

merged_df = merged_df.drop(['adult', 'child', 'older_adult', 'Age'], axis=1)

merged_df['gender_based'] = merged_df['gender_based'].fillna('f')
merged_df['Sex'] = merged_df['Sex'].fillna('ALL')
merged_df = merged_df.drop('gender', axis=1)

phases = ['EARLY_PHASE1', 'PHASE1', 'PHASE2', 'PHASE3', 'PHASE4']

for phase in phases:
    merged_df[phase] = merged_df['Phases'].str.contains(phase, case=False, na=False).astype(int)

merged_df['Enrollment'] = merged_df['Enrollment'].fillna(merged_df['Enrollment'].median())
merged_df = merged_df.drop('Locations', axis=1)

funder_type_encoded = pd.get_dummies(merged_df['Funder Type'], prefix='Funder_Type', drop_first=False)
merged_df = pd.concat([merged_df, funder_type_encoded], axis=1)
merged_df = merged_df.drop('Funder Type', axis=1)

merged_df['minimum_age'] = merged_df['minimum_age'].fillna('0 Minutes')
merged_df['maximum_age'] = merged_df['maximum_age'].fillna('200 Years')

merged_df['healthy_volunteers'] = merged_df['healthy_volunteers'].fillna('f')

def split_criteria(row):
    if pd.isna(row):
        return pd.Series([pd.NA, pd.NA])

    inclusion = exclusion = ""

    if "inclusion" in row.lower():
        inclusion_part = row.lower().split("inclusion", 1)[1]
        if "exclusion" in inclusion_part:
            inclusion = inclusion_part.split("exclusion", 1)[0].strip()
        else:
            inclusion = inclusion_part.strip()
    
    if "exclusion" in row.lower():
        exclusion_part = row.lower().split("exclusion", 1)[1]
        exclusion = exclusion_part.strip()

    inclusion = inclusion.replace("criteria:", "").strip()
    exclusion = exclusion.replace("criteria:", "").strip()

    return pd.Series([inclusion, exclusion])

merged_df[['inclusion_criteria', 'exclusion_criteria']] = merged_df['criteria'].apply(split_criteria)
merged_df = merged_df.drop(columns=['criteria'])

merged_df['unique_name_count'] = merged_df['unique_name_count'].fillna(merged_df['unique_name_count'].median())
merged_df['unique_city_count'] = merged_df['unique_city_count'].fillna(merged_df['unique_city_count'].median())
merged_df['unique_zip_count'] = merged_df['unique_zip_count'].fillna(merged_df['unique_zip_count'].median())
merged_df['unique_state_count'] = merged_df['unique_state_count'].fillna(merged_df['unique_state_count'].median())
merged_df['unique_country_count'] = merged_df['unique_country_count'].fillna(merged_df['unique_country_count'].median())

def bin_and_encode_all(df, columns, high_cutoff=0.7, medium_cutoff=0.9):
    def calculate_thresholds(value_counts, high_cutoff, medium_cutoff):
        total = value_counts.sum()
        cumulative_percentage = value_counts.cumsum() / total
        high_threshold = value_counts[cumulative_percentage <= high_cutoff].iloc[-1]
        medium_threshold = value_counts[cumulative_percentage <= medium_cutoff].iloc[-1]
        return high_threshold, medium_threshold

    for column in columns:
        counts = df[column].value_counts(dropna=False)  # Include NaN in counts
        high_threshold, medium_threshold = calculate_thresholds(counts, high_cutoff, medium_cutoff)

        def assign_bins(value):
            if pd.isna(value):
                return "Missing"  # Handle NaN explicitly
            count = counts.get(value, 0)
            if count > high_threshold:
                return "High_Freq"
            elif count >= medium_threshold:
                return "Medium_Freq"
            else:
                return "Low_Freq"

        df[f"{column}_bin"] = df[column].apply(assign_bins)

        one_hot = pd.get_dummies(df[f"{column}_bin"], prefix=f"{column}_Bin").astype(int)

        df = pd.concat([df, one_hot], axis=1).drop(f"{column}_bin", axis=1)

    df = df.drop(columns, axis=1)

    return df

columns_to_process = ['mode_country', 'mode_city', 'mode_state', 'mode_name']
merged_df = bin_and_encode_all(merged_df, columns_to_process)

boolean_columns = [col for col in merged_df.columns if merged_df[col].dtype == 'bool']

merged_df['Study Results'] = merged_df['Study Results'].map({'YES': 1, 'NO': 0})

merged_df['healthy_volunteers'] = merged_df['healthy_volunteers'].map({'t': 1, 'f': 0})

merged_df['gender_based'] = merged_df['gender_based'].map({'t': 1, 'f': 0})

sex_encoded = pd.get_dummies(merged_df['Sex'], prefix='Sex')
merged_df = pd.concat([merged_df, sex_encoded], axis=1)

for col in ['Sex_ALL', 'Sex_FEMALE', 'Sex_MALE']:
    if col in merged_df.columns:
        merged_df[col] = merged_df[col].astype(int)

for col in boolean_columns:
    merged_df[col] = merged_df[col].astype(int)

merged_df = merged_df.drop('Sex', axis=1)

In [5]:
data = merged_df.copy()
del merged_df

bin_features = ["Unnamed: 0", "Study URL", "Acronym", "Study Status", "Other IDs", "Study Documents"]
text_columns = ['Study Title', 'Brief Summary', 'Conditions',
       'Interventions', 'Primary Outcome Measures', 'exclusion_criteria',
       'Secondary Outcome Measures', 'Other Outcome Measures', 'Study Design', 'inclusion_criteria']

data = data.drop(bin_features, axis = 1)

data['Secondary Outcome Measures'].fillna('', inplace=True)
data['Other Outcome Measures'].fillna('', inplace=True)
data['inclusion_criteria'].fillna("", inplace=True)
data['exclusion_criteria'].fillna("", inplace=True)

pd.set_option('display.max_columns', None)

conversion_factors = {
    'Minute': 1,
    'Hour': 60,
    'Day': 1440,          # 24 hours × 60 minutes
    'Week': 10080,        # 7 days × 1440 minutes
    'Month': 43800,       # Assuming 30.44 days per month on average
    'Year': 525600        # 365.25 days × 1440 minutes
}

import re

def convert_to_minutes(time_str):
    match = re.match(r"(\d+)\s*(\w+)", time_str)  # Extract number and unit
    if match:
        value, unit = int(match.group(1)), match.group(2).capitalize().rstrip('s')
        return value * conversion_factors.get(unit, 1)  # Default to 1 if unit not found
    return 0

data['minimum_age_min'] = data['minimum_age'].apply(convert_to_minutes)
data['maximum_age_min'] = data['maximum_age'].apply(convert_to_minutes)

data['minimum_age_min'] = np.log1p(data['minimum_age_min'])
data['maximum_age_min'] = np.log1p(data['minimum_age_min'])

numerical_columns = ["Study Results", "Enrollment", 'unique_name_count',
       'unique_state_count', 'unique_zip_count', 'unique_city_count',
       'unique_country_count', 'minimum_age_min', 'maximum_age_min',
        'healthy_volunteers', 'gender_based', 'total_dropouts',
       'unique_reasons', 'unique_periods', 'ADULT', 'OLDER_ADULT', 'CHILD', 'EARLY_PHASE1', 'PHASE1', 'PHASE2',
       'PHASE3', 'PHASE4', 'Funder_Type_FED', 'Funder_Type_INDIV',
       'Funder_Type_INDUSTRY', 'Funder_Type_NETWORK', 'Funder_Type_NIH',
       'Funder_Type_OTHER', 'Funder_Type_OTHER_GOV', 'Funder_Type_UNKNOWN',
        'mode_country_Bin_High_Freq', 'mode_country_Bin_Low_Freq',
       'mode_country_Bin_Medium_Freq', 'mode_country_Bin_Missing',
       'mode_city_Bin_High_Freq', 'mode_city_Bin_Low_Freq',
       'mode_city_Bin_Medium_Freq', 'mode_city_Bin_Missing',
       'mode_state_Bin_High_Freq', 'mode_state_Bin_Low_Freq',
       'mode_state_Bin_Medium_Freq', 'mode_state_Bin_Missing',
       'mode_name_Bin_High_Freq', 'mode_name_Bin_Medium_Freq',
       'mode_name_Bin_Missing', 'Sex_ALL', 'Sex_FEMALE', 'Sex_MALE' ]
scaler = StandardScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Secondary Outcome Measures'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Other Outcome Measures'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which

## Textual Feature Engineering

In [6]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/

In [7]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text_for_tfidf(text):
    # Rigorous preprocessing for TF-IDF
    if isinstance(text, str):
        text = text.lower()
        text = ''.join(ch for ch in text if ch not in string.punctuation)  # Remove punctuation
        tokens = text.split()
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  # Lemmatize and remove stopwords
        return ' '.join(tokens)
    return ''

tfidf_columns = ['Conditions', 'Interventions', 'Secondary Outcome Measures', 'Other Outcome Measures', 'inclusion_criteria']
for col in tfidf_columns:
    data[col] = data[col].apply(preprocess_text_for_tfidf)

# Define TF-IDF vectorizer with limited max features
tfidf_vectorizers = {
    col: TfidfVectorizer(max_features=500, stop_words='english')
    for col in ['Conditions', 'Interventions', 'Secondary Outcome Measures', 'Other Outcome Measures', 'inclusion_criteria']
}

# Fit and transform TF-IDF features
tfidf_features = {}
for col, vectorizer in tfidf_vectorizers.items():
    tfidf_features[col] = vectorizer.fit_transform(data[col])

gc.collect()
# Combine TF-IDF vectors into a single sparse matrix
tfidf_combined = hstack([tfidf_features[col] for col in tfidf_features])

In [8]:
del df, facilities_cleaned, eligibilities_cleaned, drop_withdrawals_cleaned
gc.collect()

0

In [12]:
data.shape

(68960, 70)