In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
tqdm.pandas()
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Preprocessing

In [4]:
df = pd.read_csv('/kaggle/input/nest-competition/usecase_2_.csv')
df = df.rename(columns={"NCT Number": "nct_id"})
facilities_cleaned = pd.read_csv('/kaggle/input/nest-competition/facilities_cleaned.csv')
eligibilities_cleaned = pd.read_csv('/kaggle/input/nest-competition/eligibilities_cleaned.csv')
drop_withdrawals_cleaned = pd.read_csv('/kaggle/input/nest-competition/drop_withdrawals_cleaned.csv')

In [6]:
df.drop_duplicates(keep="first", inplace=True)

merged_df = df.merge(facilities_cleaned, on='nct_id', how='left', suffixes=('', '_facilities'))
merged_df = merged_df.merge(eligibilities_cleaned, on='nct_id', how='left', suffixes=('', '_eligibilities'))
merged_df = merged_df.merge(drop_withdrawals_cleaned, on='nct_id', how='left', suffixes=('', '_withdrawals'))

columns_to_fill = ['total_dropouts', 'unique_reasons', 'unique_periods']
merged_df[columns_to_fill] = merged_df[columns_to_fill].fillna(0)

categories = ['ADULT', 'OLDER_ADULT', 'CHILD']

for category in categories:
    merged_df[category] = merged_df['Age'].str.contains(category, case=False, na=False).astype(int)

merged_df = merged_df.drop(['adult', 'child', 'older_adult', 'Age'], axis=1)

merged_df['gender_based'] = merged_df['gender_based'].fillna('f')
merged_df['Sex'] = merged_df['Sex'].fillna('ALL')
merged_df = merged_df.drop('gender', axis=1)

phases = ['EARLY_PHASE1', 'PHASE1', 'PHASE2', 'PHASE3', 'PHASE4']

for phase in phases:
    merged_df[phase] = merged_df['Phases'].str.contains(phase, case=False, na=False).astype(int)

merged_df['Enrollment'] = merged_df['Enrollment'].fillna(merged_df['Enrollment'].median())
merged_df = merged_df.drop('Locations', axis=1)

funder_type_encoded = pd.get_dummies(merged_df['Funder Type'], prefix='Funder_Type', drop_first=False)
merged_df = pd.concat([merged_df, funder_type_encoded], axis=1)
merged_df = merged_df.drop('Funder Type', axis=1)

merged_df['minimum_age'] = merged_df['minimum_age'].fillna('0 Minutes')
merged_df['maximum_age'] = merged_df['maximum_age'].fillna('200 Years')

merged_df['healthy_volunteers'] = merged_df['healthy_volunteers'].fillna('f')

In [7]:
def split_criteria(row):
    if pd.isna(row):
        return pd.Series([pd.NA, pd.NA])

    inclusion = exclusion = ""

    if "inclusion" in row.lower():
        inclusion_part = row.lower().split("inclusion", 1)[1]
        if "exclusion" in inclusion_part:
            inclusion = inclusion_part.split("exclusion", 1)[0].strip()
        else:
            inclusion = inclusion_part.strip()
    
    if "exclusion" in row.lower():
        exclusion_part = row.lower().split("exclusion", 1)[1]
        exclusion = exclusion_part.strip()

    inclusion = inclusion.replace("criteria:", "").strip()
    exclusion = exclusion.replace("criteria:", "").strip()

    return pd.Series([inclusion, exclusion])

merged_df[['inclusion_criteria', 'exclusion_criteria']] = merged_df['criteria'].apply(split_criteria)
merged_df = merged_df.drop(columns=['criteria'])

In [8]:
merged_df['unique_name_count'] = merged_df['unique_name_count'].fillna(merged_df['unique_name_count'].median())
merged_df['unique_city_count'] = merged_df['unique_city_count'].fillna(merged_df['unique_city_count'].median())
merged_df['unique_zip_count'] = merged_df['unique_zip_count'].fillna(merged_df['unique_zip_count'].median())
merged_df['unique_state_count'] = merged_df['unique_state_count'].fillna(merged_df['unique_state_count'].median())
merged_df['unique_country_count'] = merged_df['unique_country_count'].fillna(merged_df['unique_country_count'].median())

def bin_and_encode_all(df, columns, high_cutoff=0.7, medium_cutoff=0.9):
    def calculate_thresholds(value_counts, high_cutoff, medium_cutoff):
        total = value_counts.sum()
        cumulative_percentage = value_counts.cumsum() / total
        high_threshold = value_counts[cumulative_percentage <= high_cutoff].iloc[-1]
        medium_threshold = value_counts[cumulative_percentage <= medium_cutoff].iloc[-1]
        return high_threshold, medium_threshold

    for column in columns:
        counts = df[column].value_counts(dropna=False)  # Include NaN in counts
        high_threshold, medium_threshold = calculate_thresholds(counts, high_cutoff, medium_cutoff)

        def assign_bins(value):
            if pd.isna(value):
                return "Missing"  # Handle NaN explicitly
            count = counts.get(value, 0)
            if count > high_threshold:
                return "High_Freq"
            elif count >= medium_threshold:
                return "Medium_Freq"
            else:
                return "Low_Freq"

        df[f"{column}_bin"] = df[column].apply(assign_bins)

        one_hot = pd.get_dummies(df[f"{column}_bin"], prefix=f"{column}_Bin").astype(int)

        df = pd.concat([df, one_hot], axis=1).drop(f"{column}_bin", axis=1)

    df = df.drop(columns, axis=1)

    return df

columns_to_process = ['mode_country', 'mode_city', 'mode_state', 'mode_name']
merged_df = bin_and_encode_all(merged_df, columns_to_process)

boolean_columns = [col for col in merged_df.columns if merged_df[col].dtype == 'bool']

merged_df['Study Results'] = merged_df['Study Results'].map({'YES': 1, 'NO': 0})

merged_df['healthy_volunteers'] = merged_df['healthy_volunteers'].map({'t': 1, 'f': 0})

merged_df['gender_based'] = merged_df['gender_based'].map({'t': 1, 'f': 0})

sex_encoded = pd.get_dummies(merged_df['Sex'], prefix='Sex')
merged_df = pd.concat([merged_df, sex_encoded], axis=1)

for col in ['Sex_ALL', 'Sex_FEMALE', 'Sex_MALE']:
    if col in merged_df.columns:
        merged_df[col] = merged_df[col].astype(int)

for col in boolean_columns:
    merged_df[col] = merged_df[col].astype(int)

merged_df = merged_df.drop('Sex', axis=1)


In [9]:
data = merged_df.copy()
del merged_df

bin_features = ["Unnamed: 0", "Study URL", "Acronym", "Study Status", "Other IDs", "Study Documents"]
text_columns = ['Study Title', 'Brief Summary', 'Conditions',
       'Interventions', 'Primary Outcome Measures', 'exclusion_criteria',
       'Secondary Outcome Measures', 'Other Outcome Measures', 'Study Design', 'inclusion_criteria']

data = data.drop(bin_features, axis = 1)

data['Secondary Outcome Measures'].fillna('', inplace=True)
data['Other Outcome Measures'].fillna('', inplace=True)
data['inclusion_criteria'].fillna("", inplace=True)
data['exclusion_criteria'].fillna("", inplace=True)


In [None]:
# def preprocess_text(text):
#     if isinstance(text, str):
#         return text.lower().strip()
#     return ''

# for col in text_columns:
#     data[col] = data[col].apply(preprocess_text)

In [7]:
pd.set_option('display.max_columns', None)

data.head()

Unnamed: 0,nct_id,Study Title,Brief Summary,Study Results,Conditions,Interventions,Primary Outcome Measures,Secondary Outcome Measures,Other Outcome Measures,Sponsor,Collaborators,Phases,Enrollment,Study Type,Study Design,Time taken for Enrollment,mode_zip,unique_name_count,unique_state_count,unique_zip_count,unique_city_count,unique_country_count,id,minimum_age,maximum_age,healthy_volunteers,gender_based,total_dropouts,unique_reasons,unique_periods,reason_counts,period_counts,ADULT,OLDER_ADULT,CHILD,EARLY_PHASE1,PHASE1,PHASE2,PHASE3,PHASE4,Funder_Type_FED,Funder_Type_INDIV,Funder_Type_INDUSTRY,Funder_Type_NETWORK,Funder_Type_NIH,Funder_Type_OTHER,Funder_Type_OTHER_GOV,Funder_Type_UNKNOWN,inclusion_criteria,exclusion_criteria,mode_country_Bin_High_Freq,mode_country_Bin_Low_Freq,mode_country_Bin_Medium_Freq,mode_country_Bin_Missing,mode_city_Bin_High_Freq,mode_city_Bin_Low_Freq,mode_city_Bin_Medium_Freq,mode_city_Bin_Missing,mode_state_Bin_High_Freq,mode_state_Bin_Low_Freq,mode_state_Bin_Medium_Freq,mode_state_Bin_Missing,mode_name_Bin_High_Freq,mode_name_Bin_Medium_Freq,mode_name_Bin_Missing,Sex_ALL,Sex_FEMALE,Sex_MALE
0,NCT04841499,Effects of a Seven-day BASIS™ Supplementation ...,The purpose of this study is to determine whet...,0,Menopause,DRUG: BASIS™ (Crystalline Nicotinamide Ribosid...,"Production of Estradiol, To determine whether ...",,,University of South Alabama,"Elysium Health, Inc.",,40.0,INTERVENTIONAL,Allocation: NA|Intervention Model: SINGLE_GROU...,3,36608,1.0,1.0,1.0,1.0,1.0,6329568.0,35 Years,200 Years,1,1,0.0,0.0,0.0,,,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,"~* 35 years of age or older~* pre, peri or pos...",~* less than 35 years of age~* hormone replace...,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0
1,NCT03020641,Peritoneal Damage in Laparoscopic Surgery,The investigators hypothesized that applying a...,1,Peritoneal Damage,PROCEDURE: Low pressure pneumoperitoneum|PROCE...,"Inflammatory Peritoneal Markers, logaritmic le...",,,Fundacion para la Investigacion Biomedica del ...,,,100.0,INTERVENTIONAL,Allocation: RANDOMIZED|Intervention Model: PAR...,27,28040,1.0,1.0,1.0,1.0,1.0,6080513.0,18 Years,200 Years,0,0,5.0,3.0,1.0,"{'Converted to open surgery': 3, 'Lost to Foll...",{'Overall Study': 5},1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,"~* patients older than 18 years, signed inform...",~* emergency surgery.~* previous surgery at su...,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0
2,NCT03727620,Doxycycline in the Treatment of Aggressive Per...,The aim of the study was to compare the clinic...,0,Aggressive Periodontitis,DRUG: amoxicillin plus metronidazole|DRUG: Dox...,"Decrease of periodontal pockets ≥ 4mm, • Probi...","Plaque index decrease, Plaque index was assess...",,Mohammed V Souissi University,,PHASE1|PHASE2,24.0,INTERVENTIONAL,Allocation: NON_RANDOMIZED|Intervention Model:...,5,,1.0,1.0,1.0,1.0,1.0,6054748.0,16 Years,36 Years,1,0,0.0,0.0,0.0,,,1,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,"~* subjects in good general health,~* presence...","~* pregnancy,~* lactation,~* and smoking.",0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,1,0,0
3,NCT03162926,A Safety and Tolerability Study of VC-02™ Comb...,The purpose of this trial is to test if VC-02™...,0,Type 1 Diabetes Mellitus,COMBINATION_PRODUCT: VC-02 Combination Product...,Incidence of all adverse events reported for s...,,,ViaCyte,,PHASE1,3.0,INTERVENTIONAL,Allocation: NA|Intervention Model: SINGLE_GROU...,7,T6G 1Z2,1.0,1.0,1.0,1.0,1.0,6112439.0,18 Years,65 Years,0,0,0.0,0.0,0.0,,,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,~* men and non-pregnant women of non-childbear...,"~* history of islet cell, kidney, and/or pancr...",1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0
4,NCT04434313,Treatment of Hemiparetic Gait Impairments Usin...,The objective of this research is to investiga...,0,"Telemedicine|Gait, Hemiplegic|Gait Disorders, ...",DEVICE: Delivery of iStride™ device gait treat...,Feasibility of safely implementing the treatme...,"Feasibility of screening criteria, To enroll p...",,"Moterum Technologies, Inc.",,,6.0,INTERVENTIONAL,Allocation: NA|Intervention Model: SINGLE_GROU...,8,84112,1.0,1.0,1.0,1.0,1.0,5918890.0,21 Years,80 Years,0,0,0.0,0.0,0.0,,,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,~* age 21-80~* caregiver support. the caregive...,~* uncontrolled seizures~* metal implants (ste...,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68960 entries, 0 to 68959
Data columns (total 68 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   nct_id                        68960 non-null  object 
 1   Study Title                   68960 non-null  object 
 2   Brief Summary                 68960 non-null  object 
 3   Study Results                 68960 non-null  int64  
 4   Conditions                    68960 non-null  object 
 5   Interventions                 68956 non-null  object 
 6   Primary Outcome Measures      68960 non-null  object 
 7   Secondary Outcome Measures    68960 non-null  object 
 8   Other Outcome Measures        68960 non-null  object 
 9   Sponsor                       68960 non-null  object 
 10  Collaborators                 21378 non-null  object 
 11  Phases                        25577 non-null  object 
 12  Enrollment                    68960 non-null  float64
 13  S

In [10]:
conversion_factors = {
    'Minute': 1,
    'Hour': 60,
    'Day': 1440,          # 24 hours × 60 minutes
    'Week': 10080,        # 7 days × 1440 minutes
    'Month': 43800,       # Assuming 30.44 days per month on average
    'Year': 525600        # 365.25 days × 1440 minutes
}

import re

def convert_to_minutes(time_str):
    match = re.match(r"(\d+)\s*(\w+)", time_str)  # Extract number and unit
    if match:
        value, unit = int(match.group(1)), match.group(2).capitalize().rstrip('s')
        return value * conversion_factors.get(unit, 1)  # Default to 1 if unit not found
    return 0

data['minimum_age_min'] = data['minimum_age'].apply(convert_to_minutes)
data['maximum_age_min'] = data['maximum_age'].apply(convert_to_minutes)

In [11]:
data['minimum_age_min'] = np.log1p(data['minimum_age_min'])
data['maximum_age_min'] = np.log1p(data['minimum_age_min'])

In [13]:
data.columns

Index(['nct_id', 'Study Title', 'Brief Summary', 'Study Results', 'Conditions',
       'Interventions', 'Primary Outcome Measures',
       'Secondary Outcome Measures', 'Other Outcome Measures', 'Sponsor',
       'Collaborators', 'Phases', 'Enrollment', 'Study Type', 'Study Design',
       'Time taken for Enrollment', 'mode_zip', 'unique_name_count',
       'unique_state_count', 'unique_zip_count', 'unique_city_count',
       'unique_country_count', 'id', 'minimum_age', 'maximum_age',
       'healthy_volunteers', 'gender_based', 'total_dropouts',
       'unique_reasons', 'unique_periods', 'reason_counts', 'period_counts',
       'ADULT', 'OLDER_ADULT', 'CHILD', 'EARLY_PHASE1', 'PHASE1', 'PHASE2',
       'PHASE3', 'PHASE4', 'Funder_Type_FED', 'Funder_Type_INDIV',
       'Funder_Type_INDUSTRY', 'Funder_Type_NETWORK', 'Funder_Type_NIH',
       'Funder_Type_OTHER', 'Funder_Type_OTHER_GOV', 'Funder_Type_UNKNOWN',
       'inclusion_criteria', 'exclusion_criteria',
       'mode_country_Bin_Hi

In [12]:
numerical_columns = ["Study Results", "Enrollment", 'unique_name_count',
       'unique_state_count', 'unique_zip_count', 'unique_city_count',
       'unique_country_count', 'minimum_age_min', 'maximum_age_min',
        'healthy_volunteers', 'gender_based', 'total_dropouts',
       'unique_reasons', 'unique_periods', 'ADULT', 'OLDER_ADULT', 'CHILD', 'EARLY_PHASE1', 'PHASE1', 'PHASE2',
       'PHASE3', 'PHASE4', 'Funder_Type_FED', 'Funder_Type_INDIV',
       'Funder_Type_INDUSTRY', 'Funder_Type_NETWORK', 'Funder_Type_NIH',
       'Funder_Type_OTHER', 'Funder_Type_OTHER_GOV', 'Funder_Type_UNKNOWN',
        'mode_country_Bin_High_Freq', 'mode_country_Bin_Low_Freq',
       'mode_country_Bin_Medium_Freq', 'mode_country_Bin_Missing',
       'mode_city_Bin_High_Freq', 'mode_city_Bin_Low_Freq',
       'mode_city_Bin_Medium_Freq', 'mode_city_Bin_Missing',
       'mode_state_Bin_High_Freq', 'mode_state_Bin_Low_Freq',
       'mode_state_Bin_Medium_Freq', 'mode_state_Bin_Missing',
       'mode_name_Bin_High_Freq', 'mode_name_Bin_Medium_Freq',
       'mode_name_Bin_Missing', 'Sex_ALL', 'Sex_FEMALE', 'Sex_MALE' ]
scaler = StandardScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

## Textual Feature Engineering

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1").to(device) 

def embed_text(text):
    tokens = tokenizer(text, return_tensors="pt", truncation=True, max_length=256, padding="max_length").to(device)
    with torch.no_grad():
        outputs = model(**tokens)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

text_embeddings = []
for col in text_columns:
    col_embeddings = np.vstack(data[col].progress_apply(embed_text).values)
    text_embeddings.append(col_embeddings)

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

100%|██████████| 68960/68960 [13:48<00:00, 83.20it/s]
100%|██████████| 68960/68960 [14:10<00:00, 81.09it/s]
100%|██████████| 68960/68960 [13:46<00:00, 83.47it/s]
100%|██████████| 68960/68960 [13:47<00:00, 83.34it/s]
100%|██████████| 68960/68960 [14:18<00:00, 80.29it/s]
100%|██████████| 68960/68960 [14:28<00:00, 79.41it/s]
100%|██████████| 68960/68960 [15:03<00:00, 76.34it/s]
100%|██████████| 68960/68960 [13:55<00:00, 82.58it/s]
100%|██████████| 68960/68960 [13:51<00:00, 82.98it/s]
100%|██████████| 68960/68960 [14:16<00:00, 80.47it/s]


In [11]:
text_features = np.hstack(text_embeddings)

In [13]:
text_features.shape

(68960, 7680)

In [14]:
from scipy.sparse import save_npz

np.save("text_features.npy", text_features)

In [30]:
from IPython.display import FileLink
# FileLink('text_features.npy')

In [13]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
replace /usr/share/nltk_data/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [16]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text_for_tfidf(text):
    # Rigorous preprocessing for TF-IDF
    if isinstance(text, str):
        text = text.lower()
        text = ''.join(ch for ch in text if ch not in string.punctuation)  # Remove punctuation
        tokens = text.split()
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  # Lemmatize and remove stopwords
        return ' '.join(tokens)
    return ''

tfidf_columns = ['Conditions', 'Interventions', 'Secondary Outcome Measures', 'Other Outcome Measures', 'inclusion_criteria']
for col in tfidf_columns:
    data[col] = data[col].apply(preprocess_text_for_tfidf)

In [17]:
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

In [18]:
# Define TF-IDF vectorizer with limited max features
tfidf_vectorizers = {
    col: TfidfVectorizer(max_features=500, stop_words='english')
    for col in ['Conditions', 'Interventions', 'Secondary Outcome Measures', 'Other Outcome Measures', 'inclusion_criteria']
}

# Fit and transform TF-IDF features
tfidf_features = {}
for col, vectorizer in tfidf_vectorizers.items():
    tfidf_features[col] = vectorizer.fit_transform(data[col])


# Combine TF-IDF vectors into a single sparse matrix
tfidf_combined = hstack([tfidf_features[col] for col in tfidf_features])

In [23]:
# Convert dense embeddings to sparse format
text_features_sparse = csr_matrix(text_features)

In [24]:
# del text_features_sparse

In [21]:
print(f"Bio-BERT Embedding Dimensions: {text_features.shape}")

# Add TF-IDF vectors
all_text_features = hstack([csr_matrix(text_features), tfidf_combined])

# Print dimensions after adding TF-IDF
print(f"Final Text Feature Dimensions (with TF-IDF): {all_text_features.shape}")

Bio-BERT Embedding Dimensions: (68960, 7680)
Final Text Feature Dimensions (with TF-IDF): (68960, 10180)


In [23]:
np.save("tfidf.npy", tfidf_combined)
FileLink('tfidf.npy')

In [27]:
tfidf_combined.shape

(68960, 2500)

In [20]:
text_features = np.load("/kaggle/input/nest-competition/text_features.npy", allow_pickle=True)
# tfidf_combined = np.load("/kaggle/input/nest-competition/tfidf.npy", allow_pickle=True)

In [15]:
print(text_features.shape, tfidf_combined.shape)

(68960, 7680) ()


In [None]:
if not isinstance(text_features, csr_matrix):
    text_features = csr_matrix(text_features)
if not isinstance(tfidf_combined, csr_matrix):
    tfidf_combined = csr_matrix(tfidf_combined)

all_text_features = hstack([text_features, tfidf_combined])

In [22]:
del text_features, tfidf_combined

In [23]:
all_text_features.shape

(68960, 10180)

In [25]:
numerical_features = data[numerical_columns].values
target = data['Time taken for Enrollment'].values

X_train_text, X_test_text, X_train_num, X_test_num, y_train, y_test = train_test_split(
    all_text_features, numerical_features, target, test_size=0.2, random_state=42)

In [26]:
del all_text_features

import gc
gc.collect()

8

In [28]:
import os 
# np.save('/kaggle/working/X_train_text.npy', X_train_text)
# np.save('/kaggle/working/X_test_text.npy', X_test_text)
# np.save('/kaggle/working/X_train_num.npy', X_train_num)
# np.save('/kaggle/working/X_test_num.npy', X_test_num)
# np.save('/kaggle/working/y_train.npy', y_train)
# np.save('/kaggle/working/y_test.npy', y_test)

# Generate download links
file_names = [
    'X_train_text.npy',
    'X_test_text.npy',
    'X_train_num.npy',
    'X_test_num.npy',
    'y_train.npy',
    'y_test.npy'
]

# Display download links
for file_name in file_names:
    file_path = os.path.join('/kaggle/working', file_name)
    print(f"[Download {file_name}](/{file_path})")

[Download X_train_text.npy](//kaggle/working/X_train_text.npy)
[Download X_test_text.npy](//kaggle/working/X_test_text.npy)
[Download X_train_num.npy](//kaggle/working/X_train_num.npy)
[Download X_test_num.npy](//kaggle/working/X_test_num.npy)
[Download y_train.npy](//kaggle/working/y_train.npy)
[Download y_test.npy](//kaggle/working/y_test.npy)


In [41]:
np.save('/kaggle/working/X_train_text2.npy', X_train_text)

In [42]:
FileLink('X_train_text2.npy')

In [35]:
X_train_num = X_train_num.astype('float32')
X_test_num = X_test_num.astype('float32')

In [36]:
X_train_text = X_train_text.astype('float32')
X_test_text = X_test_text.astype('float32')

In [46]:
del svm_model, xgb_model

In [47]:
gc.collect()

0

In [None]:
from sklearn.svm import SVR 
svr = SVR(kernel='linear') 
svr.fit(X_train_text, y_train)
y_pred = svr.predict(X_test_text)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rscore = r2_score(y_test, y_pred)

print(f"MAE: {mae:.4f}, RMSE: {rmse:.4f}, R2: {rscore:.4f}")

In [44]:
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Placeholder for results
def compute_scores(model, X_train, X_test, y_train, y_test, feature_set):
    model.fit(X_train, y_train)
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    return {
        "Feature Set": feature_set,
        "Train R2": train_r2,
        "Test R2": test_r2,
        "Train RMSE": train_rmse,
        "Test RMSE": test_rmse
    }

# Models
svm_model = SVR()
xgb_model = XGBRegressor(random_state=42)

# Placeholder for results
results = []

# 1. Just Text Features
results.append(compute_scores(svm_model, X_train_text, X_test_text, y_train, y_test, "Text Only"))
results.append(compute_scores(xgb_model, X_train_text, X_test_text, y_train, y_test, "Text Only"))

# 2. Just Numerical Features
results.append(compute_scores(svm_model, X_train_num, X_test_num, y_train, y_test, "Numerical Only"))
results.append(compute_scores(xgb_model, X_train_num, X_test_num, y_train, y_test, "Numerical Only"))

# Convert results to a DataFrame
results_df = pd.DataFrame(results, index=["SVM - Text", "XGB - Text", "SVM - Num", "XGB - Num", "SVM - Combined", "XGB - Combined"])

print(results_df)

KeyboardInterrupt: 

In [None]:
# 3. Both Features Combined
X_train_combined = hstack((X_train_text, X_train_num))
X_test_combined = hstack((X_test_text, X_test_num))

results.append(compute_scores(svm_model, X_train_combined, X_test_combined, y_train, y_test, "Combined"))
results.append(compute_scores(xgb_model, X_train_combined, X_test_combined, y_train, y_test, "Combined"))

# AutoEncoder inspired feature reduction

In [None]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim, num_features):
        super(Autoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 2048),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(2048, latent_dim)
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim + num_features, 2048),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(2048, input_dim)
        )

    def forward(self, x_text, x_num):
        latent = self.encoder(x_text)
        combined = torch.cat([latent, x_num], dim=1)
        reconstructed = self.decoder(combined)
        return latent, reconstructed

class FinalModel(nn.Module):
    def __init__(self, latent_dim, num_features):
        super(FinalModel, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(latent_dim + num_features, 512),
            nn.ReLU(),
            nn.Linear(512, 1)
        )

    def forward(self, latent, x_num):
        combined = torch.cat([latent, x_num], dim=1)
        output = self.fc(combined)
        return output

In [None]:
# Initialize models
input_dim = X_train_text.shape[1]
latent_dim = 256  # Adjust as needed
num_features = X_train_num.shape[1]

autoencoder = Autoencoder(input_dim, latent_dim, num_features)
final_model = FinalModel(latent_dim, num_features)

# Optimizers and loss functions
criterion_reconstruction = nn.MSELoss()
criterion_prediction = nn.MSELoss()
optimizer_ae = optim.Adam(autoencoder.parameters(), lr=1e-3)
optimizer_final = optim.Adam(final_model.parameters(), lr=1e-3)

# Convert data to PyTorch tensors
train_text = torch.tensor(X_train_text, dtype=torch.float32)
train_num = torch.tensor(X_train_num, dtype=torch.float32)
train_y = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)

# Training loop
for epoch in range(50):  # Adjust epochs as needed
    autoencoder.train()
    final_model.train()

    # Forward pass
    latent, reconstructed = autoencoder(train_text, train_num)
    prediction = final_model(latent, train_num)

    # Loss calculation
    loss_reconstruction = criterion_reconstruction(reconstructed, train_text)
    loss_prediction = criterion_prediction(prediction, train_y)
    loss = loss_reconstruction + loss_prediction

    # Backward pass
    optimizer_ae.zero_grad()
    optimizer_final.zero_grad()
    loss.backward()
    optimizer_ae.step()
    optimizer_final.step()

    print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")

In [None]:
# Evaluate on test data
autoencoder.eval()
final_model.eval()

test_text = torch.tensor(X_test_text, dtype=torch.float32)
test_num = torch.tensor(X_test_num, dtype=torch.float32)
test_y = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

with torch.no_grad():
    latent, _ = autoencoder(test_text, test_num)
    predictions = final_model(latent, test_num).squeeze()

mae = mean_absolute_error(y_test, predictions.numpy())
rmse = np.sqrt(mean_squared_error(y_test, predictions.numpy()))
r2 = r2_score(y_test, predictions.numpy())

print(f"MAE: {mae:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}")