# WiDS 2024 Datathon Competition

- Adapted from: WiDS Datathon 2024 Workshop at New England Microsoft Research

- Event Organizers: Sharut Gupta (MIT CSAIL), Jia He (Microsoft), and Arushi Jain (Microsoft).

- Objective: Predict how long it takes for a patient to receive a metastatic cancer diagnosis (https://www.kaggle.com/competitions/widsdatathon2024-challenge2/overview)

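- Data Source: You can download the data ("train.csv" and "test.csv") from the Kaggle competition (https://www.kaggle.com/competitions/widsdatathon2024-challenge2/data). Note that this notebook reads the files as "train_2024.csv" and "test_2024.csv" from the working directory, so rename the downloaded files (or adjust the paths in the loading cell) accordingly.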

### #1. Loading Packages: 

In [2]:
# !pip install statsmodels==0.13.5
!pip install lightgbm==3.3.5
!pip install colorama

# necessary libraries
import numpy as np  # numerical computation with arrays
import pandas as pd # library to manipulate datasets using dataframes
from colorama import Style, Fore
import math
import re
# Statistical libraries
from scipy.stats import norm 
from scipy import stats

# Load plotting libraries
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Seaborn/matplotlib defaults for every figure: width=8, height=4, 300 DPI
sns.set(
    rc={
        "figure.figsize": (8, 4),
        "figure.dpi": 300,
    }
)

# Load sklearn libraries for machine learning
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
import lightgbm as lgbm

# Ignore all warnings
import warnings
warnings.filterwarnings("ignore") 


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### #2. Loading the Data: 

In [17]:
# Read the competition CSVs with pandas and use each frame's 'patient_id'
# column as its row index
df_train = pd.read_csv("train_2024.csv").set_index('patient_id')
df_test = pd.read_csv("test_2024.csv").set_index('patient_id')

In [11]:
# Preview the first five rows of the test set
df_test.head(5)

Unnamed: 0_level_0,patient_race,payer_type,patient_state,patient_zip3,Region,Division,patient_age,patient_gender,bmi,breast_cancer_diagnosis_code,...,Average of Mar-18,Average of Apr-18,Average of May-18,Average of Jun-18,Average of Jul-18,Average of Aug-18,Average of Sep-18,Average of Oct-18,Average of Nov-18,Average of Dec-18
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
730681,,COMMERCIAL,LA,713,South,West South Central,55,F,,1746,...,62.21,62.23,78.34,81.96,83.58,82.22,80.2,69.73,53.14,51.34
334212,Black,,NC,283,South,South Atlantic,60,F,40.0,C50912,...,48.63,58.14,77.26,80.05,82.88,82.09,78.85,64.6,50.57,48.1
571362,,COMMERCIAL,TX,794,South,West South Central,54,F,32.33,1742,...,57.82,59.95,77.79,82.45,82.44,80.77,72.16,59.31,48.25,42.13
907331,,COMMERCIAL,TN,373,South,East South Central,63,F,27.07,1748,...,47.57,53.5,71.31,75.2,76.96,75.78,74.87,61.06,44.31,42.83
208382,Asian,,WA,980,West,Pacific,62,F,,C50411,...,41.02,46.25,56.92,57.88,66.16,65.21,57.52,49.53,43.75,38.33


In [14]:
# Preview the first five rows of the test set.
# NOTE(review): this repeats the preview cell directly above; consider
# showing df_train.head() here instead so both frames get inspected.
df_test.head(n=5)

Unnamed: 0_level_0,patient_race,payer_type,patient_state,patient_zip3,Region,Division,patient_age,patient_gender,bmi,breast_cancer_diagnosis_code,...,Average of Mar-18,Average of Apr-18,Average of May-18,Average of Jun-18,Average of Jul-18,Average of Aug-18,Average of Sep-18,Average of Oct-18,Average of Nov-18,Average of Dec-18
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
730681,,COMMERCIAL,LA,713,South,West South Central,55,F,,1746,...,62.21,62.23,78.34,81.96,83.58,82.22,80.2,69.73,53.14,51.34
334212,Black,,NC,283,South,South Atlantic,60,F,40.0,C50912,...,48.63,58.14,77.26,80.05,82.88,82.09,78.85,64.6,50.57,48.1
571362,,COMMERCIAL,TX,794,South,West South Central,54,F,32.33,1742,...,57.82,59.95,77.79,82.45,82.44,80.77,72.16,59.31,48.25,42.13
907331,,COMMERCIAL,TN,373,South,East South Central,63,F,27.07,1748,...,47.57,53.5,71.31,75.2,76.96,75.78,74.87,61.06,44.31,42.83
208382,Asian,,WA,980,West,Pacific,62,F,,C50411,...,41.02,46.25,56.92,57.88,66.16,65.21,57.52,49.53,43.75,38.33
