### This is the first experiment where we will be implementing the RNN

In [1]:
import pandas as pd
import numpy as np

In [2]:
xls = pd.ExcelFile('../Datasets/PhyAdRsnComorbMed.xlsx')
df1 = pd.read_excel(xls, 'Hospital-length-of-stay')
df = pd.read_excel(xls, 'Days-breakdown')

### Removing the patients that expired

In [3]:
df1 = df1[df1['did_the_patient_expire_in_hospital']=='No'][['parent_id','hospital_length_of_stay']]
df1

Unnamed: 0,parent_id,hospital_length_of_stay
1,2,5
2,3,7
5,6,32
6,7,4
7,8,10
...,...,...
502,516,13
504,519,9
505,521,7
506,522,5


### Handling null values

### Initially we will handle the null values for the columns which cant have null as the value for their respctive columns systolic_blood_pressure	,diastolic_blood_pressure, heart_rate, respiratory_rate, oxygen_saturation	and temperature
We will enter the mean for their respective ids

In [5]:
df['d_dimer'] = pd.to_numeric(df['d_dimer'], errors='coerce').astype('Float64')

In [6]:
def enter_mean_values(df,column_name):
    for index,value in df[['parent_id',column_name]].iterrows():
        if pd.isna(value[1]):
            mean = df[df['parent_id']==value[0]][column_name].mean()
            df.loc[index, column_name] = mean

In [7]:
# List of columns you want to apply the function to
variables = [
    'systolic_blood_pressure', 'diastolic_blood_pressure', 'heart_rate', 'respiratory_rate', 
    'oxygen_saturation', 'temperature', 'highest_mean_arterial_pressure', 'lowest_mean_arterial_pressure', 
    'highest_heart_rate', 'lowest_heart_rate', 'highest_creatinine', 'lowest_urine_output', 
    'highest_gcs', 'highest_ph', 'lowest_ph', 'fluid_balance', 'wbc', 'rbc', 'hemoglobin', 
    'hematocrit', 'platelet_count', 'aptt_aptr', 'pt', 'alt', 'ast', 'mch', 'mcv', 'mchc', 'rdw', 
    'serum_creatinine', 'sodium', 'potassium', 'total_serum_bilirubin', 'lactate', 'pao2', 'pao2_fio2', 
    'ph', 'high_sensitivity_cardiac_troponin', 'esr', 'inr', 'ferritin', 'd_dimer', 'crp', 'hs_crp'
]



# Loop through the columns in the variables list and check for the error
for column in variables:
    try:
        # Try to apply the mean filling logic for each column
        enter_mean_values(df, column)
    except TypeError as e:
        # Catch the TypeError and print which column caused the error
        print(f"Error with column '{column}': {e}")
    except Exception as e:
        # Catch any other exceptions and print them
        print(f"An unexpected error occurred with column '{column}': {e}")

### Now we will apply binary encoding to the columns having yes and no values
- We applied binary encoding as 0 means no and 1 means yes
- We did this because Yes and no does not have any numerical values where yes being greater than no.
- If there was ordinal relationship between the two we could have used label encoding. Example ranks.

In [8]:
## function to apply binary encoding

def apply_binary_encoding(df,column):
    df[column]=df[column].map({'Yes':1,'No':0})

In [9]:
## applying binary encoding to the columns

columns_binary = [
    'intubated', 'cardiac_arrest', 'arrested_time', 'major_cardiac_events', 
    'clinically_diagnosed_infections', 'mechanical_ventilation', 'antiarrhythmic_therapies', 
    'renal_replacement_therapy_dialysis', 'cardiovascular_mechanical_support', 'echocardiogram', 
    'chest_x_ray', 'chest_ct', 'head_ct', 'antimicrobial', 'anticoagulation', 'steroid'
]
for columns in columns_binary:
    apply_binary_encoding(df,columns)

In [10]:
df.head()

Unnamed: 0,id,parent_id,day,systolic_blood_pressure,diastolic_blood_pressure,heart_rate,respiratory_rate,oxygen_saturation,temperature,highest_mean_arterial_pressure,...,wall_motion_abnormality,chest_x_ray,cxr_findings,chest_ct,chest_ct_findings,head_ct,head_ct_findings,antimicrobial,anticoagulation,steroid
0,1,1,1,119.0,54.0,79.0,18.0,94.0,37.2,85.0,...,,0,"""[\""Unilateral Consolidation\""]""",0,"""[\""Bilateral consolidation/infiltration\""]""",0,"""[]""",1,1,0
1,2,1,2,133.0,64.0,73.0,18.0,98.0,37.1,85.0,...,,0,"""[]""",0,"""[]""",0,"""[]""",1,1,1
2,3,1,3,140.0,74.0,70.0,20.0,95.0,37.5,85.0,...,,0,"""[]""",0,"""[]""",0,"""[]""",1,1,1
3,4,1,4,154.0,78.0,77.0,18.0,95.0,37.1,85.0,...,,1,"""[\""Unilateral Consolidation\""]""",0,"""[]""",0,"""[]""",1,1,1
4,5,1,5,155.0,61.0,64.0,16.0,92.0,36.7,85.0,...,,0,"""[]""",0,"""[]""",0,"""[]""",1,1,1


### Data preprocessing the texts and applying one hot encoding
- There are columns such as cxr_findings which have non numeric values such as "[\"Unilateral Consolidation\"] for which we will first seperate the list of texts in the brackets and then apply one hot encoding.

In [11]:
## We will now address the columns 'cxr_findings', 'chest_ct_findings' and 'head_ct_findings'

## we will create different datasets for the above columns

df_cxr_findings = df[['id','cxr_findings']]

df_chest_ct_findings = df[['id','chest_ct_findings']]

df_head_ct_findings = df[['id','head_ct_findings']]

## We will use the same function like we used in the previous notebook

## defining a function to remove all the strings except a number, letter or a comma

def filter_characters(value):
    # Convert to string
    value = str(value)
    
    # Initialize an empty result string
    result = ''
    
    # Iterate through each character in the string
    for char in value:
        # Append if character is a letter, number, comma, or space
        if char.isalnum() or char == ',' or char.isspace():
            result += char
    
    return result


# Apply the function to the columns
df_cxr_findings['cxr_findings'] = df_cxr_findings['cxr_findings'].apply(filter_characters)

df_chest_ct_findings['chest_ct_findings'] = df_chest_ct_findings['chest_ct_findings'].apply(filter_characters)

df_head_ct_findings['head_ct_findings'] = df['head_ct_findings'].apply(filter_characters)

# Now we will apply one hot encoding to the the columns

## applying the function to split into lists

def one_hot_encode_comorbidities(df, column_name):
    # Split the specified column into lists of comorbidities
    df[column_name] = df[column_name].str.split(',')
    
     # Use MultiLabelBinarizer to one-hot encode the comorbidities
    from sklearn.preprocessing import MultiLabelBinarizer
    
    # Use MultiLabelBinarizer to one-hot encode the comorbidities
    mlb = MultiLabelBinarizer()
    one_hot_encoded = mlb.fit_transform(df[column_name])
    
    # Create a DataFrame from the one-hot encoded data
    one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=mlb.classes_)
    
    # Concatenate the original DataFrame with the one-hot encoded DataFrame
    result_df = pd.concat([df.drop(column_name, axis=1), one_hot_encoded_df], axis=1)
    
    result_df = result_df.drop(result_df.columns[1],axis=1)
    
    return result_df

# Apply the one-hot encoding function
df_cxr_findings = one_hot_encode_comorbidities(df_cxr_findings,'cxr_findings')
df_chest_ct_findings = one_hot_encode_comorbidities(df_chest_ct_findings,'chest_ct_findings')
df_head_ct_findings = one_hot_encode_comorbidities(df_head_ct_findings,'head_ct_findings')

# we will now merge all the dataframes based on id

df = df.merge(df_cxr_findings, on='id').merge(df_chest_ct_findings, on='id').merge(df_head_ct_findings, on='id')

# we will now drop the column named id, cxr_findings, chest_ct_findings and head_ct_findings

# Drop the specified columns
columns_to_drop = ['id', 'cxr_findings', 'chest_ct_findings', 'head_ct_findings']
df = df.drop(columns=columns_to_drop)

# final dataframe

df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cxr_findings['cxr_findings'] = df_cxr_findings['cxr_findings'].apply(filter_characters)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_chest_ct_findings['chest_ct_findings'] = df_chest_ct_findings['chest_ct_findings'].apply(filter_characters)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_

Unnamed: 0,parent_id,day,systolic_blood_pressure,diastolic_blood_pressure,heart_rate,respiratory_rate,oxygen_saturation,temperature,highest_mean_arterial_pressure,lowest_mean_arterial_pressure,...,Bilateral Ground Glass Opacities,Bilateral consolidationinfiltration,Emphysematous or Bronchiectasis changes,Emphysematous or Bronchiectatic changes,Pulmonary Embolism,Scarring or Fibrosis,Unilateral Ground Glass Opacities,Unilateral consolidationinfiltration,Subarachnoid Hemorrhage,Subdural Hemorrhage
0,1,1,119.0,54.0,79.0,18.0,94.0,37.2,85.0,85.0,...,0,1,0,0,0,0,0,0,0,0
1,1,2,133.0,64.0,73.0,18.0,98.0,37.1,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
2,1,3,140.0,74.0,70.0,20.0,95.0,37.5,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
3,1,4,154.0,78.0,77.0,18.0,95.0,37.1,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
4,1,5,155.0,61.0,64.0,16.0,92.0,36.7,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0


### Droping the columns ejection_fraction and wall_motion_abnormality as they have 0 values in the whole dataset

In [12]:
df = df.drop(['ejection_fraction','wall_motion_abnormality'],axis=1)

In [13]:
df

Unnamed: 0,parent_id,day,systolic_blood_pressure,diastolic_blood_pressure,heart_rate,respiratory_rate,oxygen_saturation,temperature,highest_mean_arterial_pressure,lowest_mean_arterial_pressure,...,Bilateral Ground Glass Opacities,Bilateral consolidationinfiltration,Emphysematous or Bronchiectasis changes,Emphysematous or Bronchiectatic changes,Pulmonary Embolism,Scarring or Fibrosis,Unilateral Ground Glass Opacities,Unilateral consolidationinfiltration,Subarachnoid Hemorrhage,Subdural Hemorrhage
0,1,1,119.0,54.0,79.0,18.0,94.0,37.2,85.0,85.0,...,0,1,0,0,0,0,0,0,0,0
1,1,2,133.0,64.0,73.0,18.0,98.0,37.1,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
2,1,3,140.0,74.0,70.0,20.0,95.0,37.5,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
3,1,4,154.0,78.0,77.0,18.0,95.0,37.1,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
4,1,5,155.0,61.0,64.0,16.0,92.0,36.7,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4059,522,4,98.0,73.0,82.0,24.0,95.0,36.1,,,...,0,0,0,0,0,0,0,0,0,0
4060,522,5,110.0,64.0,68.0,18.0,95.0,36.5,,,...,0,0,0,0,0,0,0,0,0,0
4061,522,6,112.2,68.0,74.4,20.0,95.2,36.4,,,...,0,0,0,0,0,0,0,0,0,0
4062,522,7,112.2,68.0,74.4,20.0,95.2,36.4,,,...,0,0,0,0,0,0,0,0,0,0


### Now we will add 0 values to the rows that have Nan

In [14]:
df = df.fillna(0)

In [15]:
df

Unnamed: 0,parent_id,day,systolic_blood_pressure,diastolic_blood_pressure,heart_rate,respiratory_rate,oxygen_saturation,temperature,highest_mean_arterial_pressure,lowest_mean_arterial_pressure,...,Bilateral Ground Glass Opacities,Bilateral consolidationinfiltration,Emphysematous or Bronchiectasis changes,Emphysematous or Bronchiectatic changes,Pulmonary Embolism,Scarring or Fibrosis,Unilateral Ground Glass Opacities,Unilateral consolidationinfiltration,Subarachnoid Hemorrhage,Subdural Hemorrhage
0,1,1,119.0,54.0,79.0,18.0,94.0,37.2,85.0,85.0,...,0,1,0,0,0,0,0,0,0,0
1,1,2,133.0,64.0,73.0,18.0,98.0,37.1,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
2,1,3,140.0,74.0,70.0,20.0,95.0,37.5,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
3,1,4,154.0,78.0,77.0,18.0,95.0,37.1,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
4,1,5,155.0,61.0,64.0,16.0,92.0,36.7,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4059,522,4,98.0,73.0,82.0,24.0,95.0,36.1,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4060,522,5,110.0,64.0,68.0,18.0,95.0,36.5,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4061,522,6,112.2,68.0,74.4,20.0,95.2,36.4,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4062,522,7,112.2,68.0,74.4,20.0,95.2,36.4,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


### We will now check if any columns have significant numbers of 0s in their rows

In [16]:
colum_max_0 = []  # Initialize the list to store column names with many zeros
count = 0  # Initialize count as an integer, not a list
for column, i in df.iteritems():
    count = 0  # Initialize count as an integer, not a list
    print(f"Processing column: {column}")
    for value in i:
        if value == 0:  # Check if the value is 0
            count += 1  # Increment the count
    print(f"Count of zeros in {column}: {count}")
    # Check if the count of zeros is greater than or equal to 3242
    if count >= 4060:
        colum_max_0.append(column)
# Print the columns that have a high count of zeros
print(colum_max_0)

Processing column: parent_id
Count of zeros in parent_id: 0
Processing column: day
Count of zeros in day: 0
Processing column: systolic_blood_pressure
Count of zeros in systolic_blood_pressure: 8
Processing column: diastolic_blood_pressure
Count of zeros in diastolic_blood_pressure: 8
Processing column: heart_rate
Count of zeros in heart_rate: 8
Processing column: respiratory_rate
Count of zeros in respiratory_rate: 8
Processing column: oxygen_saturation
Count of zeros in oxygen_saturation: 8
Processing column: temperature
Count of zeros in temperature: 8
Processing column: highest_mean_arterial_pressure
Count of zeros in highest_mean_arterial_pressure: 2232
Processing column: lowest_mean_arterial_pressure
Count of zeros in lowest_mean_arterial_pressure: 2232
Processing column: highest_heart_rate
Count of zeros in highest_heart_rate: 8
Processing column: lowest_heart_rate
Count of zeros in lowest_heart_rate: 8
Processing column: highest_creatinine
Count of zeros in highest_creatinine: 

  for column, i in df.iteritems():


###  Dropping those columns from above who have significant number of 0 values

In [17]:
df = df.drop(['cardiac_arrest', 'arrested_time', 'major_cardiac_events', 
              'cardiovascular_mechanical_support', 'high_sensitivity_cardiac_troponin', 
              'hs_crp', 'echocardiogram', 'Pneumothorax', 
              'Unilateral Ground Glass', 'Emphysematous or Bronchiectasis changes', 
              'Emphysematous or Bronchiectatic changes', 'Unilateral Ground Glass Opacities', 
              'Unilateral consolidationinfiltration', 'Subarachnoid Hemorrhage', 
              'Subdural Hemorrhage'],axis=1)

### Merging With HLOS

In [18]:
df = df.merge(df1, how='inner', on='parent_id')

In [19]:
df

Unnamed: 0,parent_id,day,systolic_blood_pressure,diastolic_blood_pressure,heart_rate,respiratory_rate,oxygen_saturation,temperature,highest_mean_arterial_pressure,lowest_mean_arterial_pressure,...,Bilateral Ground Glass,Cardiomegaly,Edema,Effusion,Unilateral Consolidation,Bilateral Ground Glass Opacities,Bilateral consolidationinfiltration,Pulmonary Embolism,Scarring or Fibrosis,hospital_length_of_stay
0,2,1,122.0,80.0,72.0,24.0,92.0,36.3,0.0,0.0,...,1,0,0,0,0,0,0,0,0,5
1,2,2,114.0,72.0,60.0,18.0,92.0,35.9,0.0,0.0,...,0,0,0,0,0,0,0,0,0,5
2,2,3,117.0,73.0,62.0,16.0,95.0,36.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,5
3,2,4,125.0,76.0,76.0,18.0,97.0,36.3,0.0,0.0,...,0,0,0,0,0,0,0,0,0,5
4,2,5,133.0,85.0,73.0,20.0,94.0,36.7,0.0,0.0,...,0,0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3339,522,4,98.0,73.0,82.0,24.0,95.0,36.1,0.0,0.0,...,0,0,0,0,0,0,0,0,0,5
3340,522,5,110.0,64.0,68.0,18.0,95.0,36.5,0.0,0.0,...,0,0,0,0,0,0,0,0,0,5
3341,522,6,112.2,68.0,74.4,20.0,95.2,36.4,0.0,0.0,...,0,0,0,0,0,0,0,0,0,5
3342,522,7,112.2,68.0,74.4,20.0,95.2,36.4,0.0,0.0,...,0,0,0,0,0,0,0,0,0,5


In [20]:
# df.to_csv('../Datasets/pre.csv')

### Seggregating two dataframes having hospital lengths of stay until 14 and more than 14

In [21]:
df['days_remaining'] = df['hospital_length_of_stay']-df['day']
df

Unnamed: 0,parent_id,day,systolic_blood_pressure,diastolic_blood_pressure,heart_rate,respiratory_rate,oxygen_saturation,temperature,highest_mean_arterial_pressure,lowest_mean_arterial_pressure,...,Cardiomegaly,Edema,Effusion,Unilateral Consolidation,Bilateral Ground Glass Opacities,Bilateral consolidationinfiltration,Pulmonary Embolism,Scarring or Fibrosis,hospital_length_of_stay,days_remaining
0,2,1,122.0,80.0,72.0,24.0,92.0,36.3,0.0,0.0,...,0,0,0,0,0,0,0,0,5,4
1,2,2,114.0,72.0,60.0,18.0,92.0,35.9,0.0,0.0,...,0,0,0,0,0,0,0,0,5,3
2,2,3,117.0,73.0,62.0,16.0,95.0,36.0,0.0,0.0,...,0,0,0,0,0,0,0,0,5,2
3,2,4,125.0,76.0,76.0,18.0,97.0,36.3,0.0,0.0,...,0,0,0,0,0,0,0,0,5,1
4,2,5,133.0,85.0,73.0,20.0,94.0,36.7,0.0,0.0,...,0,0,0,0,0,0,0,0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3339,522,4,98.0,73.0,82.0,24.0,95.0,36.1,0.0,0.0,...,0,0,0,0,0,0,0,0,5,1
3340,522,5,110.0,64.0,68.0,18.0,95.0,36.5,0.0,0.0,...,0,0,0,0,0,0,0,0,5,0
3341,522,6,112.2,68.0,74.4,20.0,95.2,36.4,0.0,0.0,...,0,0,0,0,0,0,0,0,5,-1
3342,522,7,112.2,68.0,74.4,20.0,95.2,36.4,0.0,0.0,...,0,0,0,0,0,0,0,0,5,-2


In [22]:
df_14 = df[df['hospital_length_of_stay']<15]
df_over_14 = df[df['hospital_length_of_stay']>14]

In [23]:
df_14

Unnamed: 0,parent_id,day,systolic_blood_pressure,diastolic_blood_pressure,heart_rate,respiratory_rate,oxygen_saturation,temperature,highest_mean_arterial_pressure,lowest_mean_arterial_pressure,...,Cardiomegaly,Edema,Effusion,Unilateral Consolidation,Bilateral Ground Glass Opacities,Bilateral consolidationinfiltration,Pulmonary Embolism,Scarring or Fibrosis,hospital_length_of_stay,days_remaining
0,2,1,122.0,80.0,72.0,24.0,92.0,36.3,0.0,0.0,...,0,0,0,0,0,0,0,0,5,4
1,2,2,114.0,72.0,60.0,18.0,92.0,35.9,0.0,0.0,...,0,0,0,0,0,0,0,0,5,3
2,2,3,117.0,73.0,62.0,16.0,95.0,36.0,0.0,0.0,...,0,0,0,0,0,0,0,0,5,2
3,2,4,125.0,76.0,76.0,18.0,97.0,36.3,0.0,0.0,...,0,0,0,0,0,0,0,0,5,1
4,2,5,133.0,85.0,73.0,20.0,94.0,36.7,0.0,0.0,...,0,0,0,0,0,0,0,0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3339,522,4,98.0,73.0,82.0,24.0,95.0,36.1,0.0,0.0,...,0,0,0,0,0,0,0,0,5,1
3340,522,5,110.0,64.0,68.0,18.0,95.0,36.5,0.0,0.0,...,0,0,0,0,0,0,0,0,5,0
3341,522,6,112.2,68.0,74.4,20.0,95.2,36.4,0.0,0.0,...,0,0,0,0,0,0,0,0,5,-1
3342,522,7,112.2,68.0,74.4,20.0,95.2,36.4,0.0,0.0,...,0,0,0,0,0,0,0,0,5,-2


In [24]:
df_over_14

Unnamed: 0,parent_id,day,systolic_blood_pressure,diastolic_blood_pressure,heart_rate,respiratory_rate,oxygen_saturation,temperature,highest_mean_arterial_pressure,lowest_mean_arterial_pressure,...,Cardiomegaly,Edema,Effusion,Unilateral Consolidation,Bilateral Ground Glass Opacities,Bilateral consolidationinfiltration,Pulmonary Embolism,Scarring or Fibrosis,hospital_length_of_stay,days_remaining
16,6,1,127.0,76.0,68.0,19.0,95.0,36.6,92.0,80.0,...,0,0,0,0,0,0,0,0,32,31
17,6,2,97.0,60.0,68.0,22.0,98.0,36.4,71.0,67.0,...,0,0,0,0,0,0,0,0,32,30
18,6,3,140.0,68.0,72.0,22.0,99.0,36.5,84.0,84.0,...,0,0,0,0,0,0,0,0,32,29
19,6,4,108.0,63.0,98.0,22.0,95.0,36.5,77.0,77.0,...,0,0,1,0,0,0,0,0,32,28
20,6,5,126.0,77.0,68.0,24.0,98.0,36.5,92.0,92.0,...,0,0,0,0,1,1,0,0,32,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3275,511,4,150.0,50.0,67.0,27.0,93.0,36.8,0.0,0.0,...,0,0,0,0,0,0,0,0,18,14
3276,511,5,116.0,54.0,57.0,21.0,97.0,36.9,0.0,0.0,...,0,0,0,0,0,0,0,0,18,13
3277,511,6,131.0,74.0,53.0,22.0,92.0,36.4,0.0,0.0,...,0,0,0,0,0,0,0,0,18,12
3278,511,7,129.0,58.0,57.0,40.0,96.0,36.9,0.0,0.0,...,0,0,0,0,0,0,0,0,18,11


In [25]:
# df_14.to_csv("../Datasets/df_under_14.csv")
# df_over_14.to_csv("../Datasets/df_over_14.csv")

In [26]:
import torch
import torch.nn as nn

# Define a simple RNN model
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.rnn(x)  # Forward pass through RNN
        out = self.fc(out[:, -1, :])  # Use the output from the last time step
        return out

# Define input parameters
input_size = 2   # Number of features per time step
hidden_size = 4  # Number of hidden units in the RNN layer
output_size = 1  # Final output dimension

# Initialize model, loss function, and optimizer
model = SimpleRNN(input_size, hidden_size, output_size)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Define two simple sequences as input data (batch size of 2 sequences, each with 3 time steps and 2 features)
sequence_data = torch.tensor([[[1.0, 2.0], [2.0, 3.0], [3.0, 4.0]],  # First sequence
                              [[2.0, 1.0], [3.0, 2.0], [4.0, 3.0]]])  # Second sequence
targets = torch.tensor([[1.0], [0.0]])  # Sample target values for each sequence

# Training loop (basic example, normally we'd use more epochs)
for epoch in range(50):
    optimizer.zero_grad()
    output = model(sequence_data)
    loss = criterion(output, targets)
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10 == 0:
        print(f"Epoch [{epoch+1}/50], Loss: {loss.item():.4f}")

# Prediction
model.eval()
with torch.no_grad():
    output = model(sequence_data)
    print(f"Predicted outputs for the two sequences: \n{output}")


Epoch [10/50], Loss: 0.2475
Epoch [20/50], Loss: 0.2365
Epoch [30/50], Loss: 0.2181
Epoch [40/50], Loss: 0.1800
Epoch [50/50], Loss: 0.1013
Predicted outputs for the two sequences: 
tensor([[0.6797],
        [0.2809]])


In [28]:
sequence_data.shape

torch.Size([2, 3, 2])