In [127]:
import pandas as pd

from datetime import timedelta
from sklearn.feature_extraction.text import TfidfVectorizer

In [128]:
def extract_datetime_features(df, column):
        df[column] = pd.to_datetime(df[column], errors='coerce')
        df[column + '_year'] = df[column].dt.year
        df[column + '_month'] = df[column].dt.month
        df[column + '_day'] = df[column].dt.day
        df[column + '_hour'] = df[column].dt.hour
        df[column + '_weekday'] = df[column].dt.weekday
        return df

def label_preprocessing(df):
    df['datetime'] = pd.to_datetime(df['measurement_datetime'])
    df.drop(columns=['measurement_datetime'], inplace=True)   
    df.dropna(inplace=True)
    df = extract_datetime_features(df, 'datetime')
    return df

train_labels = pd.read_csv('training_data/SepsisLabel_train.csv')
test_labels = pd.read_csv('testing_data/SepsisLabel_test.csv')


train_labels = label_preprocessing(train_labels)
test_labels = label_preprocessing(test_labels)

print(train_labels.shape, test_labels.shape)
train_labels.head()

(331637, 8) (130483, 7)


Unnamed: 0,person_id,SepsisLabel,datetime,datetime_year,datetime_month,datetime_day,datetime_hour,datetime_weekday
0,274096387,0,2024-12-03 20:00:00,2024,12,3,20,1
1,1719359031,0,2024-04-20 09:00:00,2024,4,20,9,5
2,2024544816,0,2021-07-14 07:00:00,2021,7,14,7,2
3,213710896,0,2022-05-24 07:00:00,2022,5,24,7,1
4,1335786468,0,2024-08-25 22:00:00,2024,8,25,22,6


In [129]:
def demo_preprocessing(df):
    df['MALE'] = (df['gender'] == 'MALE').astype(float)
    df['birth_datetime'] = pd.to_datetime(df['birth_datetime'])
    df['visit_start_date'] = pd.to_datetime(df['visit_start_date'])
    df.drop(columns=['visit_occurrence_id', 'age_in_months', 'gender'], inplace=True)
    return df

demo_info_train = pd.read_csv('./training_data/person_demographics_episode_train.csv', index_col=False)
demo_info_test = pd.read_csv('./testing_data/person_demographics_episode_test.csv', index_col=False)

demo_info_train = demo_preprocessing(demo_info_train)
demo_info_test = demo_preprocessing(demo_info_test)


print(demo_info_train.shape, demo_info_test.shape)
demo_info_test.head()

(3391, 4) (1419, 4)


Unnamed: 0,person_id,visit_start_date,birth_datetime,MALE
0,420756027,2020-10-05,2000-08-09,1.0
1,177657874,2020-09-07,2000-09-20,0.0
2,1701213331,2021-03-09,2000-12-30,1.0
3,1656161134,2021-06-28,2001-12-17,0.0
4,261357524,2020-04-25,2002-01-03,0.0


In [130]:
def demo_merge(labels, demo):
    merged = pd.merge(labels, demo, how = 'left', on = 'person_id')
    merged = merged[merged['datetime'] >= merged['visit_start_date'] - timedelta(days = 1)]
    idx = merged.groupby(['person_id', 'datetime'])['visit_start_date'].idxmax()
    merged = merged.loc[idx]
    merged['age_in_months'] = (pd.to_datetime(merged['datetime']) - pd.to_datetime(merged['birth_datetime'])).dt.days/30
    merged['hours_since_admit'] = (pd.to_datetime(merged['datetime']) - pd.to_datetime(merged['visit_start_date'])).dt.total_seconds()/3600
    merged.drop(columns=['visit_start_date', 'birth_datetime'], inplace=True)
    merged.reset_index(drop=True, inplace = True)

    return merged

merged_train = demo_merge(train_labels, demo_info_train)
merged_test = demo_merge(test_labels, demo_info_test)

print(merged_train.shape, merged_test.shape)
merged_train.head()

(331623, 11) (130483, 10)


Unnamed: 0,person_id,SepsisLabel,datetime,datetime_year,datetime_month,datetime_day,datetime_hour,datetime_weekday,MALE,age_in_months,hours_since_admit
0,510305,0,2019-07-01 16:00:00,2019,7,1,16,0,0.0,25.733333,40.0
1,510305,0,2019-07-01 17:00:00,2019,7,1,17,0,0.0,25.733333,41.0
2,510305,0,2019-07-02 13:00:00,2019,7,2,13,1,0.0,25.766667,61.0
3,510305,0,2019-07-03 09:00:00,2019,7,3,9,2,0.0,25.8,81.0
4,510305,0,2019-07-03 10:00:00,2019,7,3,10,2,0.0,25.8,82.0


In [131]:
def observation_merge(merged, obs):
    obs = obs.groupby('person_id').first().reset_index()
    dummies = pd.get_dummies(obs['valuefilled'], dtype=float)
    obs = pd.concat((obs, dummies), axis=1)
    obs.drop(columns=['visit_occurrence_id', 'observation_concept_id', 'observation_concept_name', 'valuefilled', 'observation_datetime'], inplace = True)
    return pd.merge(merged, obs, how = 'left', on = 'person_id')

train_obs = pd.read_csv('./training_data/observation_train.csv')
test_obs = pd.read_csv('./testing_data/observation_test.csv')

merged_train = observation_merge(merged_train, train_obs)
merged_test = observation_merge(merged_test, test_obs)

print(merged_train.shape, merged_test.shape)
merged_test.head()

(331623, 14) (130483, 13)


Unnamed: 0,person_id,datetime,datetime_year,datetime_month,datetime_day,datetime_hour,datetime_weekday,MALE,age_in_months,hours_since_admit,Médico,Quirúrgico - Electivo,Quirúrgico - Urgencia
0,3858662,2019-11-29 01:00:00,2019,11,29,1,4,1.0,73.266667,25.0,1.0,0.0,0.0
1,3858662,2019-11-29 03:00:00,2019,11,29,3,4,1.0,73.266667,27.0,1.0,0.0,0.0
2,3858662,2019-11-29 05:00:00,2019,11,29,5,4,1.0,73.266667,29.0,1.0,0.0,0.0
3,3858662,2019-11-29 06:00:00,2019,11,29,6,4,1.0,73.266667,30.0,1.0,0.0,0.0
4,3858662,2019-11-29 07:00:00,2019,11,29,7,4,1.0,73.266667,31.0,1.0,0.0,0.0


In [132]:
def drugs_aggregation(merged, drugs, start, end):
    time_start= pd.Timedelta(hours=start)
    time_end = pd.Timedelta(hours=end)

    merged_drugs = drugs.merge(merged[['person_id', 'datetime']], on='person_id', how='right')

    merged_drugs = merged_drugs[
        (merged_drugs['datetime_x'] >= merged_drugs['datetime_y'] - time_start) &
        (merged_drugs['datetime_x'] <= merged_drugs['datetime_y'] - time_end)
    ]

    aggregated_drugs = merged_drugs.groupby(['person_id', 'datetime_y']).agg({
        'drug_concept_id': lambda x: " ".join(map(str, x.dropna().astype(str))),
        'route_concept_id': lambda x: " ".join(map(str, x.dropna().astype(str)))
    }).reset_index()

    aggregated_drugs = merged.merge(aggregated_drugs, left_on=['person_id', 'datetime'], right_on=['person_id', 'datetime_y'], how='left').drop(columns=['datetime_y'])
    aggregated_drugs['drug_concept_id'] = aggregated_drugs['drug_concept_id'].fillna("")
    aggregated_drugs['route_concept_id'] = aggregated_drugs['route_concept_id'].fillna("")

    return aggregated_drugs

def build_drug_features(train_drugs, test_drugs, merged_train, merged_test):
    train_6 = drugs_aggregation(merged_train, train_drugs, 6, 0)
    test_6 = drugs_aggregation(merged_test, test_drugs, 6, 0) 

    train_24 = drugs_aggregation(merged_train, train_drugs, 24, 6)
    test_24 = drugs_aggregation(merged_test, test_drugs, 24, 6) 

    train_168 = drugs_aggregation(merged_train, train_drugs, 168, 24)
    test_168 = drugs_aggregation(merged_test, test_drugs, 168, 24) 

    tfidf = TfidfVectorizer(analyzer = 'word', max_features = 50)
    train_6_drug = tfidf.fit_transform(train_6['drug_concept_id'])
    test_6_drug = tfidf.transform(test_6['drug_concept_id'])

    train_24_drug = tfidf.transform(train_24['drug_concept_id'])
    test_24_drug = tfidf.transform(test_24['drug_concept_id'])

    train_168_drug = tfidf.transform(train_168['drug_concept_id'])
    test_168_drug = tfidf.transform(test_168['drug_concept_id'])

    tfidf2 = TfidfVectorizer(analyzer = 'word', max_features = 50)
    train_6_route = tfidf2.fit_transform(train_6['route_concept_id'])
    test_6_route = tfidf2.transform(test_6['route_concept_id'])

    train_24_route = tfidf2.transform(train_24['route_concept_id'])
    test_24_route = tfidf2.transform(test_24['route_concept_id'])

    train_168_route = tfidf2.transform(train_168['route_concept_id'])
    test_168_route = tfidf2.transform(test_168['route_concept_id'])

    train_drug_arrays = [pd.DataFrame(vecs.toarray(), columns=tfidf.get_feature_names_out()) for vecs in [train_6_drug, train_24_drug, train_168_drug]]
    test_drug_arrays = [pd.DataFrame(vecs.toarray(), columns=tfidf.get_feature_names_out()) for vecs in [test_6_drug, test_24_drug, test_168_drug]]

    train_route_arrays = [pd.DataFrame(vecs.toarray(), columns=tfidf2.get_feature_names_out()) for vecs in [train_6_route, train_24_route, train_168_route]]
    test_route_arrays = [pd.DataFrame(vecs.toarray(), columns=tfidf2.get_feature_names_out()) for vecs in [test_6_route, test_24_route, test_168_route]]

    new_merged_train = pd.concat([merged_train] + train_drug_arrays + train_route_arrays, axis = 1)
    new_merged_test = pd.concat([merged_test] + test_drug_arrays + test_route_arrays, axis = 1)

    return new_merged_train, new_merged_test


train_drugs = pd.read_csv('./training_data/drugsexposure_train.csv')
test_drugs = pd.read_csv('./testing_data/drugsexposure_test.csv')

train_drugs['datetime'] = pd.to_datetime(train_drugs['drug_datetime_hourly'])
train_drugs.drop(columns=['drug_datetime_hourly'], inplace=True) 
train_drugs.dropna(inplace=True)  

test_drugs['datetime'] = pd.to_datetime(test_drugs['drug_datetime_hourly'])
test_drugs.drop(columns=['drug_datetime_hourly'], inplace=True) 
test_drugs.dropna(inplace=True)  

merged_train, merged_test = build_drug_features(train_drugs, test_drugs, merged_train, merged_test)

print(merged_train.shape, merged_test.shape)
merged_train.head()

(331623, 188) (130483, 187)


Unnamed: 0,person_id,SepsisLabel,datetime,datetime_year,datetime_month,datetime_day,datetime_hour,datetime_weekday,MALE,age_in_months,...,topical,intramuscula,intrapulmonary,intravenous,nasal,ophthalmic,oral,otic,rectal,topical.1
0,510305,0,2019-07-01 16:00:00,2019,7,1,16,0,0.0,25.733333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,510305,0,2019-07-01 17:00:00,2019,7,1,17,0,0.0,25.733333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,510305,0,2019-07-02 13:00:00,2019,7,2,13,1,0.0,25.766667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,510305,0,2019-07-03 09:00:00,2019,7,3,9,2,0.0,25.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,510305,0,2019-07-03 10:00:00,2019,7,3,10,2,0.0,25.8,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [133]:
merged_train.to_csv('drugs_train.csv', index = False)
merged_test.to_csv('drugs_test.csv', index = False)

In [134]:
def process_merge_procedures(merge, procedures):
    procedures['datetime_y'] = pd.to_datetime(procedures['procedure_datetime_hourly'])
    procedures.drop(columns = ['visit_occurrence_id', 'procedure_datetime_hourly'], inplace = True)

    merge = merge.sort_values(by=['datetime', 'person_id']).reset_index(drop=True)
    procedures = procedures.sort_values(by=['datetime_y', 'person_id']).reset_index(drop=True)

    merged_df = pd.merge_asof(
        merge, 
        procedures, 
        left_on ='datetime', 
        right_on = 'datetime_y',
        by='person_id', 
        direction='backward',
        suffixes=('_x', '_y')
    )
    merged_df['time_since_last_procedure'] = (merged_df['datetime'] - merged_df['datetime_y']).dt.days
    merged_df = pd.get_dummies(merged_df, columns=['procedure'])
    merged_df.drop(columns=['datetime_y'], inplace=True)
    return merged_df

procedure_train = pd.read_csv('./training_data/proceduresoccurrences_train.csv')
procedure_test = pd.read_csv('./testing_data/proceduresoccurrences_test.csv')

merged_train = process_merge_procedures(merged_train, procedure_train)
merged_test = process_merge_procedures(merged_test, procedure_test)

print(merged_train.shape, merged_test.shape)
merged_test.head()

(331623, 196) (130483, 195)


Unnamed: 0,person_id,datetime,datetime_year,datetime_month,datetime_day,datetime_hour,datetime_weekday,MALE,age_in_months,hours_since_admit,...,rectal,topical,time_since_last_procedure,procedure_Cannulation,procedure_Dialysis procedure,procedure_Exteriorization of trachea,procedure_Extracorporeal membrane oxygenation,procedure_Invasive ventilation,procedure_Non-invasive ventilation,procedure_Peritoneal dialysis
0,1486823489,2019-04-16 14:00:00,2019,4,16,14,1,0.0,0.0,14.0,...,0.0,0.0,,False,False,False,False,False,False,False
1,1486823489,2019-04-16 15:00:00,2019,4,16,15,1,0.0,0.0,15.0,...,0.0,0.0,,False,False,False,False,False,False,False
2,1486823489,2019-04-17 11:00:00,2019,4,17,11,2,0.0,0.033333,35.0,...,0.0,0.0,,False,False,False,False,False,False,False
3,1486823489,2019-04-17 21:00:00,2019,4,17,21,2,0.0,0.033333,45.0,...,0.0,1.0,,False,False,False,False,False,False,False
4,1486823489,2019-04-18 16:00:00,2019,4,18,16,3,0.0,0.066667,64.0,...,0.0,0.927792,,False,False,False,False,False,False,False


In [135]:
device_train = pd.read_csv('device_train.csv')
device_test = pd.read_csv('device_test.csv')

device_train['datetime'] = pd.to_datetime(device_train['datetime'])
device_test['datetime'] = pd.to_datetime(device_test['datetime'])

merged_train = pd.merge(merged_train, device_train, how = 'inner', on = ['person_id', 'datetime'])
merged_test = pd.merge(merged_test, device_test, how = 'inner', on = ['person_id', 'datetime'])

print(merged_train.shape, merged_test.shape)
merged_train.head()

(331637, 201) (130483, 200)


Unnamed: 0,person_id,SepsisLabel,datetime,datetime_year,datetime_month,datetime_day,datetime_hour,datetime_weekday,MALE,age_in_months,...,procedure_Exteriorization of trachea,procedure_Extracorporeal membrane oxygenation,procedure_Invasive ventilation,procedure_Non-invasive ventilation,procedure_Peritoneal dialysis,arterial blood pressure catheter,central venous catheter,endotracheal tube,nasogastric/orogastric tube stylet,urinary catheter
0,1089329724,0,2018-12-23 16:00:00,2018,12,23,16,6,0.0,0.0,...,False,False,False,False,False,0.0,0.0,0.0,0.0,0.0
1,1089329724,0,2018-12-24 16:00:00,2018,12,24,16,0,0.0,0.033333,...,False,False,False,False,False,0.0,0.0,0.0,0.0,0.0
2,1089329724,0,2018-12-25 14:00:00,2018,12,25,14,1,0.0,0.066667,...,False,False,False,False,False,0.0,0.0,0.0,0.0,0.0
3,1702473143,0,2019-01-07 09:00:00,2019,1,7,9,0,1.0,0.0,...,False,False,False,False,False,0.0,0.0,0.0,0.0,0.0
4,1702473143,0,2019-01-07 10:00:00,2019,1,7,10,0,1.0,0.0,...,False,False,False,False,False,0.0,0.0,0.0,0.0,0.0


In [136]:
merged_train.to_csv('device_meds_train.csv', index = False)
merged_test.to_csv('device_meds_test.csv', index = False)

In [137]:
def add_measurement_features(merged, new, agg_hours = 24):
    new.drop(columns = 'visit_occurrence_id', inplace = True)

    # Convert to date time
    new['measurement_datetime'] = pd.to_datetime(new['measurement_datetime']) 

    # Merge new dfs into old ones on person_id and datetime
    new_merged = pd.merge(merged[['person_id', 'datetime']], new, on = 'person_id', how = 'left')

    # Set the amount of time we will allow past values to propagate forward
    delta = pd.Timedelta(hours = agg_hours)
    
    # Remove rows from future or outside the allowed time frame
    new_merged = new_merged[
        (new_merged['measurement_datetime'] >= new_merged['datetime'] - delta) &
        (new_merged['measurement_datetime'] <= new_merged['datetime'])
    ]
    
    # Sort df for forward fillin
    new_merged = new_merged.sort_values(by=['person_id', 'datetime', 'measurement_datetime'], ascending=[True, True, False])
    new_merged.set_index(['person_id', 'datetime'], inplace=True)
    new_merged = new_merged.groupby(['person_id', 'datetime']).ffill().reset_index()
    
    # Take most recent values for each row
    most_recent = new_merged.drop_duplicates(subset=['person_id', 'datetime'], keep='first')
    
    # Merge recent values back on old dfs, ensuring the number of rows is unchanged
    new_merged = merged.merge(most_recent, on=['person_id', 'datetime'], how='left')

    # add a feature for the amount of time since the new values were taked
    new_merged['hours_since_lab'] = (new_merged['datetime'] - new_merged['measurement_datetime']).dt.total_seconds()/3600
    new_merged.drop(columns=['measurement_datetime'], inplace=True)

    return new_merged

In [138]:
labs_train = pd.read_csv('./training_data/measurement_lab_train.csv')
labs_test = pd.read_csv('./testing_data/measurement_lab_test.csv')

single_value_columns = labs_train.columns[labs_train.nunique(dropna=False) == 1].tolist()

labs_train.drop(columns=single_value_columns, inplace=True)
labs_test.drop(columns=single_value_columns, inplace=True)

merged_train = add_measurement_features(merged_train, labs_train)
merged_test = add_measurement_features(merged_test, labs_test)

print(merged_train.shape, merged_test.shape)
merged_train.head()

(331637, 238) (130483, 237)


Unnamed: 0,person_id,SepsisLabel,datetime,datetime_year,datetime_month,datetime_day,datetime_hour,datetime_weekday,MALE,age_in_months,...,Partial thromboplastin time,activated,Total white blood count,Platelet count,White blood cell count,Blood venous pH,D-dimer level,Blood arterial pH,Hemoglobin [Moles/volume] in Blood,hours_since_lab
0,1089329724,0,2018-12-23 16:00:00,2018,12,23,16,6,0.0,0.0,...,,9.1,,,,,,13.8,,0.0
1,1089329724,0,2018-12-24 16:00:00,2018,12,24,16,0,0.0,0.033333,...,,9.1,,,,,,13.8,,24.0
2,1089329724,0,2018-12-25 14:00:00,2018,12,25,14,1,0.0,0.066667,...,,10.0,197.0,,,,,11.7,,0.0
3,1702473143,0,2019-01-07 09:00:00,2019,1,7,9,0,1.0,0.0,...,,,217.0,,,,,14.2,,0.0
4,1702473143,0,2019-01-07 10:00:00,2019,1,7,10,0,1.0,0.0,...,,5.9,,0.0,,,,,,0.0


In [139]:
merged_train.to_csv('labs_train.csv')
merged_test.to_csv('labs_test.csv')

In [140]:
meds_train = pd.read_csv('./training_data/measurement_meds_train.csv')
meds_test = pd.read_csv('./testing_data/measurement_meds_test.csv')

single_value_columns = meds_train.columns[meds_train.nunique(dropna=False) == 1].tolist()

meds_train.drop(columns=single_value_columns, inplace=True)
meds_test.drop(columns=single_value_columns, inplace=True)

merged_train = add_measurement_features(merged_train, meds_train)
merged_test = add_measurement_features(merged_test, meds_test)

print(merged_train.shape, merged_test.shape)
merged_train.head()

(331637, 245) (130483, 244)


Unnamed: 0,person_id,SepsisLabel,datetime,datetime_year,datetime_month,datetime_day,datetime_hour,datetime_weekday,MALE,age_in_months,...,Blood arterial pH,Hemoglobin [Moles/volume] in Blood,hours_since_lab,Systolic blood pressure,Diastolic blood pressure,Body temperature,Respiratory rate,Heart rate,Measurement of oxygen saturation at periphery,Oxygen/Gas total [Pure volume fraction] Inhaled gas
0,1089329724,0,2018-12-23 16:00:00,2018,12,23,16,6,0.0,0.0,...,13.8,,,,,,,,,
1,1089329724,0,2018-12-24 16:00:00,2018,12,24,16,0,0.0,0.033333,...,13.8,,,,,,,,,
2,1089329724,0,2018-12-25 14:00:00,2018,12,25,14,1,0.0,0.066667,...,11.7,,,,,,,,,
3,1702473143,0,2019-01-07 09:00:00,2019,1,7,9,0,1.0,0.0,...,14.2,,,,,,,,,
4,1702473143,0,2019-01-07 10:00:00,2019,1,7,10,0,1.0,0.0,...,,,,,,,,,,


In [141]:
merged_train.to_csv('meds_train.csv', index = False)
merged_test.to_csv('meds_test.csv', index = False)

In [142]:
obs_train = pd.read_csv('./training_data/measurement_observation_train.csv')
obs_test = pd.read_csv('./testing_data/measurement_observation_test.csv')

dummies = pd.get_dummies(obs_train[['Capillary refill [Time]', 'Pulse', 'Arterial pulse pressure', 'Right pupil Pupillary response', 'Left pupil Pupillary response']], dtype=float)
obs_train = pd.concat((obs_train, dummies), axis=1)
obs_train.drop(columns=['Capillary refill [Time]', 'Pulse', 'Arterial pulse pressure', 'Right pupil Pupillary response', 'Left pupil Pupillary response'], inplace=True)

dummies = pd.get_dummies(obs_test[['Capillary refill [Time]', 'Pulse', 'Arterial pulse pressure', 'Right pupil Pupillary response', 'Left pupil Pupillary response']], dtype=float)
obs_test = pd.concat((obs_test, dummies), axis=1)
obs_test.drop(columns=['Capillary refill [Time]', 'Pulse', 'Arterial pulse pressure', 'Right pupil Pupillary response', 'Left pupil Pupillary response'], inplace=True)

merged_train = add_measurement_features(merged_train, obs_train)
merged_test = add_measurement_features(merged_test, obs_test)


print(merged_train.shape, merged_test.shape)
merged_train.head()

(331637, 263) (130483, 262)


Unnamed: 0,person_id,SepsisLabel,datetime,datetime_year,datetime_month,datetime_day,datetime_hour,datetime_weekday,MALE,age_in_months,...,Pulse_Weak,Arterial pulse pressure_Absent,Arterial pulse pressure_Present,Arterial pulse pressure_Weak,Right pupil Pupillary response_Normal,Right pupil Pupillary response_Sluggish,Right pupil Pupillary response_Unresponsive,Left pupil Pupillary response_Normal,Left pupil Pupillary response_Sluggish,Left pupil Pupillary response_Unresponsive
0,1089329724,0,2018-12-23 16:00:00,2018,12,23,16,6,0.0,0.0,...,,,,,,,,,,
1,1089329724,0,2018-12-24 16:00:00,2018,12,24,16,0,0.0,0.033333,...,,,,,,,,,,
2,1089329724,0,2018-12-25 14:00:00,2018,12,25,14,1,0.0,0.066667,...,,,,,,,,,,
3,1702473143,0,2019-01-07 09:00:00,2019,1,7,9,0,1.0,0.0,...,,,,,,,,,,
4,1702473143,0,2019-01-07 10:00:00,2019,1,7,10,0,1.0,0.0,...,,,,,,,,,,


In [143]:
merged_train.to_csv('obs_train.csv', index = False)
merged_test.to_csv('obs_test.csv', index = False)