In [1]:
# Mount Google Drive inside the Colab runtime so files in the shared folder
# can be read and written through the local path /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

# Load the training data from the mounted Drive folder.
data = pd.read_parquet(r'/content/drive/MyDrive/Norvartis Datathon/train_data.parquet')

# Columns stored as plain objects or as pandas Categoricals; their missing
# values are replaced with the 'Desconocido' ("unknown") placeholder below.
categorical_cols = data.select_dtypes(include=['object', 'category']).columns

# A Categorical column only accepts values that are registered categories, so
# the placeholder must be added before fillna can use it. add_categories raises
# ValueError when the label is already a category, hence the membership check.
for col in categorical_cols:
    is_categorical = data[col].dtype.name == 'category'
    if is_categorical and 'Desconocido' not in data[col].cat.categories:
        data[col] = data[col].cat.add_categories('Desconocido')

# Fill missing values: placeholder label for categorical columns, -1 for the
# numeric hospital_rate column.
data[categorical_cols] = data[categorical_cols].fillna('Desconocido')
data['hospital_rate'] = data['hospital_rate'].fillna(-1)

# Derive the calendar year from the date column (uses the .dt accessor, so
# `date` must be a datetime-like column).
data['year'] = data['date'].dt.year

# Check overall data
display(data.head())

Unnamed: 0,brand,phase,country,dayweek,month,wd_perc,ther_area,hospital_rate,n_nwd_bef,n_nwd_aft,...,n_weekday_1,n_weekday_2,n_weekday_3,n_weekday_4,date,wd,wd_left,monthly,main_channel,year
0,AIMST,0.006284,Aldovia,2.0,1.0,0.045455,Desconocido,-1.0,4.0,0.0,...,4,5,5,4,2013-01-02,1,21,0.008092,Desconocido,2013
1,AIMST,0.123459,Aldovia,3.0,1.0,0.090909,Desconocido,-1.0,0.0,0.0,...,4,5,5,4,2013-01-03,2,20,0.008092,Desconocido,2013
2,AIMST,0.055607,Aldovia,4.0,1.0,0.136364,Desconocido,-1.0,0.0,2.0,...,4,5,5,4,2013-01-04,3,19,0.008092,Desconocido,2013
3,AIMST,0.032148,Aldovia,0.0,1.0,0.181818,Desconocido,-1.0,2.0,0.0,...,4,5,5,4,2013-01-07,4,18,0.008092,Desconocido,2013
4,AIMST,0.097054,Aldovia,1.0,1.0,0.227273,Desconocido,-1.0,0.0,0.0,...,4,5,5,4,2013-01-08,5,17,0.008092,Desconocido,2013


In [4]:
# Imports used by this export cell, grouped at the top of the cell.
import os

import pyarrow as pa
import pyarrow.parquet as pq

# Columns removed from the exported frames.
cols_to_drop = [
    "ther_area", "hospital_rate", "n_nwd_bef", "n_nwd_aft",
    "n_weekday_0", "n_weekday_1", "n_weekday_2", "n_weekday_3", "n_weekday_4",
    "main_channel",
]

# Two variants of the cleaned frame: one keeps the `date` column, the other
# drops it as well. `data` itself is left unmodified (drop returns a copy).
clean_df_with_date = data.drop(columns=cols_to_drop)
clean_df = data.drop(columns=cols_to_drop + ["date"])

# Folder on the mounted Drive where the cleaned frames are saved as .csv and
# .parquet files. Defined once so the four target paths cannot drift apart.
output_dir = "/content/drive/MyDrive/Norvartis Datathon/GitHub files/cleaned_dataframes"

# The writers below fail if the destination folder does not exist, so create it
# up front; exist_ok=True makes re-running the cell harmless.
os.makedirs(output_dir, exist_ok=True)

path_with_date = output_dir + "/clean_df_with_date.csv"
path = output_dir + "/clean_df.csv"
path_with_date_parquet = output_dir + "/clean_df_with_date.parquet"
path_parquet = output_dir + "/clean_df.parquet"

# Export the DataFrames to CSV files (index=False: the row index is not written).
clean_df_with_date.to_csv(path_with_date, index=False)
clean_df.to_csv(path, index=False)

# Export the DataFrames to Parquet files via pyarrow tables.
table_with_date = pa.Table.from_pandas(clean_df_with_date)
pq.write_table(table_with_date, path_with_date_parquet)

table = pa.Table.from_pandas(clean_df)
pq.write_table(table, path_parquet)