In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nbformat.v1.nbjson import write

In [None]:
# Read the three input CSVs (features_train, labels_train, features_test),
# in that order, into df_train, df_labels and df_test.
# NOTE(review): these are absolute paths inside one user's Windows profile, so
# the notebook only runs unchanged on that machine.
df_train, df_labels, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"C:\Users\prath\PycharmProjects\CodeSocHackathon\DiseaseSpreadPrediction\Data\dengue_features_train.csv",
        r"C:\Users\prath\PycharmProjects\CodeSocHackathon\DiseaseSpreadPrediction\Data\dengue_labels_train.csv",
        r"C:\Users\prath\PycharmProjects\CodeSocHackathon\DiseaseSpreadPrediction\Data\dengue_features_test.csv",
    )
)

In [None]:
# Show the column names of the labels frame.
print("Columns in df_labels:", list(df_labels.columns))

In [None]:
# Parse week_start_date in both frames into datetime values.
# format="mixed" (available from pandas 2.0) infers the layout of every value
# individually, so rows written in different date layouts are handled in one
# call; errors="coerce" turns anything unparseable into NaT instead of raising.
# NOTE(review): with the default dayfirst=False, an ambiguous string such as
# "05-06-2000" is read month-first. If the non-ISO rows are DD-MM-YYYY, pass
# dayfirst=True — confirm against the raw data.
for frame in (df_train, df_test):
    frame["week_start_date"] = pd.to_datetime(
        frame["week_start_date"], format="mixed", errors="coerce"
    )

In [None]:
# Attach the labels to the training features. The inner join keeps only rows
# whose (city, year, weekofyear) key is present in both frames.
join_keys = ["city", "year", "weekofyear"]
df_train = df_train.merge(df_labels, how="inner", on=join_keys)

# When both sides of the merge carry a total_cases column, pandas suffixes the
# pair as _x (left) and _y (right). Keep the left-hand copy under the plain
# name and discard the right-hand one.
suffixed_case_cols = {"total_cases_x", "total_cases_y"}
if suffixed_case_cols.issubset(df_train.columns):
    df_train = df_train.rename(columns={"total_cases_x": "total_cases"}).drop(
        columns=["total_cases_y"]
    )

In [None]:
# week_start_date is not carried into the cleaned output; remove it from both
# frames. Raises KeyError if the column is already gone (e.g. on a re-run).
df_train = df_train.drop(columns="week_start_date")
df_test = df_test.drop(columns="week_start_date")

In [None]:
# Report the number of null entries in each column of the training frame.
print("\nMissing Values:", df_train.isna().sum(), sep="\n")

In [None]:
# Collect the labels of the numeric-dtype columns in each frame.
numeric_cols_train, numeric_cols_test = (
    frame.select_dtypes(include=np.number).columns for frame in (df_train, df_test)
)

# Keep only the columns that are numeric in BOTH frames, so later steps can
# index either frame with the same list. Columns present in just one frame
# are left out of this set.
common_numeric_cols = numeric_cols_train.intersection(numeric_cols_test)

In [None]:
# Impute missing numeric values with per-column medians.
# The medians are computed once, on the training frame, and the same values
# are used to fill the test frame. Filling the test frame with its own medians
# would give train and test different fill values for the same feature and
# make the test transform depend on the test data itself.
train_medians = df_train[common_numeric_cols].median()
df_train[common_numeric_cols] = df_train[common_numeric_cols].fillna(train_medians)
df_test[common_numeric_cols] = df_test[common_numeric_cols].fillna(train_medians)

In [None]:
# One-hot encode `city` so that both frames end up with the SAME dummy columns.
# Calling get_dummies on each frame independently derives the levels (and the
# level removed by drop_first) from whatever cities that frame happens to
# contain, so the two outputs can disagree. Casting to one shared categorical
# dtype first makes get_dummies emit a column for every shared level in both
# frames; when both frames already hold the same cities the result is the same
# as encoding them separately.
city_levels = sorted(
    set(df_train["city"].dropna().unique()) | set(df_test["city"].dropna().unique())
)
city_dtype = pd.CategoricalDtype(categories=city_levels)

df_train = pd.get_dummies(
    df_train.astype({"city": city_dtype}), columns=["city"], drop_first=True
)
df_test = pd.get_dummies(
    df_test.astype({"city": city_dtype}), columns=["city"], drop_first=True
)

In [None]:
# Write both cleaned frames to CSV without the index column.
# NOTE(review): the destinations are absolute paths inside one user's Windows
# profile, so this cell only works unchanged on that machine.
for cleaned_frame, output_path in (
    (df_train, r"C:\Users\prath\PycharmProjects\CodeSocHackathon\DiseaseSpreadPrediction\Data\dengue_train_cleaned.csv"),
    (df_test, r"C:\Users\prath\PycharmProjects\CodeSocHackathon\DiseaseSpreadPrediction\Data\dengue_test_cleaned.csv"),
):
    cleaned_frame.to_csv(output_path, index=False)

In [None]:
# Status message for the end of the preprocessing notebook.
completion_message = "Data preprocessing completed and saved."
print(completion_message)