# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

def load_and_preprocess_data(covid_file_path, weather_file_path):
    """
    Loads, preprocesses, and merges the COVID-19 and weather datasets.

    COVID-19 rows whose 'location' is 'France' are summed per calendar month,
    weather rows are averaged per calendar month, and the two monthly tables
    are inner-joined on the month.

    Args:
        covid_file_path (str): The file path for the OWID COVID-19 data.
            The CSV must provide the columns 'location', 'date',
            'new_cases' and 'new_deaths'.
        weather_file_path (str): The file path for the French weather data.
            The CSV is read with ';' as separator and must provide the
            columns 'AAAAMM', 'TM' and 'RR'.

    Returns:
        pandas.DataFrame or None: One row per month with the columns
        'year_month', 'new_cases', 'new_deaths', 'transmission_rate',
        'avg_temp' and 'precipitation'. None is returned (after printing an
        error message) when either input file does not exist.
    """
    # Load COVID-19 data; only the read itself is guarded so that unrelated
    # errors are not mistaken for a missing file.
    try:
        covid_df = pd.read_csv(covid_file_path)
    except FileNotFoundError:
        print(f"Error: The file '{covid_file_path}' was not found.")
        return None
    covid_df_france = covid_df[covid_df['location'] == 'France'].copy()

    # Load weather data
    try:
        weather_df = pd.read_csv(weather_file_path, sep=';')
    except FileNotFoundError:
        print(f"Error: The file '{weather_file_path}' was not found.")
        return None

    # --- Preprocessing COVID Data ---
    covid_df_france['date'] = pd.to_datetime(covid_df_france['date'])
    # Extract year and month for merging
    covid_df_france['year_month'] = covid_df_france['date'].dt.to_period('M')

    # Aggregate COVID data to a monthly level
    monthly_covid_data = covid_df_france.groupby('year_month').agg({
        'new_cases': 'sum',
        'new_deaths': 'sum'
    }).reset_index()

    # Calculate transmission rate (monthly % change in new cases).
    # The change is undefined for the first month (NaN), for 0 -> 0 (NaN) and
    # for 0 -> x (+/-inf). All undefined values are set to 0 so that no
    # infinite value reaches the model, which would reject it.
    monthly_change = monthly_covid_data['new_cases'].pct_change() * 100
    monthly_covid_data['transmission_rate'] = (
        monthly_change
        .replace([float('inf'), float('-inf')], float('nan'))
        .fillna(0)
    )

    # --- Preprocessing Weather Data ---
    # 'AAAAMM' is YearMonth, 'TM' is average temperature, 'RR' is precipitation
    # We rename them for clarity
    weather_df = weather_df[['AAAAMM', 'TM', 'RR']].rename(columns={
        'AAAAMM': 'year_month',
        'TM': 'avg_temp',
        'RR': 'precipitation'
    })

    # Convert year_month to period for merging
    weather_df['year_month'] = pd.to_datetime(
        weather_df['year_month'], format='%Y%m'
    ).dt.to_period('M')

    # The weather file can hold several rows per month (one per station), so
    # average temperature and precipitation over all rows of each month.
    monthly_weather_data = weather_df.groupby('year_month').agg({
        'avg_temp': 'mean',
        'precipitation': 'mean'
    }).reset_index()

    # Handle missing values in the aggregated weather data
    # For simplicity, we fill with the median of the monthly values.
    for column in ('avg_temp', 'precipitation'):
        monthly_weather_data[column] = monthly_weather_data[column].fillna(
            monthly_weather_data[column].median()
        )

    # --- Merging Data ---
    # Inner join: only months present in both datasets are kept.
    merged_df = pd.merge(monthly_covid_data, monthly_weather_data, on='year_month')

    # Drop rows with NaN values that might remain after merging
    merged_df = merged_df.dropna()

    return merged_df

def train_and_evaluate_model(df, target_column):
    """
    Trains a RandomForestRegressor model and evaluates its performance.

    The model predicts ``target_column`` from the 'avg_temp' and
    'precipitation' columns. Rows with a NaN or infinite value in any of
    these columns are ignored. The remaining rows are split 80/20 into
    training and test sets, metrics are printed, and an actual-vs-predicted
    scatter plot is shown.

    Args:
        df (pandas.DataFrame): The preprocessed DataFrame.
        target_column (str): The name of the column to predict ('new_cases', 'new_deaths', 'transmission_rate').

    Returns:
        dict or None: A dictionary with the keys 'mae', 'mse' and 'r2'
        holding the model's evaluation metrics, or None when training is
        skipped because fewer than two usable rows are available.
    """
    print(f"\n--- Training model for: {target_column} ---")

    features = ['avg_temp', 'precipitation']

    # Keep only finite rows: the estimator raises on NaN / infinity.
    # dict.fromkeys de-duplicates while keeping order, in case the target is
    # also one of the feature columns.
    needed_columns = list(dict.fromkeys(features + [target_column]))
    model_data = (
        df[needed_columns]
        .replace([float('inf'), float('-inf')], float('nan'))
        .dropna()
    )

    # Define features (X) and target (y)
    X = model_data[features]
    y = model_data[target_column]

    if y.empty:
        print(f"Skipping training for {target_column} as there is no data.")
        return None

    # A train/test split needs at least one row on each side.
    if len(y) < 2:
        print(
            f"Skipping training for {target_column}: "
            f"at least 2 usable rows are required, found {len(y)}."
        )
        return None

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Evaluation metrics for {target_column}:")
    print(f"  Mean Absolute Error (MAE): {mae:.2f}")
    print(f"  Mean Squared Error (MSE): {mse:.2f}")
    print(f"  R-squared (R2 Score): {r2:.2f}")

    # --- Feature Importance ---
    feature_importances = pd.Series(model.feature_importances_, index=features)
    print("\nFeature Importances:")
    print(feature_importances)

    # --- Visualization of Predictions ---
    plt.figure(figsize=(12, 6))
    plt.scatter(y_test, y_pred, alpha=0.7)
    # Dashed diagonal marks perfect predictions (predicted == actual).
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '--', color='red', linewidth=2)
    plt.title(f'Actual vs. Predicted - {target_column}')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.grid(True)
    plt.show()

    return {'mae': mae, 'mse': mse, 'r2': r2}

def visualize_data(df):
    """
    Creates visualizations for exploratory data analysis.

    Shows two figures: a three-panel monthly time series (new cases, new
    deaths, and temperature with precipitation on a secondary y-axis) and a
    correlation heatmap of the COVID-19 and weather columns. The input
    DataFrame is not modified.

    Args:
        df (pandas.DataFrame): The preprocessed DataFrame. It must contain
            the columns 'year_month' (period dtype), 'new_cases',
            'new_deaths', 'transmission_rate', 'avg_temp' and
            'precipitation'.
    """
    print("\n--- Visualizing Data ---")

    # Convert period to timestamp for plotting. Kept as a local series so the
    # caller's DataFrame does not gain an extra column as a side effect.
    dates = df['year_month'].dt.to_timestamp()

    # Time series plots
    fig, axes = plt.subplots(3, 1, figsize=(15, 12), sharex=True)
    fig.suptitle('Monthly COVID-19 Metrics and Weather in France', fontsize=16)

    axes[0].plot(dates, df['new_cases'], label='New Cases', color='blue')
    axes[0].set_ylabel('New Cases')
    axes[0].legend()
    axes[0].grid(True)

    axes[1].plot(dates, df['new_deaths'], label='New Deaths', color='red')
    axes[1].set_ylabel('New Deaths')
    axes[1].legend()
    axes[1].grid(True)

    # Temperature on the left y-axis, precipitation bars on a twin right axis.
    ax2_twin = axes[2].twinx()
    axes[2].plot(dates, df['avg_temp'], label='Avg Temperature (°C)', color='orange')
    ax2_twin.bar(dates, df['precipitation'], label='Precipitation (mm)', color='gray', alpha=0.5, width=20)
    axes[2].set_ylabel('Avg Temperature (°C)')
    ax2_twin.set_ylabel('Precipitation (mm)')
    axes[2].legend(loc='upper left')
    ax2_twin.legend(loc='upper right')
    axes[2].grid(True)

    # Label the bottom panel explicitly: after twinx() the "current" axes is
    # the twin, whose x-axis is hidden, so plt.xlabel() would not be shown.
    axes[2].set_xlabel('Date')
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

    # Correlation Heatmap
    plt.figure(figsize=(10, 7))
    correlation_columns = ['new_cases', 'new_deaths', 'transmission_rate', 'avg_temp', 'precipitation']
    sns.heatmap(df[correlation_columns].corr(), annot=True, cmap='coolwarm', fmt=".2f")
    plt.title('Correlation Matrix')
    plt.show()


# --- Main Execution ---
if __name__ == "__main__":
    # IMPORTANT: point these at the local copies of the two datasets.
    covid_data_path = 'owid-covid-data.csv'
    weather_data_path = 'donnees-synop-essentielles-omm.csv'

    # Step 1: build the merged monthly COVID-19 / weather table.
    final_df = load_and_preprocess_data(covid_data_path, weather_data_path)

    if final_df is None or final_df.empty:
        # Loading failed or the merge produced no rows: nothing to plot or fit.
        print("\nCould not proceed with training due to data loading or processing errors.")
    else:
        print("--- Data Preprocessing and Merging Complete ---")
        print("First 5 rows of the final dataset:")
        print(final_df.head())
        print("\nDataset Info:")
        final_df.info()

        # Step 2: exploratory plots.
        visualize_data(final_df)

        # Step 3: fit and evaluate one model per target column.
        targets = ['new_cases', 'new_deaths', 'transmission_rate']
        for target in targets:
            train_and_evaluate_model(final_df, target)

