# Weather Data Prediction 
### This notebook uses Ridge regression models to predict the high and low temperatures (`tempmax` and `tempmin`) in weather data covering May 2023 to July 2024.

## Import Libraries and Data

In [43]:
import pandas as pd
from sklearn.linear_model import Ridge
import joblib

# Load the weather CSV into a DataFrame, using the "datetime" column as the
# index and letting pandas parse that index as dates
df = pd.read_csv(
    "Weather_data_2023_2024.csv",
    index_col="datetime",
    parse_dates=True,
)

## Feature Engineering / Pandas for Data Preparation

In [44]:
# Keep only the columns used in this analysis (everything else is discarded)
columns_of_interest = [
    'location', 'tempmax', 'tempmin', 'temp', 'feelslikemax', 'feelslikemin',
    'feelslike', 'dew', 'humidity', 'precip', 'precipprob', 'precipcover',
    'snow', 'snowdepth', 'windgust', 'windspeed', 'winddir',
    'sealevelpressure', 'cloudcover', 'visibility',
    'moonphase', 'conditions', 'description',
]
df = df[columns_of_interest]

# Columns excluded from the model inputs, chosen from the R-squared and VIF
# test results. The prediction columns are listed so they are excluded too
# whenever they are present in df.
columns_to_drop = [
    'location',
    'tempmax',
    'tempmin',
    'temp',
    'feelslikemax',
    'feelslikemin',
    'feelslike',
    'moonphase',
    'conditions',
    'description',
    'predicted_tempmax',
    'predicted_tempmin',
]

# Model inputs: every remaining column; labels not present in df are skipped
features = df.drop(columns=columns_to_drop, errors="ignore")

# Prediction targets
target_max, target_min = df['tempmax'], df['tempmin']

## Analysis / Presentation of Results

In [45]:
# Fit one Ridge model per target on the entire dataset (no train/test split).
# NOTE(review): the alpha values (0.1 and 100.0) presumably come from earlier
# tuning that is not shown in this notebook — confirm.
ridge_max = Ridge(alpha=0.1)
ridge_max.fit(features, target_max)

ridge_min = Ridge(alpha=100.0)
ridge_min.fit(features, target_min)

# Predict from df's own current rows, restricted to the model input columns.
# The predictions are attached to df positionally and df is re-sorted at the
# end of this cell; predicting from `features` (which keeps the pre-sort row
# order) would put values on the wrong rows if this cell were run again.
# These are in-sample predictions: the same rows were used for fitting, so they
# show fit quality rather than performance on unseen data.
model_inputs = df[features.columns]
y_pred_max = ridge_max.predict(model_inputs)
y_pred_min = ridge_min.predict(model_inputs)

# Attach the predictions. Assignment creates the columns when they are missing
# and overwrites them otherwise, so no placeholder columns are required.
df['predicted_tempmax'] = y_pred_max
df['predicted_tempmin'] = y_pred_min

# Sort rows by the datetime index; the predictions are already attached, so
# each one stays with its own row
df = df.sort_index()

# Show the first few rows to compare actual and predicted values
df[['location', 'tempmax', 'predicted_tempmax', 'tempmin', 'predicted_tempmin']].head(10)

Unnamed: 0_level_0,location,tempmax,predicted_tempmax,tempmin,predicted_tempmin
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-05-01,Beijing,25.2,24.793322,8.0,12.334292
2023-05-01,London,17.6,19.210062,11.3,10.509405
2023-05-01,Moscow,9.2,9.025468,3.4,3.586032
2023-05-01,Berlin,18.9,17.225393,3.4,6.747188
2023-05-01,Paris,15.7,16.694966,10.6,9.833487
2023-05-01,Mexico City,32.1,26.003736,5.0,9.184121
2023-05-01,Ottawa,9.9,11.563633,6.8,6.328336
2023-05-01,Rome,19.1,19.479011,14.1,12.152408
2023-05-01,Washington DC,16.0,20.207157,9.9,9.45441
2023-05-02,Ottawa,11.1,11.591042,5.1,4.589413


## Store Results

In [46]:
# Persist both fitted Ridge models to disk so they can be reloaded later
joblib.dump(value=ridge_max, filename='ridge_model_tempmax.pkl')
joblib.dump(value=ridge_min, filename='ridge_model_tempmin.pkl')

# Write the DataFrame, including the prediction columns, to a new CSV file
df.to_csv(path_or_buf="Weather_data_with_predictions.csv")