In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the three competition CSV files from the working directory:
# the historical observations, the key of rows to forecast, and the
# submission template.
historical_weather, submission_key, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in (
        'historical_weather.csv',
        'submission_key.csv',
        'sample_submission.csv',
    )
)


In [3]:
# Inspect the raw table before cleaning: prints each column's dtype and
# non-null count, which shows which columns contain missing values.
historical_weather.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182338 entries, 0 to 182337
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   city_id             182338 non-null  object 
 1   date                182338 non-null  object 
 2   avg_temp_c          181114 non-null  float64
 3   min_temp_c          176452 non-null  float64
 4   max_temp_c          174845 non-null  float64
 5   precipitation_mm    112594 non-null  float64
 6   snow_depth_mm       12238 non-null   float64
 7   avg_wind_dir_deg    146944 non-null  float64
 8   avg_wind_speed_kmh  159866 non-null  float64
dtypes: float64(7), object(2)
memory usage: 12.5+ MB


In [4]:
# Parse the string `date` column into datetime64 in both frames so that
# the `.dt` accessor and chronological comparisons work later on.
for frame in (historical_weather, submission_key):
    frame['date'] = pd.to_datetime(frame['date'])


In [5]:
# Feature engineering: derive calendar features from the parsed `date` column.
# Each new column name is mapped to the `.dt` attribute that supplies it.
# Note that `day` holds the day of the year, not the day of the month.
calendar_features = {'year': 'year', 'month': 'month', 'day': 'dayofyear'}
for column_name, dt_attribute in calendar_features.items():
    historical_weather[column_name] = getattr(historical_weather['date'].dt, dt_attribute)


In [6]:
# Handle missing values: replace NaNs in every numeric column with that
# column's overall mean (computed across all cities and dates).
#
# The filled result is assigned back to the frame rather than calling
# `historical_weather[col].fillna(..., inplace=True)`. That chained in-place
# form operates on an intermediate Series: recent pandas versions emit a
# warning for it, and with Copy-on-Write enabled it leaves
# `historical_weather` unchanged, so the NaNs would silently survive.
#
# NOTE(review): this also imputes the target `avg_temp_c` and the very sparse
# `snow_depth_mm` column with a global mean -- confirm that is intended.
numeric_cols = historical_weather.select_dtypes(include=np.number).columns
column_means = historical_weather[numeric_cols].mean()
historical_weather[numeric_cols] = historical_weather[numeric_cols].fillna(column_means)
    
    

In [7]:
# Assemble the design matrix and the regression target.
# Predictors: calendar features plus the other same-day measurements.
features = [
    'year',
    'month',
    'day',
    'min_temp_c',
    'max_temp_c',
    'precipitation_mm',
    'snow_depth_mm',
    'avg_wind_dir_deg',
    'avg_wind_speed_kmh',
]
# Target: the daily average temperature.
X, y = historical_weather[features], historical_weather['avg_temp_c']


In [8]:
# Hold out 20% of the rows for validation; the fixed seed keeps the
# random split reproducible across runs.
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_val, y_train, y_val = split_parts


In [9]:

# Fit a 100-tree Random Forest regressor on the training split; the seed
# makes the fitted forest reproducible.
forest_params = dict(n_estimators=100, random_state=42)
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)


RandomForestRegressor(random_state=42)

In [10]:

# Score the fitted model on the held-out rows: root-mean-squared error
# between the true and predicted average temperature.
y_pred = model.predict(X_val)
validation_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(validation_mse)
print(f'Validation RMSE: {rmse}')


Validation RMSE: 1.6098338826051866


In [11]:
# Forecast every submission row as the mean `avg_temp_c` over the 7 most
# recent historical days for that city strictly before the requested date.
# NOTE(review): the Random Forest fitted earlier is not used here; the
# submission is a per-city trailing-mean baseline.
#
# The history is sorted (newest first) and grouped by city once, up front,
# instead of re-filtering and re-sorting the full table for every
# submission row.
recent_first = (
    historical_weather[['city_id', 'date', 'avg_temp_c']]
    .sort_values(by='date', ascending=False)
)
# groupby keeps the row order inside each group, so each city's frame
# stays sorted newest-first.
history_by_city = {
    city_id: city_rows for city_id, city_rows in recent_first.groupby('city_id')
}
# Stand-in for cities with no history at all: the mean of an empty selection
# is NaN, matching what a per-row filter on the full table would produce.
empty_history = recent_first.iloc[0:0]

predictions = []
# itertuples avoids building a Series per row as iterrows does.
for row in submission_key.itertuples(index=False):
    city_data = history_by_city.get(row.city_id, empty_history)
    # Newest-first order means head(7) is the last 7 days before the target date.
    relevant_data = city_data[city_data['date'] < row.date].head(7)
    predictions.append(relevant_data['avg_temp_c'].mean())


In [12]:
# Attach the forecasts to the submission template. Values are matched by
# position, so `predictions` must have one entry per template row.
# NOTE(review): assumes `sample_submission` rows are in the same order as
# `submission_key` -- confirm.
sample_submission = sample_submission.assign(avg_temp_c=predictions)
# Export is currently disabled; to write the file, run:
#   sample_submission.to_csv('submission.csv', index=False)
predictions

[9.942857142857141,
 9.942857142857141,
 9.942857142857141,
 9.942857142857141,
 9.942857142857141,
 9.942857142857141,
 9.942857142857141,
 14.62857142857143,
 14.62857142857143,
 14.62857142857143,
 14.62857142857143,
 14.62857142857143,
 14.62857142857143,
 14.62857142857143,
 27.057142857142853,
 27.057142857142853,
 27.057142857142853,
 27.057142857142853,
 27.057142857142853,
 27.057142857142853,
 27.057142857142853,
 -0.9285714285714288,
 -0.9285714285714288,
 -0.9285714285714288,
 -0.9285714285714288,
 -0.9285714285714288,
 -0.9285714285714288,
 -0.9285714285714288,
 23.771428571428572,
 23.771428571428572,
 23.771428571428572,
 23.771428571428572,
 23.771428571428572,
 23.771428571428572,
 23.771428571428572,
 20.9,
 20.9,
 20.9,
 20.9,
 20.9,
 20.9,
 20.9,
 4.557142857142857,
 4.557142857142857,
 4.557142857142857,
 4.557142857142857,
 4.557142857142857,
 4.557142857142857,
 4.557142857142857,
 18.74285714285714,
 18.74285714285714,
 18.74285714285714,
 18.74285714285714,
 18