In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [2]:
historical_weather = pd.read_csv('historical_weather.csv')
submission_key = pd.read_csv('submission_key.csv')

In [3]:
historical_weather.head()

Unnamed: 0,city_id,date,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,snow_depth_mm,avg_wind_dir_deg,avg_wind_speed_kmh
0,C001,1/1/2014,6.6,-1.4,11.6,,,168.0,6.2
1,C001,1/2/2014,9.3,6.3,13.3,,,155.0,10.0
2,C001,1/3/2014,7.6,1.9,14.0,,,,5.8
3,C001,1/4/2014,7.6,3.9,13.3,,,291.0,11.3
4,C001,1/5/2014,8.6,0.5,16.9,,,,5.0


In [4]:
submission_key.head()

Unnamed: 0,submission_ID,city_id,date
0,1,C001,1/1/2019
1,2,C001,1/2/2019
2,3,C001,1/3/2019
3,4,C001,1/4/2019
4,5,C001,1/5/2019


In [5]:
historical_weather.isnull().sum()

city_id                    0
date                       0
avg_temp_c              1224
min_temp_c              5886
max_temp_c              7493
precipitation_mm       69744
snow_depth_mm         170100
avg_wind_dir_deg       35394
avg_wind_speed_kmh     22472
dtype: int64

In [6]:
historical_weather.fillna(historical_weather.mean(), inplace=True)

In [7]:
historical_weather.isnull().sum()

city_id               0
date                  0
avg_temp_c            0
min_temp_c            0
max_temp_c            0
precipitation_mm      0
snow_depth_mm         0
avg_wind_dir_deg      0
avg_wind_speed_kmh    0
dtype: int64

In [8]:
historical_weather.head()

Unnamed: 0,city_id,date,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,snow_depth_mm,avg_wind_dir_deg,avg_wind_speed_kmh
0,C001,1/1/2014,6.6,-1.4,11.6,3.679964,342.481696,168.0,6.2
1,C001,1/2/2014,9.3,6.3,13.3,3.679964,342.481696,155.0,10.0
2,C001,1/3/2014,7.6,1.9,14.0,3.679964,342.481696,175.650289,5.8
3,C001,1/4/2014,7.6,3.9,13.3,3.679964,342.481696,291.0,11.3
4,C001,1/5/2014,8.6,0.5,16.9,3.679964,342.481696,175.650289,5.0


In [9]:
# Create new features
historical_weather['year'] = pd.DatetimeIndex(historical_weather['date']).year
historical_weather['month'] = pd.DatetimeIndex(historical_weather['date']).month
historical_weather['day'] = pd.DatetimeIndex(historical_weather['date']).day

# Lag features: Shift the avg_temp_c column by 7 days to get last week's average temperature
historical_weather['avg_temp_lag_7'] = historical_weather['avg_temp_c'].shift(7)

# Rolling averages: Rolling average of the last 7 days
historical_weather['avg_temp_roll_7'] = historical_weather['avg_temp_c'].rolling(window=7).mean()

# Fill any new NaN values that were created by shifting/rolling
historical_weather.fillna(historical_weather.mean(), inplace=True)


In [10]:
# Convert 'city_id' to categorical and then to numeric
historical_weather['city_id'] = historical_weather['city_id'].astype('category')
historical_weather['city_id'] = historical_weather['city_id'].cat.codes


In [11]:
# Import necessary libraries for modeling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Define the target variable and features
X = historical_weather.drop(columns=['avg_temp_c', 'date'])
y = historical_weather['avg_temp_c']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Evaluate the model (optional)
from sklearn.metrics import mean_squared_error

y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Root Mean Squared Error: {rmse}')


Root Mean Squared Error: 0.9007022509982692


In [12]:
from sklearn.linear_model import LinearRegression

# Create a Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Evaluate the model
y_pred_linear = linear_model.predict(X_test)
rmse_linear = np.sqrt(mean_squared_error(y_test, y_pred_linear))
print(f'Linear Regression RMSE: {rmse_linear}')


Linear Regression RMSE: 1.5201360259633752


In [None]:
from sklearn.svm import SVR

# Create an SVM model with a linear kernel
svm_model = SVR(kernel='linear')
svm_model.fit(X_train, y_train)

# Evaluate the model
y_pred_svm = svm_model.predict(X_test)
rmse_svm = np.sqrt(mean_squared_error(y_test, y_pred_svm))
print(f'SVM (Linear Kernel) RMSE: {rmse_svm}')