In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, RegressorMixin
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
data_df = pd.read_csv('/content/historical_weather.csv')

In [4]:
data_df['date'] = pd.to_datetime(data_df['date'], format='%Y-%m-%d') # Change the format string to match the actual format of dates in your data
filtered_data_df = pd.concat([data_df.dropna(subset=['avg_temp_c']), data_df.dropna(subset=['min_temp_c', 'max_temp_c'])])
filtered_data_df = filtered_data_df.drop_duplicates()

data_df['avg_temp_c'] = data_df.apply(
    lambda row: (row['min_temp_c'] + row['max_temp_c']) / 2 if pd.isna(row['avg_temp_c']) and not pd.isna(row['min_temp_c']) and not pd.isna(row['max_temp_c']) else row['avg_temp_c'],
    axis=1
)
data_df = data_df.dropna(subset=['avg_temp_c'])

In [5]:
complete_date_range = pd.date_range(start='2014-01-01', end='2018-12-31')
cities = data_df['city_id'].unique()
full_data = pd.DataFrame({
    'date': np.tile(complete_date_range, len(cities)),
    'city_id': np.repeat(cities, len(complete_date_range))
})
full_data = full_data.merge(data_df, on=['city_id', 'date'], how='left')
full_data['avg_temp_c'] = full_data['avg_temp_c'].interpolate(method='linear')
full_data['avg_temp_c'] = full_data['avg_temp_c'].fillna(method='bfill').ffill()


In [6]:
city_temp_array = []
for city in cities:
    city_data = full_data[full_data['city_id'] == city].sort_values('date')
    city_temp_array.append(city_data['avg_temp_c'].values)
city_temp_array = np.array(city_temp_array)

In [7]:
class SinusoidalRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, freq=1.0):
        self.freq = freq

    def fit(self, X, y):
        X = X.flatten()
        self.coef_ = np.polyfit(X, y, 1)
        return self

    def predict(self, X):
        X = X.flatten()
        return self.coef_[0] * np.sin(self.freq * X) + self.coef_[1]

In [8]:
class BoostedHybrid:
    def __init__(self, model_1, model_2, model_3):
        self.model_1 = model_1
        self.model_2 = model_2
        self.model_3 = model_3

    def fit(self, X_1, X_2, X_3, y):
        self.model_1.fit(X_1, y)
        y_fit_1 = self.model_1.predict(X_1)
        y_resid_1 = y - y_fit_1

        self.model_2.fit(X_2, y_resid_1)
        y_fit_2 = self.model_2.predict(X_2)
        y_resid_2 = y_resid_1 - y_fit_2

        self.model_3.fit(X_3, y_resid_2)
        self.y_fit_1 = y_fit_1
        self.y_fit_2 = y_fit_2
        self.y_resid_2 = y_resid_2

    def predict(self, X_1, X_2, X_3):
        y_pred = self.model_1.predict(X_1)
        y_pred += self.model_2.predict(X_2)
        y_pred += self.model_3.predict(X_3)
        return y_pred

In [9]:
city_temps = city_temp_array[0]
X = np.arange(len(city_temps)).reshape(-1, 1)
y = city_temps

X_train, X_test = X[:1820], X[1820:]
y_train, y_test = y[:1820], y[1820:]

In [10]:
modelMain = BoostedHybrid(
    model_1=SinusoidalRegressor(freq=0.5),
    model_2=XGBRegressor(n_estimators=100, learning_rate=0.1),
    model_3=Ridge(alpha=1.0)
)

modelMain.fit(X_train, X_train, X_train, y_train)
y_pred = modelMain.predict(X_test, X_test, X_test)

In [11]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Predicted values: {y_pred}')
print(f'Actual values: {y_test}')
print(f'RMSE: {rmse}')

Predicted values: [9.35828911 9.3584808  9.35874306 9.3590135  9.35922776 9.35933521]
Actual values: [ 9.4  9.7 10.1  8.9 10.7 11.8]
RMSE: 1.1995169328848636


In [12]:
# Initialize the 2D array to store predictions
num_cities = len(city_temp_array)
predictions_2D_array = np.zeros((num_cities, 7))
print(num_cities)
# Loop through each city's data
for city_index in range(num_cities):
    city_temps = city_temp_array[city_index]  # Use data from the current city
    X = np.arange(len(city_temps)).reshape(-1, 1)
    y = city_temps

    # Create the hybrid model
    model = modelMain
    # BoostedHybrid(
    #     model_1=SinusoidalRegressor(freq=0.01),  # Example frequency
    #     model_2=XGBRegressor(),
    # )

    # Train the model using all 1826 data points
    model.fit(X, X, X, y)  # Note: X is used twice, as X_1 and X_2

    # Predict the next 7 values (for days 1827 to 1833)
    X_future = np.arange(len(city_temps), len(city_temps) + 7).reshape(-1, 1)
    y_pred = model.predict(X_future, X_future, X_future)

    # Store the predictions in the 2D array
    predictions_2D_array[city_index] = y_pred



100


In [13]:
# Assuming predictions_2D_array is already generated and available

# Load the sample submission CSV file
submission_df = pd.read_csv('/content/sample_submission.csv')

# Flatten the predictions_2D_array to match the order in the CSV file
flattened_predictions = predictions_2D_array.flatten()

# Ensure the flattened predictions have the same length as the rows in the submission file
assert len(flattened_predictions) == len(submission_df), "The length of predictions does not match the submission file."

# Write the flattened predictions to the second column of the CSV file
submission_df.iloc[:, 1] = flattened_predictions

# Save the modified submission file
submission_df.to_csv('/content/sample_submission_with_predictions_dam.csv', index=False)

# Display the first few rows of the modified file for verification
submission_df.head()

Unnamed: 0,submission_ID,avg_temp_c
0,1,9.906546
1,2,9.906424
2,3,9.906226
3,4,9.906006
4,5,9.90582


In [14]:
submission_df

Unnamed: 0,submission_ID,avg_temp_c
0,1,9.906546
1,2,9.906424
2,3,9.906226
3,4,9.906006
4,5,9.905820
...,...,...
695,696,21.270901
696,697,21.271173
697,698,21.271404
698,699,21.271536


In [None]:
# from sklearn.metrics import mean_squared_error

# rmse = mean_squared_error(y, y_pred, squared=False)
# print(f"RMSE: {rmse}")