In [28]:
import warnings
warnings.filterwarnings('ignore')

In [29]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [30]:
df=pd.read_csv('/kaggle/input/predicta-1-0-predict-the-unpredictable/historical_weather.csv')
submission=pd.read_csv('/kaggle/input/predicta-1-0-predict-the-unpredictable/sample_submission.csv')
submission_key=pd.read_csv('/kaggle/input/predicta-1-0-predict-the-unpredictable/submission_key.csv')

In [35]:
df.isna().sum()

city_id                    0
date                       0
avg_temp_c              1224
min_temp_c              5886
max_temp_c              7493
precipitation_mm       69744
snow_depth_mm         170100
avg_wind_dir_deg       35394
avg_wind_speed_kmh     22472
dtype: int64

In [36]:
df.columns

Index(['city_id', 'date', 'avg_temp_c', 'min_temp_c', 'max_temp_c',
       'precipitation_mm', 'snow_depth_mm', 'avg_wind_dir_deg',
       'avg_wind_speed_kmh'],
      dtype='object')

In [5]:
# Custom Sinusoidal Regressor
class SinusoidalRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, freq=0.01):
        self.freq = freq
        self.model = LinearRegression()

    def fit(self, X, y):
        X_sin = np.hstack([X, np.sin(self.freq * X), np.cos(self.freq * X)])
        self.model.fit(X_sin, y)
        return self

    def predict(self, X):
        X_sin = np.hstack([X, np.sin(self.freq * X), np.cos(self.freq * X)])
        return self.model.predict(X_sin)

In [6]:
# Custom Boosted Hybrid Model
class BoostedHybrid(BaseEstimator, RegressorMixin):
    def __init__(self, model1, model2):
        self.model1 = model1
        self.model2 = model2

    def fit(self, X, X_model2, y):
        self.model1.fit(X, y)
        residuals = y - self.model1.predict(X)
        self.model2.fit(X_model2, residuals)
        return self

    def predict(self, X, X_model2):
        prediction = self.model1.predict(X) + self.model2.predict(X_model2)
        return prediction

In [14]:
# Load data (example dataset)
data = pd.read_csv('/kaggle/input/predicta-1-0-predict-the-unpredictable/historical_weather.csv', parse_dates=['date'], index_col=['date', 'city_id'])

In [8]:
# Define models
model = BoostedHybrid(
    model1=SinusoidalRegressor(freq=0.01),
    model2=xgb.XGBRegressor(objective='reg:squarederror')
)

In [9]:
# Iterate through each city to train and predict
num_cities = data.index.get_level_values('city_id').nunique()

In [15]:
data = data.sort_index()

# Group by city_id and forward fill missing values within each group
for city_id in data.index.get_level_values('city_id').unique():
    city_mask = (data.index.get_level_values('city_id') == city_id)
    last_year_temp = data.loc[city_mask, 'avg_temp_c'].shift(365)  # Shift by 365 days (approx. one year)
    data.loc[city_mask, 'avg_temp_c'] = data.loc[city_mask, 'avg_temp_c'].fillna(last_year_temp)

# Linear interpolation to fill any remaining missing values
data['avg_temp_c'] = data.groupby(data.index.get_level_values('city_id'))['avg_temp_c'].transform(lambda x: x.interpolate(method='linear'))    

In [18]:
data.isna().sum()

avg_temp_c            0
min_temp_c            0
max_temp_c            0
precipitation_mm      3
snow_depth_mm         3
avg_wind_dir_deg      0
avg_wind_speed_kmh    0
dtype: int64

In [17]:
# Handle missing values in features
data.fillna(method='ffill', inplace=True)

In [19]:
predictions_list = []

# Iterate through each city to train and predict
for city_id in data.index.get_level_values('city_id').unique():
    city_data = data.xs(city_id, level='city_id')
    
    # Handle missing values in target
    # city_data['avg_temp_c'].interpolate(method='linear', inplace=True)
    city_temps = city_data['avg_temp_c'].values
    
    X = np.arange(len(city_temps)).reshape(-1, 1)
    y = city_temps
    
    # Train the model using all data points
    model.fit(X, X, y)
    
    # Predict the next 7 values (for days 1827 to 1833)
    X_future = np.arange(len(city_temps), len(city_temps) + 7).reshape(-1, 1)
    future_predictions = model.predict(X_future, X_future)
    
    # Collect predictions
    future_dates = pd.date_range(start=city_data.index[-1] + pd.Timedelta(days=1), periods=7, freq='D')
    for date, prediction in zip(future_dates, future_predictions):
        predictions_list.append([date, city_id, prediction])

In [20]:
# Create a DataFrame for predictions
predictions_df = pd.DataFrame(predictions_list, columns=['date', 'city_id', 'avg_temp_c'])

In [21]:
predictions_df # new

Unnamed: 0,date,city_id,avg_temp_c
0,2019-01-01,C001,9.802389
1,2019-01-02,C001,9.803722
2,2019-01-03,C001,9.805049
3,2019-01-04,C001,9.806370
4,2019-01-05,C001,9.807684
...,...,...,...
695,2019-01-03,C112,21.346875
696,2019-01-04,C112,21.347400
697,2019-01-05,C112,21.347922
698,2019-01-06,C112,21.348441


In [44]:
predictions_df

Unnamed: 0,date,city_id,avg_temp_c
0,2019-01-01,C001,9.802389
1,2019-01-02,C001,9.803722
2,2019-01-03,C001,9.805049
3,2019-01-04,C001,9.806370
4,2019-01-05,C001,9.807684
...,...,...,...
695,2019-01-03,C112,21.346875
696,2019-01-04,C112,21.347400
697,2019-01-05,C112,21.347922
698,2019-01-06,C112,21.348441


In [45]:
submission_key

Unnamed: 0,submission_ID,city_id,date
0,1,C001,2019-01-01
1,2,C001,2019-01-02
2,3,C001,2019-01-03
3,4,C001,2019-01-04
4,5,C001,2019-01-05
...,...,...,...
695,696,C112,2019-01-03
696,697,C112,2019-01-04
697,698,C112,2019-01-05
698,699,C112,2019-01-06


In [22]:
submission_key['date'] = pd.to_datetime(submission_key['date'])

In [23]:
# Merge predictions with submission_key
submission_key = submission_key.merge(predictions_df, on=['date', 'city_id'], how='left')

In [24]:
# Add submission_ID
submission_key['submission_ID'] = range(1, len(submission_key) + 1)

In [25]:
# Reorder columns for submission format
submission_df = submission_key[['submission_ID', 'avg_temp_c']]

In [26]:
# Save predictions to CSV
submission_df.to_csv('submission.csv', index=False)

In [27]:
from IPython.display import FileLink
FileLink(r'submission.csv')

In [52]:
submission_df

Unnamed: 0,submission_ID,avg_temp_c
0,1,9.802389
1,2,9.803722
2,3,9.805049
3,4,9.806370
4,5,9.807684
...,...,...
695,696,21.346875
696,697,21.347400
697,698,21.347922
698,699,21.348441
