In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from statsmodels.tsa.deterministic import DeterministicProcess
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [9]:
# Load the data
data_df = pd.read_csv('/content/historical_weather.csv')  # Replace with the correct file path
submission_key = pd.read_csv('/content/submission_key.csv')
sample_submission = pd.read_csv('/content/sample_submission.csv')

In [10]:
class BoostedHybrid:
    def __init__(self, model_1, model_2):
        self.model_1 = model_1
        self.model_2 = model_2
        self.y_columns = None

    def fit(self, X_1, X_2, y):
        self.model_1.fit(X_1, y)
        y_fit = pd.Series(self.model_1.predict(X_1), index=y.index)
        y_resid = y - y_fit
        self.model_2.fit(X_2, y_resid)
        self.y_fit = y_fit
        self.y_resid = y_resid

    def predict(self, X_1, X_2):
        y_pred = pd.Series(self.model_1.predict(X_1), index=X_1.index)
        y_pred += self.model_2.predict(X_2)
        return y_pred

In [11]:
# Preprocess historical weather data
data_df['avg_temp_c'] = data_df.apply(
    lambda row: (row['min_temp_c'] + row['max_temp_c']) / 2
    if pd.isna(row.get('avg_temp_c', None)) and not pd.isna(row['min_temp_c']) and not pd.isna(row['max_temp_c']) # Use row.get() to avoid KeyError if 'avg_temp_c' doesn't exist
    else row.get('avg_temp_c', None), # Use row.get() to handle potential missing 'avg_temp_c'
    axis=1
)

# Remove rows where avg_temp_c is still NaN
data_df = data_df.dropna(subset=['avg_temp_c'])

In [12]:
# Encode categorical 'city_id' column
label_encoder = LabelEncoder()
data_df.loc[:, 'city_id'] = label_encoder.fit_transform(data_df['city_id'])

# Convert 'date' column to datetime and extract useful features
data_df['date'] = pd.to_datetime(data_df['date'], format='%Y-%m-%d')

# Drop the original 'date' column
data_df = data_df.drop(columns=['date'])



In [15]:
# Assuming 'data_df' is your preprocessed DataFrame
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

# Define your target variable (assuming 'avg_temp_c' is your target)
y = data_df['avg_temp_c']

# Define features for model 1 and model 2 (replace with your actual features)
features_1 = ['city_id']
features_2 = ['min_temp_c', 'max_temp_c']
X_1 = data_df[features_1]
X_2 = data_df[features_2]

# Split the data into training and validation sets (adjust test_size as needed)
X_1_train, X_1_val, X_2_train, X_2_val, y_train, y_val = train_test_split(
    X_1, X_2, y, test_size=0.2, random_state=42
)

model = BoostedHybrid(
    model_1=LinearRegression(),
    model_2=XGBRegressor(),
)

model.fit(X_1_train, X_2_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_1_val, X_2_val)
y_pred = y_pred.clip(0.0)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f'RMSE: {rmse}')


RMSE: 2.3753783285804566


In [19]:
submission_data = submission_key.copy()
submission_data['city_id'] = label_encoder.transform(submission_data['city_id'])
submission_data['date'] = pd.to_datetime(submission_data['date'], format='%Y-%m-%d') # Change the format specifier to match the actual date format in your data.

# Create features from date for submission data
submission_data['year'] = submission_data['date'].dt.year
submission_data['month'] = submission_data['date'].dt.month
submission_data['day'] = submission_data['date'].dt.day

# Drop the original 'date' column
submission_data = submission_data.drop(columns=['date'])

# X_1 and X_2 for submission
# dp_sub = DeterministicProcess(index=submission_data.index, order=1) # Commenting out this line
# X_1_sub = dp_sub.in_sample() # Commenting out this line
X_1_sub = submission_data[['city_id']] # Selecting the 'city_id' column for X_1_sub

X_2_sub = submission_data.drop(columns=['submission_ID'])  # Drop the submission_ID
X_2_sub = X_2_sub.astype(float)

In [23]:
# Re-introduce date processing for submission data
submission_data['date'] = pd.to_datetime(submission_data['date'], format='%Y-%m-%d')

# ... (Your existing code to derive min_temp_c and max_temp_c for submission_data,
#      which might involve using a model or some other logic)

# X_1 and X_2 for submission
X_1_sub = submission_data[['city_id']]

# Include only 'min_temp_c', 'max_temp_c' in X_2_sub
X_2_sub = submission_data[['min_temp_c', 'max_temp_c']]
X_2_sub = X_2_sub.astype(float)

KeyError: "None of [Index(['min_temp_c', 'max_temp_c'], dtype='object')] are in the [columns]"