In [None]:
# Step 1: Install necessary libraries
!pip install kaggle seaborn scikit-learn

# Step 2: Upload kaggle.json
from google.colab import files
files.upload()  # This will prompt you to upload the kaggle.json file

# Step 3: Move kaggle.json to the appropriate directory
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json



Saving kaggle.json to kaggle.json


In [None]:
!kaggle competitions download -c predicta-1-0-predict-the-unpredictable
!unzip predicta-1-0-predict-the-unpredictable

Downloading predicta-1-0-predict-the-unpredictable.zip to /content
  0% 0.00/1.94M [00:00<?, ?B/s]
100% 1.94M/1.94M [00:00<00:00, 121MB/s]
Archive:  predicta-1-0-predict-the-unpredictable.zip
  inflating: historical_weather.csv  
  inflating: sample_submission.csv   
  inflating: submission_key.csv      


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb

# ---------------------------------------------------------------------------
# Load the historical observations and derive calendar features
# ---------------------------------------------------------------------------
data = pd.read_csv('historical_weather.csv')

# Convert date to datetime and extract features
data['date'] = pd.to_datetime(data['date'])
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day
data['dayofweek'] = data['date'].dt.dayofweek  # Day of week
data['dayofyear'] = data['date'].dt.dayofyear  # Day of year

# Impute missing avg_temp_c with mean of avg_temp_c for each city.
# NOTE(review): this fills in the *target*, so some rows are trained and scored
# on synthetic labels, and the mean is taken over all years (hold-out included).
# Dropping rows with a missing target may be preferable - confirm.
data['avg_temp_c'] = data.groupby('city_id')['avg_temp_c'].transform(lambda x: x.fillna(x.mean()))

# Encode city_id as integer category codes (0 .. n_cities - 1)
data['city_id'] = data['city_id'].astype('category').cat.codes

# Define features and target
features = ['city_id', 'year', 'month', 'day', 'dayofweek', 'dayofyear']
target = 'avg_temp_c'

X = data[features]
y = data[target]

# ---------------------------------------------------------------------------
# Hold out 2018 BEFORE any fitting, so the reported test RMSE is out-of-sample.
# (Scoring on rows the model was trained on gives an optimistic number.)
# ---------------------------------------------------------------------------
holdout_year = 2018
is_holdout = data['year'] == holdout_year
if not is_holdout.any() or is_holdout.all():
    raise ValueError(
        f'Cannot build a hold-out split: expected rows both in and before {holdout_year}.'
    )

test_data = data[is_holdout]
X_train, y_train = X[~is_holdout], y[~is_holdout]
y_test = test_data[target]

# Scale the features; the scaler is fitted on the training split only here.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test = scaler.transform(test_data[features])

# Initialize the model
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)  # Use squarederror for regression

# Set up KFold cross-validation.
# NOTE(review): shuffled folds mix past and future days of the same series;
# a time-ordered splitter may give a more realistic CV score - confirm.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define hyperparameters to tune (3**6 = 729 combinations x 5 folds = 3645 fits)
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of boosting rounds
    'learning_rate': [0.05, 0.1, 0.2],  # Step size shrinkage to prevent overfitting
    'max_depth': [3, 4, 5],  # Maximum depth of a tree
    'min_child_weight': [1, 3, 5],  # Minimum sum of instance weight (hessian) needed in a child
    'subsample': [0.6, 0.8, 1.0],  # Subsample ratio of the training instances
    'colsample_bytree': [0.6, 0.8, 1.0]  # Subsample ratio of columns when constructing each tree
}

# Perform GridSearchCV on the training split only
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=kf, scoring='neg_root_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters and RMSE
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best RMSE: {-grid_search.best_score_}')

# Evaluate on the untouched 2018 rows. With the default refit=True the search
# object already holds the best configuration refitted on the training split.
y_pred = grid_search.predict(X_test)
# np.sqrt(MSE) instead of squared=False: that keyword was deprecated in
# scikit-learn 1.4 and is rejected by newer releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE on 2018 test set after hyperparameter tuning: {rmse}')

# ---------------------------------------------------------------------------
# Final model: refit scaler and estimator on ALL history with the tuned params
# ---------------------------------------------------------------------------
X_scaled = scaler.fit_transform(X)
best_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    **grid_search.best_params_,
)
best_model.fit(X_scaled, y)

# Make predictions for 2019/01/01 to 2019/01/07, one block of dates per city.
# The number of cities is taken from the data instead of being hard-coded.
# NOTE(review): rows are ordered by city code, then date; this is assumed to
# match the row order of the submission file - verify against submission_key.csv.
n_cities = data['city_id'].nunique()
future_dates = pd.date_range(start='2019-01-01', end='2019-01-07')
future_data = pd.DataFrame({
    'city_id': np.repeat(np.arange(n_cities), len(future_dates)),
    'date': future_dates.tolist() * n_cities
})
future_data['year'] = future_data['date'].dt.year
future_data['month'] = future_data['date'].dt.month
future_data['day'] = future_data['date'].dt.day
future_data['dayofweek'] = future_data['date'].dt.dayofweek  # Day of week
future_data['dayofyear'] = future_data['date'].dt.dayofyear  # Day of year

X_future = scaler.transform(future_data[features])  # Scale future data
future_data['predicted_avg_temp_c'] = best_model.predict(X_future)

predicted_temperatures = future_data['predicted_avg_temp_c'].values
print(future_data)

# Optionally, save to CSV
future_data.to_csv('future_data.csv', index=False)


Best parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 5, 'min_child_weight': 5, 'n_estimators': 300, 'subsample': 1.0}
Best RMSE: 2.5937769685005505
RMSE on 2018 test set after hyperparameter tuning: 2.450861876691023
     city_id       date  year  month  day  dayofweek  dayofyear  \
0          0 2019-01-01  2019      1    1          1          1   
1          0 2019-01-02  2019      1    2          2          2   
2          0 2019-01-03  2019      1    3          3          3   
3          0 2019-01-04  2019      1    4          4          4   
4          0 2019-01-05  2019      1    5          5          5   
..       ...        ...   ...    ...  ...        ...        ...   
695       99 2019-01-03  2019      1    3          3          3   
696       99 2019-01-04  2019      1    4          4          4   
697       99 2019-01-05  2019      1    5          5          5   
698       99 2019-01-06  2019      1    6          6          6   
699       99 2019-01-

In [None]:

# Read the submission template and append the forecasts as a new column
# (values are assigned positionally, in the order they were predicted).
submission = pd.read_csv('sample_submission.csv').assign(
    predicted_avg_temp_c=predicted_temperatures
)
# Write the augmented frame back over the template, without the index column.
submission.to_csv('sample_submission.csv', index=False)
# Per-column count of missing values, shown as the cell result.
submission.isna().sum()

In [None]:
# Per-column count of missing values in the submission frame.
submission.isna().sum()

submission_ID           0
avg_temp_c              0
predicted_avg_temp_c    0
dtype: int64

In [None]:
import pandas as pd



# Replace the template's 'avg_temp_c' column with the model's predictions.
# The guard makes the cell safe to re-run: once the rename has happened there
# is no 'predicted_avg_temp_c' column left, and an unguarded second run would
# drop the (renamed) predictions and leave only 'submission_ID'.
if 'predicted_avg_temp_c' in submission.columns:
    submission = (
        submission
        .drop(columns=['avg_temp_c'])  # discard the template's placeholder values
        .rename(columns={'predicted_avg_temp_c': 'avg_temp_c'})  # predictions take its name
    )

print(submission)


     submission_ID  avg_temp_c
0                1    8.073177
1                2    7.513272
2                3    7.502435
3                4    7.769464
4                5    7.778315
..             ...         ...
695            696   21.144079
696            697   20.792645
697            698   20.798271
698            699   20.276926
699            700   20.485474

[700 rows x 2 columns]


In [None]:
# Print the current state of the submission frame for a visual check.
print(submission)

In [None]:
# Colab helper used to send a file from the runtime to the browser.
from google.colab import files

# Persist the submission frame over sample_submission.csv (index column omitted),
# then trigger a browser download of that same file.
output_path = 'sample_submission.csv'
submission.to_csv(output_path, index=False)
files.download(output_path)