In [79]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset

In [80]:
# from google.colab import drive
# # drive.mount('/content/drive')
# drive.mount("/content/drive", force_remount=True)

In [81]:
data = pd.read_csv('new_features_added_filled_nearest_neigb_min.csv')
# data = pd.read_csv('./../derived_data_csv/new_features_added_filled_nearest_neigb_min.csv')


In [82]:
data.columns

Index(['city_id', 'date', 'avg_temp_c', 'min_temp_c', 'max_temp_c',
       'avg_wind_dir_deg', 'avg_wind_speed_kmh', 'city', 'day_of_week',
       'day_of_year', 'month', 'temp_range'],
      dtype='object')

In [83]:
# Python
selected_columns_to_exclude = ['date', 'city_id',
                               'snow_depth_mm', 'day_of_week']
features = [col for col in data.columns if col not in selected_columns_to_exclude]

target = 'avg_temp_c'

# Scaling features
scaler = StandardScaler()
# Python
numerical_features = data.select_dtypes(include=[np.number]).columns.tolist()
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Save the mean and standard deviation for avg_temp_c
avg_temp_c_index = numerical_features.index('avg_temp_c')
avg_temp_c_mean = scaler.mean_[avg_temp_c_index]
avg_temp_c_std = scaler.scale_[avg_temp_c_index]

In [84]:
num_rows_with_nulls = data.isnull().any(axis=1).sum()
print(f"Number of records: {len(data)}")
print(f"Number of records with at least one null value: {num_rows_with_nulls}")

Number of records: 182338
Number of records with at least one null value: 0


In [85]:
# print(data[features].head())

# data_filled = data.fillna(-1)
data.head()

Unnamed: 0,city_id,date,avg_temp_c,min_temp_c,max_temp_c,avg_wind_dir_deg,avg_wind_speed_kmh,city,day_of_week,day_of_year,month,temp_range
0,C001,2014-01-01,-1.244652,-1.615164,-1.181246,-0.084152,-0.825328,-1.729156,-0.500611,-1.727467,-1.601858,0.840131
1,C001,2014-01-02,-0.974157,-0.843364,-1.017647,-0.212386,-0.202979,-1.729156,-0.00059,-1.717983,-1.601858,-0.47282
2,C001,2014-01-03,-1.144469,-1.284393,-0.950283,0.157519,-0.890838,-1.729156,0.499431,-1.708498,-1.601858,0.643188
3,C001,2014-01-04,-1.144469,-1.083925,-1.017647,1.129135,0.009929,-1.729156,0.999452,-1.699014,-1.601858,0.05236
4,C001,2014-01-05,-1.044286,-1.42472,-0.671204,0.225335,-1.021859,-1.729156,1.499473,-1.689529,-1.601858,1.584136


In [86]:
city_ids = data['city_id'].unique().tolist()

In [87]:
# Create sequences for each city
def create_sequences(df, features, target, seq_length=30, pred_length=7):
    X = []
    y = []
    for i in range(len(df) - seq_length - pred_length + 1):
        X.append(df[features].iloc[i:i+seq_length].values)
        y.append(df[target].iloc[i+seq_length:i+seq_length+pred_length].values)
    return np.array(X), np.array(y)


# Split data by city and create sequences
city_data = data.groupby('city_id')
seq_length=30
X_list = []
y_list = []
last_dates = []
X_list_final = []
for i, (_, group) in enumerate(city_data):
    X, y = create_sequences(group, features, target, seq_length=seq_length)
    X_list.append(X)
    y_list.append(y)
    X_last = group[features].iloc[-seq_length:].values
    X_list_final.append(X_last)

In [88]:
print(len(X_list))
print(len(y_list))
print(len(X_list[0]))
print(len(y_list[0]))
lengths = [len(element) for element in X_list]
print(lengths)
print(len(X_list[0][0]))

100
100
1790
1790
[1790, 1767, 1787, 1790, 1790, 1789, 1790, 1790, 1769, 1790, 1787, 1790, 1790, 1790, 1788, 1790, 1780, 1790, 1790, 1790, 1790, 1787, 1773, 1790, 1777, 1790, 1790, 1787, 1790, 1790, 1790, 1790, 1790, 1790, 1776, 1790, 1788, 1790, 1789, 1789, 1790, 1790, 1790, 1790, 1790, 1789, 1790, 1753, 1790, 1790, 1789, 1789, 1790, 1787, 1790, 1789, 1786, 1790, 1789, 1790, 1790, 1790, 1790, 1781, 1790, 1789, 1790, 1790, 1788, 1790, 1790, 1790, 1790, 1790, 1790, 1776, 1790, 1790, 1789, 1774, 1790, 1790, 1784, 1790, 1790, 1787, 1790, 1784, 1790, 1789, 1790, 1788, 1789, 1790, 1790, 1790, 1790, 1789, 1783, 1764]
30


In [89]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from matplotlib import pyplot as plt

In [90]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [91]:
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32).to(device)
        self.y = torch.tensor(y, dtype=torch.float32).to(device)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [113]:
train_ratio = 0.0
batch_size = 64

dataloaders_train = []
dataloaders_test = []

for X, y in zip(X_list, y_list):
    dataset = TimeSeriesDataset(X, y)
    train_size = int(train_ratio * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = random_split(
        dataset, [train_size, test_size])


    train_loader = DataLoader(
        dataset, batch_size=batch_size)

    dataloaders_train.append(train_loader)
    # dataloaders_test.append(test_loader)

In [None]:
# # Convert to PyTorch tensors
# X_predict_tensor = torch.tensor(X_list_final, dtype=torch.float32).to(device)
# predict_dataset = TensorDataset(X_predict_tensor)
# predict_loader = DataLoader(predict_dataset, batch_size=1, shuffle=False)

In [None]:
# For the train loader
# for i, (inputs, targets) in enumerate(train_loader):
#     if i == 1:
#         break
#     print(f'Train Inputs Shape: {inputs.shape}, Train Targets Shape: {targets.shape}')
# # For the test loader
# for i, (inputs, targets) in enumerate(test_loader):
#     if i == 1:
#         break
#     print(f'Test Inputs Shape: {inputs.shape}, Test Targets Shape: {targets.shape}')
# for i, (inputs,) in enumerate(predict_loader):
#     if i == 1:
#         break
#     print(f'Predict Inputs Shape: {inputs.shape}')

Train Inputs Shape: torch.Size([64, 30, 9]), Train Targets Shape: torch.Size([64, 7])
Test Inputs Shape: torch.Size([64, 30, 9]), Test Targets Shape: torch.Size([64, 7])


In [109]:

class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_layer_size, output_size, num_layers, dropout):
        super(LSTMModel, self).__init__()
        self.hidden_layer_size = hidden_layer_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_layer_size,
                            num_layers=num_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_layer_size, output_size)
        
        # Weight initialization
        for name, param in self.lstm.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_layer_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_layer_size).to(x.device)

        out, _ = self.lstm(x, (h0, c0))
        out = self.dropout(out[:, -1, :])
        out = self.fc(out)
        return out

In [110]:
def train_model(model, criterion, optimizer, dataloader):
    model.train()
    total_loss = 0

    for data, target in dataloader:
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        # Apply gradient clipping
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

In [111]:
def evaluate_model(model, criterion, dataloader):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for data, target in dataloader:
            output = model(data)
            loss = criterion(output, target)
            total_loss += loss.item()

    return total_loss / len(dataloader)

In [112]:
def model_fit(model, criterion, optimizer, train_dataloader, test_dataloader, num_epochs=25, patience=10, checkpoint=40):
    train_losses = []
    test_losses = []

    best_test_loss = float('inf')
    patience_counter = 0

    for epoch in range(num_epochs):
        train_loss = train_model(
            model, criterion, optimizer, train_dataloader)
        test_loss = evaluate_model(
            model, criterion, test_dataloader)

        train_losses.append(train_loss)
        test_losses.append(test_loss)

        if test_loss < best_test_loss:
            best_test_loss = test_loss
            patience_counter = 0
        else:
            patience_counter += 1
        if (epoch + 1) % checkpoint == 0:
            print(f"Epoch {epoch+1}/{num_epochs}: Train Loss = {train_loss:.4f}, Test Loss = {test_loss:.4f}")

        if patience_counter >= patience:
            print("Early stopping triggered")
            break

    return model, train_losses, test_losses

In [101]:
def plot_losses(train_losses, test_losses, figsize=(6, 3)):
    plt.figure(figsize=figsize)
    plt.plot(train_losses, label='Train Loss')
    plt.plot(test_losses, label='Test Loss')
    plt.title('Losses')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [102]:
# Define model, loss function, and optimizer
input_size = len(features)
hidden_layer_size = 64
output_size = 7
num_layers = 2
dropout = 0.5

# model = LSTMModel(input_size, hidden_layer_size,
#                   output_size, num_layers, dropout).to(device)
loss_function = nn.MSELoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)

In [103]:
from datetime import datetime, timedelta

predictions_df = pd.DataFrame(columns=["city_id", "date", "temp"])
predictions_list = []
start_date = datetime.strptime("2019-01-01", "%Y-%m-%d")

In [104]:
all_train_losses = []
all_test_losses = []

In [105]:
for (city_id, train_loader, test_loader) in zip(city_ids, dataloaders_train, dataloaders_test):
    # Initialize a new model for each city
    print(city_id)
    model = LSTMModel(input_size, hidden_layer_size,
                      output_size, num_layers, dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)

    model, train_losses, test_losses = model_fit(model, loss_function, optimizer, train_loader, test_loader, num_epochs=50, checkpoint=5, patience=10)

    all_train_losses.append(train_losses)
    all_test_losses.append(test_losses)

    X_last = X_list_final[city_ids.index(city_id)]
    current_sequence_tensor = torch.tensor(X_last).unsqueeze(0).float().to(device)

    # Make prediction
    with torch.no_grad():
        model.eval()
        # Ensure the tensor is of type torch.float32
        current_sequence_tensor = current_sequence_tensor.float()
        prediction = model(current_sequence_tensor)

    for i, predicted_temp in enumerate(prediction.squeeze().tolist()):
        prediction_date = start_date + timedelta(days=i)
        predictions_list.append({
            "city_id": city_id,
            "date": prediction_date.strftime("%d/%m/%Y"),
            "temp": predicted_temp
        })

# predictions_df now contains the next day's temperature prediction for each city

C001
Epoch 5/50: Train Loss = 0.0756, Test Loss = 0.5881
Epoch 10/50: Train Loss = 0.0435, Test Loss = 0.5761
Epoch 15/50: Train Loss = 0.0171, Test Loss = 0.5763
Epoch 20/50: Train Loss = 0.0170, Test Loss = 0.5738
Epoch 25/50: Train Loss = 0.0084, Test Loss = 0.5686
Epoch 30/50: Train Loss = 0.0094, Test Loss = 0.5669
Epoch 35/50: Train Loss = 0.0225, Test Loss = 0.5661


KeyboardInterrupt: 

In [24]:
from math import ceil
num_cities = 100
num_rows = ceil(num_cities / 6)
fig, axs = plt.subplots(num_rows, 6, figsize=(20, num_rows * 3))

for i, ax in enumerate(axs.flat):
    if i < num_cities:
        ax.plot(all_train_losses[i], label='Train Loss')
        ax.plot(all_test_losses[i], label='Test Loss')
        ax.set_title(f'City {city_ids[i]}')
        ax.legend()
    else:
        ax.axis('off')

plt.tight_layout()
plt.show()

KeyboardInterrupt: 

In [71]:
predictions_list

[{'city_id': 'C001', 'date': '01/01/2019', 'temp': -1.3650602102279663},
 {'city_id': 'C001', 'date': '02/01/2019', 'temp': -1.1446642875671387},
 {'city_id': 'C001', 'date': '03/01/2019', 'temp': -0.9726508855819702},
 {'city_id': 'C001', 'date': '04/01/2019', 'temp': -1.099670648574829},
 {'city_id': 'C001', 'date': '05/01/2019', 'temp': -1.222062349319458},
 {'city_id': 'C001', 'date': '06/01/2019', 'temp': -1.2297097444534302},
 {'city_id': 'C001', 'date': '07/01/2019', 'temp': -1.231223464012146},
 {'city_id': 'C002', 'date': '01/01/2019', 'temp': -0.4354495406150818},
 {'city_id': 'C002', 'date': '02/01/2019', 'temp': -0.4248517155647278},
 {'city_id': 'C002', 'date': '03/01/2019', 'temp': -0.3268526792526245},
 {'city_id': 'C002', 'date': '04/01/2019', 'temp': -0.40492063760757446},
 {'city_id': 'C002', 'date': '05/01/2019', 'temp': -0.4271649420261383},
 {'city_id': 'C002', 'date': '06/01/2019', 'temp': -0.3707137703895569},
 {'city_id': 'C002', 'date': '07/01/2019', 'temp': -0

In [73]:
predictions_list

[{'city_id': 'C001', 'date': '01/01/2019', 'temp': -1.3650602102279663},
 {'city_id': 'C001', 'date': '02/01/2019', 'temp': -1.1446642875671387},
 {'city_id': 'C001', 'date': '03/01/2019', 'temp': -0.9726508855819702},
 {'city_id': 'C001', 'date': '04/01/2019', 'temp': -1.099670648574829},
 {'city_id': 'C001', 'date': '05/01/2019', 'temp': -1.222062349319458},
 {'city_id': 'C001', 'date': '06/01/2019', 'temp': -1.2297097444534302},
 {'city_id': 'C001', 'date': '07/01/2019', 'temp': -1.231223464012146},
 {'city_id': 'C002', 'date': '01/01/2019', 'temp': -0.4354495406150818},
 {'city_id': 'C002', 'date': '02/01/2019', 'temp': -0.4248517155647278},
 {'city_id': 'C002', 'date': '03/01/2019', 'temp': -0.3268526792526245},
 {'city_id': 'C002', 'date': '04/01/2019', 'temp': -0.40492063760757446},
 {'city_id': 'C002', 'date': '05/01/2019', 'temp': -0.4271649420261383},
 {'city_id': 'C002', 'date': '06/01/2019', 'temp': -0.3707137703895569},
 {'city_id': 'C002', 'date': '07/01/2019', 'temp': -0

In [74]:
predictions_df = pd.DataFrame(predictions_list)
# predictions_df.columns = ['city_id', 'date', 'avg_temp_c']
# predictions_df['avg_temp_c'] = predictions_df['avg_temp_c'].round(2)
# predictions_df['submission_ID'] = range(1, len(predictions_df) + 1)
# predictions_df = predictions_df[['submission_ID', 'city_id', 'date', 'avg_temp_c']]

# # print(predictions_df)

In [None]:
# predictions_df.to_csv('./output.csv', index=False)

In [35]:
# predictions_df

Unnamed: 0,submission_ID,city_id,date,avg_temp_c
0,1,C001,01/01/2019,-0.94
1,2,C001,02/01/2019,-0.97
2,3,C001,03/01/2019,-1.19
3,4,C001,04/01/2019,-1.18
4,5,C001,05/01/2019,-1.13
...,...,...,...,...
695,696,C112,03/01/2019,0.43
696,697,C112,04/01/2019,0.37
697,698,C112,05/01/2019,0.36
698,699,C112,06/01/2019,0.38


In [36]:
# submission_df = predictions_df

In [29]:
# y_pred_original_scale_placeholder = np.zeros((submission_df.shape[0], 10))

# y_pred_original_scale_placeholder[:, 0] = submission_df['avg_temp_c'].values

# y_pred_original_scale = scaler.inverse_transform(y_pred_original_scale_placeholder)

# y_pred_original_scale = y_pred_original_scale[:, 0]

In [34]:
# y_pred_original_scale

Unnamed: 0,avg_temp_c
0,9.640949
1,9.341498
2,7.145520
3,7.245337
4,7.744423
...,...
695,23.315900
696,22.716997
697,22.617180
698,22.816815


In [32]:
# y_pred_original_scale = pd.DataFrame(y_pred_original_scale, columns=['avg_temp_c'])
# y_pred_original_scale

# y_pred_original_scale.to_csv('time_ans_11_04.csv')


In [68]:
predictions_df

Unnamed: 0,city_id,date,temp


In [76]:
# Assuming predictions_df is a DataFrame with a column 'avg_temp_c' for the standardized predictions
# Use the saved mean and standard deviation to scale back to original values
predictions_df['temp'] = (
    predictions_df['temp'] * avg_temp_c_std) + avg_temp_c_mean
predictions_df.to_csv('sure_this_plz.csv', index=False)

In [78]:
predictions_df

Unnamed: 0,city_id,date,temp
0,C001,01/01/2019,5.398119
1,C001,02/01/2019,7.598049
2,C001,03/01/2019,9.315037
3,C001,04/01/2019,8.047162
4,C001,05/01/2019,6.825483
...,...,...,...
695,C112,03/01/2019,22.645363
696,C112,04/01/2019,21.752504
697,C112,05/01/2019,20.948795
698,C112,06/01/2019,21.739805


In [40]:
y_pred_original_scale

Unnamed: 0,avg_temp_c
0,9.640949
1,9.341498
2,7.145520
3,7.245337
4,7.744423
...,...
695,23.315900
696,22.716997
697,22.617180
698,22.816815
