In [2]:
import torch
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

In [3]:
# Read the data from the Excel file
df = pd.read_excel("dataset.xlsx")

# Convert date columns to datetime objects (source format is day-month-year).
df["From Date"] = pd.to_datetime(df["From Date"], format='%d-%m-%Y %H:%M')
df["To Date"] = pd.to_datetime(df["To Date"], format='%d-%m-%Y %H:%M')

# Rows with a missing timestamp or reading cannot form a valid training
# sentence: NaT.strftime raises, and a NaN reading would be rendered as the
# literal text "nan". Filter them into a separate frame so `df` itself keeps
# every row.
train_df = df.dropna(subset=["From Date", "To Date", "PM2.5"])

# Prepare the data for fine-tuning the T5 model: one sentence per row, e.g.
# "From 2020-01-01 00:00 to 2020-01-01 01:00 the PM2.5 was 42.0".
text_date_format = '%Y-%m-%d %H:%M'
train_text = [
    f"From {from_date.strftime(text_date_format)}"
    f" to {to_date.strftime(text_date_format)}"
    f" the PM2.5 was {pm25!s}"
    for from_date, to_date, pm25 in zip(
        train_df['From Date'], train_df['To Date'], train_df['PM2.5']
    )
]

In [4]:
# Initialize the T5 tokenizer and model.
# The model is loaded with the pretrained t5-small weights. Constructing it
# from the bare config would give randomly initialised parameters, i.e.
# training from scratch rather than fine-tuning.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
config = T5Config.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small', config=config)

# Tokenize the input text (padded to the longest sentence, truncated to the
# tokenizer's maximum length).
inputs = tokenizer(train_text, return_tensors='pt', padding=True, truncation=True)

# Build the labels from the input ids, replacing padding positions with -100
# so that the loss ignores them instead of rewarding the model for emitting
# pad tokens.
# NOTE(review): labels are the same sentences as the inputs, so the model is
# trained to reproduce its input; confirm this is the intended objective.
labels = inputs["input_ids"].clone()
labels[labels == tokenizer.pad_token_id] = -100

# Fine-tune the T5 model.
# The data is processed in mini-batches: pushing the whole dataset through a
# single forward/backward pass needs memory proportional to the dataset size
# and yields only one optimizer step per epoch.
BATCH_SIZE = 16
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
model.train()
num_examples = labels.size(0)
for epoch in range(1):
    for start in range(0, num_examples, BATCH_SIZE):
        stop = start + BATCH_SIZE
        optimizer.zero_grad()
        outputs = model(
            input_ids=inputs["input_ids"][start:stop],
            attention_mask=inputs["attention_mask"][start:stop],
            labels=labels[start:stop],
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


: 

In [None]:
# Make predictions
# Switch to inference mode first: the training cell leaves the model in
# train() mode, where dropout is active and would perturb generation.
model.eval()

# Build a prompt for the one-day window starting one day after the latest
# "To Date" in the data, using the same sentence layout as the training text.
next_day_date = df['To Date'].max() + pd.Timedelta(days=1)
window_end = next_day_date + pd.Timedelta(days=1)
prompt = f"From {next_day_date.strftime('%Y-%m-%d %H:%M')} to {window_end.strftime('%Y-%m-%d %H:%M')} the PM2.5 was "

# Encode the prompt, generate a single sequence of at most 50 tokens, and
# decode it back to text without special tokens.
input_ids = tokenizer.encode(prompt, return_tensors='pt')
output = model.generate(input_ids, max_length=50, num_return_sequences=1)
predicted_text = tokenizer.decode(output[0], skip_special_tokens=True)


In [None]:

# Show the text the model generated for the next-day prompt.
prediction_message = "Predicted PM2.5 for the next day: {}".format(predicted_text)
print(prediction_message)