## Synthetic data generation

In [None]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

data = pd.read_csv("weatherHistory.csv").sample(1000)
data.head()


categorical_columns = ["Summary", "Precip Type", "Daily Summary"]
data['combined_text'] = data[categorical_columns].fillna(
    "").agg(" ".join, axis=1)
train_texts = data['combined_text'].tolist()

tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained("distilgpt2")

train_encodings = tokenizer(
    train_texts, truncation=True, padding=True, max_length=32)
train_encodings['labels'] = train_encodings['input_ids'].copy()



class WeatherDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx])
                for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])


train_dataset = WeatherDataset(train_encodings)

training_args = TrainingArguments(
    output_dir="./weather_training",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=1000,
    save_total_limit=1,
    logging_dir="./logs",
    fp16=True,
    max_steps=100
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

trainer.train()

model.save_pretrained("weather_training")
tokenizer.save_pretrained("weather_training")

synthetic_data = []
for _ in range(100):
    input_ids = tokenizer.encode("Weather: ", return_tensors="pt")
    outputs = model.generate(input_ids, max_length=32, num_return_sequences=1)
    synthetic_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    synthetic_data.append(synthetic_text)

synthetic_df = pd.DataFrame(synthetic_data, columns=["Synthetic Weather Data"])
synthetic_df.to_csv("synthetic_weather_data.csv", index=False)

synthetic_df.head()

  from .autonotebook import tqdm as notebook_tqdm
max_steps is given, it will override any value given in num_train_epochs
100%|██████████| 100/100 [01:36<00:00,  1.03it/s]


{'train_runtime': 96.8551, 'train_samples_per_second': 33.039, 'train_steps_per_second': 1.032, 'train_loss': 0.6602192687988281, 'epoch': 3.12}


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask an

Unnamed: 0,Synthetic Weather Data
0,Weather:
1,Weather:
2,Weather: iced rain Partly cloudy starting in t...
3,Weather:
4,Weather:


In [2]:
synthetic_df

Unnamed: 0,Synthetic Weather Data
0,Weather:
1,Weather:
2,Weather: iced rain Partly cloudy starting in t...
3,Weather:
4,Weather:
...,...
95,Weather: iced rain Partly cloudy starting in t...
96,Weather: iced rain Partly cloudy starting in t...
97,Weather:
98,Weather: iced rain Partly cloudy starting in t...
