In [18]:
from rouge_score import rouge_scorer
import matplotlib.pyplot as plt

In [17]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=79dbc4d6e7fa73ea7b7067dea7875c39462fdbeba1704d09d97acfb95094443c
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [1]:
!pip install transformers datasets torch nltk

import json
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset
import nltk
from nltk.tokenize import word_tokenize

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [4]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [19]:
def load_tweets(file_path):
    tweets = []
    with open(file_path, 'r') as f:
        for line in f:
            try:
                tweet = json.loads(line)
                if 'text' in tweet:
                    tweets.append(tweet['text'])
            except json.JSONDecodeError:
                continue
    return tweets

file_path = '/content/corona.json'
tweets = load_tweets(file_path)
print(f"Loaded {len(tweets)} tweets.")


def preprocess_tweets(tweets):
    processed_tweets = []
    for tweet in tweets:
        tokens = word_tokenize(tweet)
        processed_tweets.append(" ".join(tokens))
    return processed_tweets

processed_tweets = preprocess_tweets(tweets)


tweet_dataset = Dataset.from_dict({"text": processed_tweets})


model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128, return_tensors="pt")
    inputs["labels"] = inputs["input_ids"].clone()
    return inputs

tokenized_dataset = tweet_dataset.map(tokenize_function, batched=True)



model = GPT2LMHeadModel.from_pretrained(model_name)


training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    fp16=True if torch.cuda.is_available() else False,
)

def compute_metrics(eval_pred):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)


    rouge_scores = []
    for pred, label in zip(decoded_preds, decoded_labels):
        scores = scorer.score(label, pred)
        rouge_scores.append(scores)


    avg_rouge1 = sum([s["rouge1"].fmeasure for s in rouge_scores]) / len(rouge_scores)
    avg_rouge2 = sum([s["rouge2"].fmeasure for s in rouge_scores]) / len(rouge_scores)
    avg_rougeL = sum([s["rougeL"].fmeasure for s in rouge_scores]) / len(rouge_scores)

    return {"rouge1": avg_rouge1, "rouge2": avg_rouge2, "rougeL": avg_rougeL}


# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
train_results =trainer.train()





Loaded 18518 tweets.


Map:   0%|          | 0/18518 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
500,1.6089
1000,1.2931
1500,1.1993
2000,1.1371
2500,1.1075
3000,0.9806
3500,0.9983
4000,0.9771
4500,0.9677
5000,0.9064


In [20]:
def generate_tweet(prompt, model, tokenizer, max_length=50):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)  # Move inputs to the model's device
    outputs = model.generate(inputs.input_ids, max_length=max_length, num_return_sequences=1, temperature=0.7)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [13]:
#1 epoch
prompt = "Corona"
generated_tweet = generate_tweet(prompt, model, tokenizer)
print(f"Generated Tweet: {generated_tweet}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Tweet: Corona is a virus .


In [34]:
#5 epoch
prompt = "Corona"
generated_tweet = generate_tweet(prompt, model, tokenizer)
print(f"Generated Tweet: {generated_tweet}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Tweet: Corona virus is a virus of the world . It is a virus of the world . It is a virus of the world . It is a virus of the world . It is a virus of the world .


In [None]:
plt.plot(train_results.history["loss"], label="Training Loss")
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.title("Training Loss Curve")
plt.legend()
plt.show()

In [39]:
def generate_tweet(prompt, model, tokenizer, max_length=50):
  """Generates a tweet based on the given prompt."""
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
  outputs = model.generate(inputs.input_ids, max_length=max_length, num_return_sequences=1, temperature=0.7)
  return tokenizer.decode(outputs[0], skip_special_tokens=True)

def calculate_rouge(generated_text, reference_text):
  """Calculates ROUGE scores for generated and reference texts."""
  scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
  scores = scorer.score(reference_text[0]['text'], generated_text)
  return {metric: scores[metric].fmeasure for metric in scores}

# Example usage:
prompt = "Corona"
generated_tweet = generate_tweet(prompt, model, tokenizer)
print(f"Generated Tweet: {generated_tweet}")


# Calculate ROUGE scores
rouge_scores = calculate_rouge(generated_tweet, tokenized_dataset)

# Print the ROUGE scores
print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Tweet: Corona virus is a virus that can be spread through kissing . https : //t.co/Xn0CqxnQ7
ROUGE-1: 0.0833
ROUGE-2: 0.0000
ROUGE-L: 0.0833


In [41]:
from rouge_score import rouge_scorer

def generate_tweet(prompt, model, tokenizer, max_length=50):
  """Generates a tweet based on the given prompt."""
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
  outputs = model.generate(inputs.input_ids, max_length=max_length, num_return_sequences=1, temperature=0.7)
  return tokenizer.decode(outputs[0], skip_special_tokens=True)

def calculate_rouge(generated_text, reference_texts):
  """Calculates average ROUGE scores for generated text against a list of reference texts."""
  scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
  total_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
  num_references = len(reference_texts)

  for reference_text in reference_texts:
      scores = scorer.score(reference_text, generated_text)
      for metric in scores:
          total_scores[metric] += scores[metric].fmeasure

  # Calculate average scores
  avg_scores = {metric: total_scores[metric] / num_references for metric in total_scores}
  return avg_scores

# Example usage:
prompt = "Corona"
generated_tweet = generate_tweet(prompt, model, tokenizer)
print(f"Generated Tweet: {generated_tweet}")


rouge_scores = calculate_rouge(generated_tweet, tokenized_dataset['text'])

# Print the average ROUGE scores
print(f"Average ROUGE-1: {rouge_scores['rouge1']:.4f}")
print(f"Average ROUGE-2: {rouge_scores['rouge2']:.4f}")
print(f"Average ROUGE-L: {rouge_scores['rougeL']:.4f}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Tweet: Corona-Krise : The New York Times bestsellers : Coronavirus , The New York Times bestsellers : The Invisible Enemy , The New York Times bestsellers… https : //t.co/0Q0c
Average ROUGE-1: 0.0890
Average ROUGE-2: 0.0302
Average ROUGE-L: 0.0847
