In [40]:
import pandas as pd
import numpy as np
import re
from math import comb
import os
import random

In [41]:
text_path = "sport_text"
sum_path = "sport_sum"

input_files = sorted(os.listdir(text_path))
sum_files = sorted(os.listdir(sum_path))

assert len(input_files) == len(sum_files), "Number of files and sum files do not match"
assert all(ins == sums for ins, sums in zip(input_files,sum_files)), "File name mismatch"


In [42]:
input_texts = []
sum_texts = []

for ins, sums in zip(input_files, sum_files):
    with open(os.path.join(text_path, ins), 'r') as f:
        input_texts.append(f.read().strip())
    with open(os.path.join(sum_path, sums), 'r') as f:
        sum_texts.append(f.read().strip())
        
        
print(f"Number of input texts: {len(input_texts)}")

Number of input texts: 511


In [43]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def tokenizing(train, test):
    tokenizer.pad_token = tokenizer.eos_token
    train_encodings = tokenizer(input_texts, max_length = 512, truncation = True, padding = True, return_tensors = 'pt',padding_side='left')
    test_encodings = tokenizer(sum_texts, max_length = 512, truncation = True, padding = True, return_tensors = 'pt',padding_side='left')
    return train_encodings, test_encodings

train_encodings, test_encodings = tokenizing(input_texts, sum_texts)



In [44]:
import torch
from torch.utils.data import DataLoader, TensorDataset
batch_size = 16

input_ids = train_encodings["input_ids"]
attention_mask = train_encodings["attention_mask"]
labels = test_encodings["input_ids"]

labels[labels == tokenizer.pad_token_id] = -100

dataset = TensorDataset(input_ids, attention_mask, labels)

dataloader = DataLoader(dataset, batch_size = batch_size, shuffle= False)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")



In [45]:
from transformers import GPT2LMHeadModel, AdamW

model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
model.train()
for epoch in range(3):
    for batch in dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]        
        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(f"Epoch {epoch}, Loss: {loss.item()}")

Epoch 0, Loss: 11.192526817321777
Epoch 0, Loss: 9.442776679992676
Epoch 0, Loss: 8.761029243469238
Epoch 0, Loss: 8.56392765045166
Epoch 0, Loss: 8.19342041015625
Epoch 0, Loss: 7.882939338684082
Epoch 0, Loss: 7.8071818351745605
Epoch 0, Loss: 7.595464706420898
Epoch 0, Loss: 7.758419990539551
Epoch 0, Loss: 7.571887016296387
Epoch 0, Loss: 7.6816887855529785
Epoch 0, Loss: 7.49912691116333
Epoch 0, Loss: 7.377882480621338
Epoch 0, Loss: 7.283494472503662
Epoch 0, Loss: 7.258186340332031
Epoch 0, Loss: 7.354409217834473
Epoch 0, Loss: 7.334232330322266
Epoch 0, Loss: 7.299045562744141
Epoch 0, Loss: 7.357757091522217
Epoch 0, Loss: 7.408795356750488
Epoch 0, Loss: 7.604079723358154
Epoch 0, Loss: 7.4713006019592285
Epoch 0, Loss: 7.275476455688477
Epoch 0, Loss: 7.353168487548828
Epoch 0, Loss: 7.062084674835205
Epoch 0, Loss: 7.282834529876709
Epoch 0, Loss: 7.337283611297607
Epoch 0, Loss: 7.443338394165039
Epoch 0, Loss: 7.286825656890869
Epoch 0, Loss: 7.08817195892334
Epoch 0, L

In [7]:
from transformers import Trainer, TrainingArguments
from transformers import GPT2LMHeadModel
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)
from transformers import AdamW, get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(dataloader) * 3  # Assuming 3 epochs
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)






In [8]:
from torch.nn import CrossEntropyLoss

model.train()
epochs = 3
loss_fn = CrossEntropyLoss()

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    total_loss = 0

    for batch in dataloader:
        input_ids, attention_mask = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(dataloader)
    print(f"Average Loss: {avg_loss}")

Epoch 1/3
Average Loss: 3.5054780021309853
Epoch 2/3
Average Loss: 3.013164237141609
Epoch 3/3
Average Loss: 2.941358558833599


In [9]:
save_directory = "./gpt2-sports-summary"

# Save the model and tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

Model and tokenizer saved to ./gpt2-sports-summary


In [47]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the saved model and tokenizer
model = GPT2LMHeadModel.from_pretrained(save_directory)
tokenizer = GPT2Tokenizer.from_pretrained(save_directory)

# Move to device if necessary
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

In [25]:
model.eval()
generated_summaries = []

with torch.no_grad():
    for batch in dataloader:
        input_ids, _ = batch
        input_ids = input_ids.to(device)

        outputs = model.generate(input_ids, max_new_tokens=50, num_beams=5, early_stopping=True)
        decoded = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        generated_summaries.extend(decoded)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` t

In [51]:
import evaluate

rouge = evaluate.load("rouge")


results = rouge.compute(predictions=generated_summaries, references=sum_texts)
print("ROUGE Scores:", results)

ROUGE Scores: {'rouge1': np.float64(0.6215615793265203), 'rouge2': np.float64(0.5823896609995791), 'rougeL': np.float64(0.39069661554102597), 'rougeLsum': np.float64(0.4795879587686361)}


In [53]:
prompt = "Gout recorded the fourth-fastest under-18 100m time in history on Friday, clocking in at 10.04 seconds at the All-Schools Athletics Championships in Queensland.The run, which came in the heats but was wind-assisted and therefore does not count in official records, was also the fourth-fastest ever by an Australian sprinter of any age."
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).input_ids.to(device)


output_ids = model.generate(
    input_ids=input_ids,
    max_new_tokens=50,  # Limit the generated text length
    num_beams=5,        # Use beam search for better quality
    early_stopping=True  # Stop early if the model is confident in its prediction
)
summary = tokenizer.decode(output_ids[0])
print(summary[len(prompt):])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 It was the first time a sprinter has clocked in at 10.04 seconds at the All-Schools Athletics Championships.<|endoftext|>


In [ ]:
# PRAW from Reddit
!pip install praw
import praw

# Initialize Reddit API client
reddit = praw.Reddit(
    client_id="",  # use user's id
    client_secret="",  # use user's secret
    user_agent=""  # use user's agent specification
)
# Fetch posts from the "sports" subreddit
subreddit = reddit.subreddit("sports")
posts = []
for post in subreddit.hot(limit=10):  # Adjust the limit as needed
    # Fetch top-level comments
    comments = []
    post.comments.replace_more(limit=0)  # Load all top-level comments
    for comment in post.comments.list():
        comments.append(comment.body)
        if len(comments) >= 5:  # Limit to top 5 comments for brevity
            break

    posts.append({
        "title": post.title,
        "selftext": post.selftext.strip(),
        "url": post.url,
        "comments": comments
    })

# Display the posts
for idx, post in enumerate(posts):
    print(f"Post {idx + 1}: {post['title']}")
    if post['selftext']:
        print(f"Content: {post['selftext']}")
    else:
        print("Content: [No text content available]")
        print(f"Using comments: {post['comments']}")
    print(f"URL: {post['url']}")
    print()
# Test on Reddit
# Summarize Reddit posts
import nltk

nltk.download('punkt_tab')


def process_reddit_posts(posts):
    summarized_posts = []

    for post in posts:
        # Use content, comments, or title as input
        content = post["selftext"]
        if not content.strip():  # If no content, use comments
            content = ' '.join(post.get("comments", []))
        if not content.strip():  # If no comments, use the title
            content = post["title"]

        # Preprocess the content
        output_ids = model.generate(
        input_ids=input_ids,
        max_new_tokens=50,  # Limit the generated text length
        num_beams=5,        # Use beam search for better quality
        early_stopping=True  # Stop early if the model is confident in its prediction
        )
        summary = tokenizer.decode(output_ids[0])

        # Store the summarized post
        summarized_posts.append({
            "title": post["title"],
            "summary": summary,
            "url": post["url"]
        })

    return summarized_posts


# Summarize the fetched Reddit posts
summarized_reddit_posts = process_reddit_posts(posts)

# Display the summaries
for idx, post in enumerate(summarized_reddit_posts):
    print(f"Post {idx + 1}: {post['title']}")
    print(f"Summary: {post['summary']}")
    print(f"URL: {post['url']}")
