In [1]:
# %pip install pandas
# %pip install numpy
# %pip install contractions
# %pip install bs4
# %pip install nltk
# %pip install unidecode
# %pip install matplotlib
# %pip install scikit-learn
# %pip install transformers
# %pip install torch
# %pip install rouge

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from unidecode import unidecode
import nltk
import contractions
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
import tqdm
from torch.optim import Adam
from rouge import Rouge
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

In [7]:
dataset = pd.read_csv("/Users/nalishjain/Acad Sem 6/IR/CSE508_Winter2024_A4_2021543./archive/Reviews.csv")
dataset.dropna()


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [8]:
def expand_acronyms(text):
    expanded_words = []
    for word in text.split():
        expanded_words.append(contractions.fix(word))
    expanded_text = ' '.join(expanded_words)
    return expanded_text

def remove_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_accented_chars(text):
    return unidecode(text)


def normalize_text(text):
    text = str(text)
    text = text.lower()
    text = expand_acronyms(text)
    text = remove_html_tags(text)
    text = remove_accented_chars(text)
    text = re.sub('\s\W',' ',text)
    text = re.sub('\W,\s',' ',text)
    text = re.sub("\d+", "", text)
    text = re.sub('\s+',' ',text)
    return text



In [11]:
dataset = dataset[:15000]

In [12]:
training_data = []
testing_data = []
training_reviews = []
summary_token = ' TL;DR '

for id in range(int(len(dataset)*0.75)):
  # print(id,"/",int(len(dataset)*0.75))
  text = normalize_text(dataset['Text'][id])
  summary = normalize_text(dataset['Summary'][id])
  training_data.append([text, summary])
  training_reviews.append(text + summary_token + summary)

for id in range(int(len(dataset)*0.75),len(dataset)):
  # print(id - int(len(dataset)*0.75),"/",int(len(dataset)*0.25))
  text = normalize_text(dataset['Text'][id])
  summary = normalize_text(dataset['Summary'][id])
  testing_data.append([text,summary])

  soup = BeautifulSoup(text, "html.parser")


In [13]:
class SummaryData(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        tokenizer.pad_token = '[PAD]'
        self.X_encoded = tokenizer(self.data,max_length=100, truncation=True, padding="max_length", return_tensors="pt")
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_mask[idx])


In [14]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
device = "cuda" if torch.cuda.is_available()  else "cpu"

In [15]:
summary_data = SummaryData(training_reviews, tokenizer)

In [16]:
def train(data_loader, model, optim):

    epochs = 10

    for i in range(epochs):
        train_loss = 0
        for input_ids, att_mask in tqdm.tqdm(data_loader):
            input_ids = input_ids.to(device)
            att_mask = att_mask.to(device)
            optim.zero_grad()
            loss = model(input_ids, attention_mask=att_mask, labels=input_ids).loss
            train_loss += loss
            loss.backward()
            optim.step()
            torch.cuda.empty_cache()

        print("Epoch: ",i," train_loss", loss)

In [17]:
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

data_loader =  DataLoader(summary_data, batch_size=32)
model.train()
optim = Adam(model.parameters(), lr=3e-4)


In [None]:
train(data_loader, model, optim)
torch.save(model.state_dict(), "/content/drive/MyDrive/gpt_model.pt")

In [18]:
def topk_summaries(probability, n=9):
    probability = torch.softmax(probability, dim= -1)
    token_probabilities, top_ = torch.topk(probability, k=n)
    token_probabilities = token_probabilities / torch.sum(token_probabilities)
    token_probabilities = token_probabilities.cpu().detach().numpy()
    choice = np.random.choice(n, 1, p = token_probabilities)
    tokenId = top_[choice][0]
    return int(tokenId)

In [19]:
def model_infer(model, tokenizer, review, device):
    max_length = 20
    model.eval()
    encoded_text = tokenizer.encode(review)
    summary = encoded_text
    input = torch.tensor(encoded_text).unsqueeze(0).to(device)

    with torch.no_grad():
        output = model(input)
        logits = output.logits[0,-1]
        summary.append(topk_summaries(logits))

        for step in range(max_length):
            input = torch.tensor(summary).unsqueeze(0).to(device)
            output = model(input)
            logits = output.logits[0,-1]
            word_index = topk_summaries(logits)

            if word_index == tokenizer.eos_token_id:
                return tokenizer.decode(summary)
            else: 
                summary.append(word_index)

    return tokenizer.decode(summary)

In [21]:
summary = model_infer(model, tokenizer, training_data[30][0] + "TL;DR", device).split("TL;DR")[1].strip()


In [24]:
loaded_model = GPT2LMHeadModel.from_pretrained("gpt2")
loaded_model.resize_token_embeddings(len(tokenizer))
state_dict_path = "gpt_model.pt"
state_dict = torch.load(state_dict_path, map_location=torch.device('cpu'))
loaded_model.load_state_dict(state_dict)
loaded_model = loaded_model.to(device)


In [25]:
def compute_rouge_scores(testing_data, model):
    rouge = Rouge()
    hypotheses = []
    references = []
    for i in range(len(testing_data)):
        print(i, "/", len(testing_data))
        hypotheses.append(model_infer(model, tokenizer, testing_data[i][0] + "TL;DR", device).split("TL;DR")[1].strip())
        references.append(testing_data[i][1])
    scores = rouge.get_scores(hypotheses, references, avg=True)
    return scores


rouge_scores = compute_rouge_scores(testing_data, loaded_model)



0 / 3750
1 / 3750
2 / 3750
3 / 3750
4 / 3750
5 / 3750
6 / 3750
7 / 3750
8 / 3750
9 / 3750
10 / 3750
11 / 3750
12 / 3750
13 / 3750
14 / 3750
15 / 3750
16 / 3750
17 / 3750
18 / 3750
19 / 3750
20 / 3750
21 / 3750
22 / 3750
23 / 3750
24 / 3750
25 / 3750
26 / 3750
27 / 3750
28 / 3750
29 / 3750
30 / 3750
31 / 3750
32 / 3750
33 / 3750
34 / 3750
35 / 3750
36 / 3750
37 / 3750
38 / 3750
39 / 3750
40 / 3750
41 / 3750
42 / 3750
43 / 3750
44 / 3750
45 / 3750
46 / 3750
47 / 3750
48 / 3750
49 / 3750
50 / 3750
51 / 3750
52 / 3750
53 / 3750
54 / 3750
55 / 3750
56 / 3750
57 / 3750
58 / 3750
59 / 3750
60 / 3750
61 / 3750
62 / 3750
63 / 3750
64 / 3750
65 / 3750
66 / 3750
67 / 3750
68 / 3750
69 / 3750
70 / 3750
71 / 3750
72 / 3750
73 / 3750
74 / 3750
75 / 3750
76 / 3750
77 / 3750
78 / 3750
79 / 3750
80 / 3750
81 / 3750
82 / 3750
83 / 3750
84 / 3750
85 / 3750
86 / 3750
87 / 3750
88 / 3750
89 / 3750
90 / 3750
91 / 3750
92 / 3750
93 / 3750
94 / 3750
95 / 3750
96 / 3750
97 / 3750
98 / 3750
99 / 3750
100 / 3750

In [None]:
print(rouge_scores)
# {'rouge-1': {'r': 0.08101238032120356, 'p': 0.10352501128501104, 'f': 0.08246468747314054}, 'rouge-2': {'r': 0.0160325925925926, 'p': 0.019111120731120736, 'f': 0.015195765047813517}, 'rouge-l': {'r': 0.0802342850831083, 'p': 0.1026937847337845, 'f': 0.08176597485678087}}
