In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("QuickRead/pegasus-reddit")
model = AutoModelForSeq2SeqLM.from_pretrained("QuickRead/pegasus-reddit").to(device)

In [2]:
device

'cuda'

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('/teamspace/studios/this_studio/NLP_project/openai_reddit_final.csv', sep=';')

In [3]:
data = df['post/article']

In [4]:
data.info()

<class 'pandas.core.series.Series'>
RangeIndex: 83797 entries, 0 to 83796
Series name: post/article
Non-Null Count  Dtype 
--------------  ----- 
83797 non-null  object
dtypes: object(1)
memory usage: 654.8+ KB


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83797 entries, 0 to 83796
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    83797 non-null  int64 
 1   user_id       83797 non-null  object
 2   doc_id        83797 non-null  object
 3   user_profile  83797 non-null  object
 4   post/article  83797 non-null  object
 5   summary_text  83797 non-null  object
 6   confidence    83797 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 4.5+ MB


In [6]:
data = df[['doc_id', 'post/article']]

In [7]:
data

Unnamed: 0,doc_id,post/article
0,DC0,This is my first post so please be kind :)\n\n...
1,DC0,This is my first post so please be kind :)\n\n...
2,DC0,This is my first post so please be kind :)\n\n...
3,DC0,This is my first post so please be kind :)\n\n...
4,DC0,This is my first post so please be kind :)\n\n...
...,...,...
83792,DC1277,Hello World of Reddit\n\nI'm a 21 year old fem...
83793,DC1277,Hello World of Reddit\n\nI'm a 21 year old fem...
83794,DC1277,Hello World of Reddit\n\nI'm a 21 year old fem...
83795,DC1277,Hello World of Reddit\n\nI'm a 21 year old fem...


In [8]:
mapp = {}

for idx , row in data.iterrows():
    doc_id = row['doc_id']
    text = row['post/article']

    if doc_id not in mapp:
        mapp[doc_id] = text

In [9]:
len(mapp)

6218

In [10]:
ids = list(mapp.keys())
len(ids)

6218

In [11]:
unique_texts = list(mapp.values())
len(unique_texts)

6218

In [15]:
batch_size = 5

# Function to perform inference on a batch of texts
def translate_batch(texts):
    batch = tokenizer(texts, truncation=True, padding='longest', return_tensors="pt").to(device)
    translated = model.generate(**batch)
    tgt_texts = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_texts

In [16]:
results = []

In [28]:
src_texts = unique_texts[2790:]
len(src_texts)

3428

In [31]:
for i in range(0, len(src_texts), batch_size):
    batch_texts = src_texts[i:i + batch_size]
    translated_texts = translate_batch(batch_texts)
    results.extend(translated_texts)


In [32]:
len(results)

6218

In [14]:
import pickle

In [34]:
with open('results_full.pkl', 'wb') as f:
    pickle.dump(results, f)

In [12]:
gen_summaries = []

In [15]:
with open('results_full.pkl', 'rb') as f:
    gen_summaries = pickle.load(f)

In [19]:
'DC6217' in set(ids)

True

In [20]:
mapp = dict(zip(ids, gen_summaries))

In [23]:
len(mapp['DC0'])

260

In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83797 entries, 0 to 83796
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   doc_id        83797 non-null  object
 1   post/article  83797 non-null  object
dtypes: object(2)
memory usage: 1.3+ MB


In [36]:
len(data.iloc[0]['post/article'])

805

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83797 entries, 0 to 83796
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    83797 non-null  int64 
 1   user_id       83797 non-null  object
 2   doc_id        83797 non-null  object
 3   user_profile  83797 non-null  object
 4   post/article  83797 non-null  object
 5   summary_text  83797 non-null  object
 6   confidence    83797 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 4.5+ MB


In [32]:
df['pegasus_model_summary'] = df['doc_id'].map(mapp)

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83797 entries, 0 to 83796
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Unnamed: 0             83797 non-null  int64 
 1   user_id                83797 non-null  object
 2   doc_id                 83797 non-null  object
 3   user_profile           83797 non-null  object
 4   post/article           83797 non-null  object
 5   summary_text           83797 non-null  object
 6   confidence             83797 non-null  int64 
 7   pegasus_model_summary  83797 non-null  object
dtypes: int64(2), object(6)
memory usage: 5.1+ MB


In [34]:
df.head()

Unnamed: 0.1,Unnamed: 0,user_id,doc_id,user_profile,post/article,summary_text,confidence,pegasus_model_summary
0,0,U1,DC0,"DC1830,DC1586,DC1476,DC3950,DC3997,DC697,DC239...",This is my first post so please be kind :)\n\n...,"I never dated/flirted as an adult, now I'm no...",6,This is my first post so please be kind :) I k...
1,1,U1,DC0,"DC1439,DC2012,DC4143,DC6087,DC4746,DC288,DC594...",This is my first post so please be kind :)\n\n...,"Just came out of 8 year relationship, don't k...",7,This is my first post so please be kind :) I k...
2,2,U1,DC0,"DC1477,DC3565,DC2172,DC5606,DC21,DC2538,DC1835...",This is my first post so please be kind :)\n\n...,"I never dated/flirted as an adult, now I'm no...",7,This is my first post so please be kind :) I k...
3,3,U1,DC0,"DC5947,DC3242,DC1815,DC1762,DC5430,DC5839,DC52...",This is my first post so please be kind :)\n\n...,"Just came out of 8 year relationship, don't k...",4,This is my first post so please be kind :) I k...
4,4,U1,DC0,"DC4320,DC4991,DC3550,DC4693,DC1746,DC4996,DC16...",This is my first post so please be kind :)\n\n...,Started a long-term relationship as a teenage...,6,This is my first post so please be kind :) I k...


In [37]:
len(df.iloc[1]['post/article']), len(df.iloc[1]['pegasus_model_summary'])

(805, 260)

In [38]:
df.to_csv('openai_reddit_final_pegasus.csv', sep=';')