## PREPARING THE TRAINING DATA

In [2]:
import pandas as pd
import spacy
from tqdm import tqdm

In [3]:
synthetic_original_df = pd.read_csv("../../datasets/synthetic-original-augmented.csv")
summary_csv_df = pd.read_csv("../../summ.csv")
news_df = pd.read_csv("../../PENS/news_min.tsv",sep = '\t')
nouns_df = pd.read_csv("../../expm/PENS/nouns.csv")

In [4]:
synthetic_original_df.head()

Unnamed: 0,UserID,Docs,Action,Summaries
0,U335175,"['N41340', 'N55476', 'N119282', 'N78688', 'N31...","['click', 'click', 'skip', 'click', 'click', '...",2
1,U146053,"['N95202', 'N84319', 'N92854', 'N43563', 'N900...","['click', 'click', 'skip', 'gen_summ', 'click'...",5
2,U158889,"['N84182', 'N72110', 'N122127', 'N24095', 'N96...","['skip', 'skip', 'skip', 'skip', 'skip', 'skip...",0
3,U22232,"['N25386', 'N90820', 'N35729', 'N32113', 'N837...","['click', 'skip', 'click', 'click', 'skip', 's...",1
4,U32515,"['N55509', 'N111634', 'N15992', 'N78883', 'N27...","['click', 'skip', 'click', 'click', 'click', '...",3


In [5]:
summary_csv_df

Unnamed: 0,SummID,NewsID,UserID,Summary
0,S1,N24324,U335175,"Heat Wave Is Going to Hit Europe, Prepare to C..."
1,S2,N93272,U335175,Everything you need to know to plan your dream...
2,S3,N43563,U146053,Baby Boomers Love City Life as Much As Millenials
3,S4,N97393,U146053,Piping plover forces Sandy Hook to cancel all ...
4,S5,N104663,U146053,Trip Advisor's Guide to Eating America: for th...
...,...,...,...,...
777529,S777530,N51965,U215323,Ranking Penguin's possible picks for NHL draft
777530,S777531,N49096,U215323,Homeland Still Exposed to ISIS
777531,S777532,N44501,U215323,US-Iran tension in Photos
777532,S777533,N59444,U215323,Former Steeler's RB Le'Veon Bell left bare as ...


In [6]:
news_df

Unnamed: 0,News ID,Category,Topic,Headline,News body
0,N10000,sports,soccer,Predicting Atlanta United's lineup against Col...,"Only FIVE internationals allowed, count em, FI..."
1,N10001,news,newspolitics,Mitch McConnell: DC statehood push is 'full bo...,WASHINGTON -- Senate Majority Leader Mitch McC...
2,N10002,news,newsus,Home In North Highlands Damaged By Fire,NORTH HIGHLANDS (CBS13) Fire damaged a home ...
3,N10003,news,newspolitics,Meghan McCain blames 'liberal media' and 'thir...,Meghan McCain is speaking out after a journali...
4,N10004,news,newsworld,Today in History: Aug 1,"1714: George I becomes King Georg Ludwig, Elec..."
...,...,...,...,...,...
113757,N123757,sports,soccer_fifa_wwc,Hope who? Alyssa Naeher's penalty save sends U...,"LYON, France At the conclusion of the United..."
113758,N123758,sports,baseball_mlb,Chris Sale Explains What Specifically Has Gone...,The first half of Chris Sale's season could be...
113759,N123759,sports,basketball_nba_videos,Raptor fans jam streets to celebrate 1st NBA t...,Canadians are celebrating the country's first ...
113760,N123760,news,newspolitics,Judge won't allow Flynn to fire his attorneys,A federal judge denied the request by Michael ...


In [15]:
nouns_df['Noun Phrases'][2]

'Baby Boomers Love City Life, Millenials'

In [16]:
updated_df['Noun Phrases'][2]

'Phoenix, For, Baby, U, Rent, York, City, "The, Arizona, Vegas,, Austin, You, Florida, Census, "This, Florida,, Two, The, Las, New, S, Cafe, However,'

## Associated Noun Phrase Generation

In [16]:
nlp = spacy.load('en_core_web_sm')

def extract_noun_phrases(summary):
    doc = nlp(summary)
    noun_phrases = [chunk.text for chunk in doc.noun_chunks]
    return ', '.join(noun_phrases)


tqdm.pandas()

In [17]:
summary_csv_df['Noun Phrases'] = summary_csv_df['Summary'].progress_apply(extract_noun_phrases)

100%|██████████| 777534/777534 [52:39<00:00, 246.07it/s]  


In [18]:
summary_csv_df.to_csv("nouns.csv", index = False)

In [None]:
#df = pd.open

## Generating the Noun Phrases from the Original Document

In [8]:
# Function to extract sentences containing noun phrases
def get_sentences_with_nouns(news_body, noun_phrases):
    sentences = news_body.split('.')  # Assuming sentences are separated by periods.
    sentences_with_nouns = []
    
    for sentence in sentences:
        for noun in noun_phrases:
            if noun in sentence:
                sentences_with_nouns.append(sentence)
                break  # Once we find a noun in a sentence, we move to the next sentence.
    return sentences_with_nouns

# Function to extract nouns from sentences
def extract_nouns_from_sentences(sentences):
    # Placeholder for actual noun extraction logic, this can be done using NLP libraries like spaCy or NLTK
    extracted_nouns = set()
    for sentence in sentences:
        words = sentence.split() 
        nouns = [word for word in words if word.istitle()]  # Example heuristic to identify nouns
        extracted_nouns.update(nouns)
    return extracted_nouns


In [11]:
# Iterate through noun_df and match NewsID with news_df
for index, row in tqdm(nouns_df.iterrows(), total=nouns_df.shape[0]):
    news_id = row['NewsID']
    
    # Find the corresponding row in news_df
    news_row = news_df[news_df['News ID'] == news_id]
    
    if not news_row.empty:
        news_body = news_row['News body'].values[0]
        noun_phrases = row['Noun Phrases']
        if isinstance(noun_phrases, str):
            noun_phrases = noun_phrases.split(', ')  # Assuming noun phrases are separated by commas
        else:
            noun_phrases = []
        # Extract sentences containing the noun phrases
        sentences_with_nouns = get_sentences_with_nouns(news_body, noun_phrases)

        # Extract all nouns in the same sentence
        extracted_nouns = extract_nouns_from_sentences(sentences_with_nouns)

        # Update the noun_df with the new extracted nouns
        nouns_df.at[index, 'Noun Phrases'] = ', '.join(list(extracted_nouns))

# Save the updated dataframe
nouns_df.to_csv('updated_nouns_df.csv', index=False)

100%|██████████| 777534/777534 [1:00:48<00:00, 213.09it/s]


In [4]:
updated_df = pd.read_csv("../../expm/PENS/updated_nouns_df.csv")

In [27]:
updated_df

Unnamed: 0,SummID,NewsID,UserID,Summary,Noun Phrases
0,S1,N24324,U335175,"Heat Wave Is Going to Hit Europe, Prepare to C...","Africa, A, Europe,, Europe, Additionally,, Jun..."
1,S2,N93272,U335175,Everything you need to know to plan your dream...,"Garden, U, Prince), Part, Gulfport-Biloxi,, *C..."
2,S3,N43563,U146053,Baby Boomers Love City Life as Much As Millenials,"Phoenix, For, Baby, U, Rent, York, City, ""The,..."
3,S4,N97393,U146053,Piping plover forces Sandy Hook to cancel all ...,"Jen, They, Pier, Jirks,, Wednesday,, Wednesday..."
4,S5,N104663,U146053,Trip Advisor's Guide to Eating America: for th...,"""Fine, Garden, B, U, Or, Grill, Thomas, Markjf..."
...,...,...,...,...,...
777529,S777530,N51965,U215323,Ranking Penguin's possible picks for NHL draft,
777530,S777531,N49096,U215323,Homeland Still Exposed to ISIS,"Bakr, Iraq, They, Abu, State, According, Mousa..."
777531,S777532,N44501,U215323,US-Iran tension in Photos,"States, Iran, United, Photos"
777532,S777533,N59444,U215323,Former Steeler's RB Le'Veon Bell left bare as ...,


In [24]:
len(nouns_df['Noun Phrases'][4])

3400

In [25]:
len(updated_df['Noun Phrases'][4])

3400

In [5]:
differences = pd.merge(nouns_df, updated_df, on='Noun Phrases', how='left', indicator=True)
unique_to_nouns_df = differences[differences['_merge'] == 'left_only']
print(unique_to_nouns_df)


          SummID_x NewsID_x UserID_x  \
0               S1   N24324  U335175   
1               S2   N93272  U335175   
2               S3   N43563  U146053   
3               S4   N97393  U146053   
4               S5  N104663  U146053   
...            ...      ...      ...   
106742370  S777530   N51965  U215323   
106742371  S777531   N49096  U215323   
106742372  S777532   N44501  U215323   
106742373  S777533   N59444  U215323   
106742374  S777534   N81907  U215323   

                                                   Summary_x  \
0          Heat Wave Is Going to Hit Europe, Prepare to C...   
1          Everything you need to know to plan your dream...   
2          Baby Boomers Love City Life as Much As Millenials   
3          Piping plover forces Sandy Hook to cancel all ...   
4          Trip Advisor's Guide to Eating America: for th...   
...                                                      ...   
106742370     Ranking Penguin's possible picks for NHL draft   
1067423