In [1]:
import pandas as pd

In [2]:
# Load the CSV of cleaned articles/highlights and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run on another machine until it points at a local copy of cleaned_data.csv.
data = pd.read_csv('/Users/sebastiansoderberg/Documents/CBS/Master/2nd Semester/NLP/NLP Exam/cleaned_data.csv')
print(data.head())

                                         id  \
0  1211a23db42e7dd52a43e564a899ebbf1b2d1251   
1  aff92aa2fc6e294efeeed70e9b0a0aefac32b030   
2  bb7757b7bf3d90ec4f1d1915a4699eecafc25d1a   
3  8f25b2c54cb563b98e1f2619401df7137a3f0c77   
4  daa6ca8fe860fa70bed00e2bfd05ab7414b7cb0e   

                                     cleaned_article  \
0  daily mail reporter published est june updated...   
1  lizzie parry mailonline young cancer victim to...   
2  rebel aided al qaedalinked militant seized con...   
3  cnn two right group launched stinging critique...   
4  cnn puffing electronic cigarette already nono ...   

                                  cleaned_highlights  
0  president barack obamas top national security ...  
1  sophie walton diagnosed bone cancer discoverin...  
2  israeli military closed area around crossing o...  
3  new trial resume kremlin critic aleksei navaln...  
4  government proposes explicitly ban use electro...  


In [3]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [4]:
from collections import defaultdict, Counter
import nltk
from nltk.util import ngrams
import random

In [5]:
# Draw 1,500 rows at random; random_state=42 fixes the draw so the same subset
# is selected on every run.
sampled_data = data.sample(n=1500, random_state=42)
print(sampled_data.head())

                                              id  \
59770   e44535866d0ffef16786e9ab2dccbf3c4d690778   
21362   2c7fb7b897db7f304961e919cd5ef1a5a93877a4   
127324  edf5aaf6c48a14789eec105b32a5ccb3efe5970a   
140509  47b7bda07fc0f09b7c8f4fe52e8d63ce22d0ee9e   
144297  7a37e591a3250e5791f1238e0092e728decf7100   

                                          cleaned_article  \
59770   gerri peev matt chorley published est may upda...   
21362   cnn five cent chinese yuan coin gave tan yuan ...   
127324  victoria woollaston keeping track polar bear p...   
140509  katie davy published est march updated est mar...   
144297  ted thornhill published est january updated es...   

                                       cleaned_highlights  
59770   poll reveals voter trust current leader le pre...  
21362   tan yuan yuan san francisco ballet principal d...  
127324  researcher tracked polar bear canada using hig...  
140509  celebration underway across u today day extrav...  
144297  jerome hauer

In [6]:
def generate_ngrams(text, n):
    """Split ``text`` on whitespace and return its consecutive word n-grams.

    Args:
        text: The document as a single string. Any non-string value (for
            example the NaN pandas produces for an empty CSV cell) is treated
            as an empty document instead of raising on ``.split()``.
        n: Number of words per n-gram.

    Returns:
        A list of ``n``-word tuples in document order; empty when the text has
        fewer than ``n`` words or is not a string.
    """
    if not isinstance(text, str):
        # Missing article: contribute no n-grams rather than abort the caller's loop.
        return []
    tokens = text.split()
    # Zipping n progressively shifted views of the token list yields the sliding
    # window of n consecutive tokens; zip stops at the shortest (last) view.
    return list(zip(*(tokens[offset:] for offset in range(n))))

def build_ngram_model(data, n=3):
    """Count, for every (n-1)-word context, which words follow it.

    Args:
        data: Object whose ``'cleaned_article'`` entry iterates over article texts.
        n: Size of the n-grams to count (default 3).

    Returns:
        A ``defaultdict(Counter)`` mapping each context tuple (the first
        ``n - 1`` words of an n-gram) to a Counter of the words seen right
        after that context.
    """
    transitions = defaultdict(Counter)
    for text in data['cleaned_article']:
        # Split each n-gram into its leading context and its final word.
        for *context, follower in generate_ngrams(text, n):
            transitions[tuple(context)][follower] += 1
    return transitions

# Build the trigram model (two-word context -> next-word counts) from the sampled articles.
ngram_model = build_ngram_model(sampled_data, n=3)


In [7]:
def generate_summary(model, start_words, max_length = 40, rng=None):
    """Generate text by repeatedly sampling the next word from an n-gram model.

    Args:
        model: Mapping from a context tuple of words to a Counter (or dict) of
            next-word counts. All keys are expected to have the same length.
        start_words: Seed phrase. Only its leading words are used, as many as
            the model's context length (two for a trigram model).
        max_length: Maximum number of words to produce, seed words included.
            The seed itself is never truncated.
        rng: Optional object with a ``choices`` method (e.g. a seeded
            ``random.Random``) for reproducible output. Defaults to the
            global ``random`` module.

    Returns:
        The generated words joined by single spaces. Generation stops early
        as soon as the current context is not a key of ``model``.
    """
    # Take the context length from the model itself so models built with
    # n != 3 work too; an empty model falls back to the trigram context of 2.
    first_key = next(iter(model), None)
    context_size = 2 if first_key is None else len(first_key)

    chooser = random if rng is None else rng
    summary = start_words.split()[:context_size]
    current_words = tuple(summary)

    for _ in range(max_length - len(summary)):
        if current_words not in model:
            break
        next_word_candidates = model[current_words]
        # Sample in proportion to how often each word followed this context.
        next_word = chooser.choices(
            list(next_word_candidates.keys()),
            weights=list(next_word_candidates.values()),
        )[0]
        summary.append(next_word)
        # Explicit start index: summary[-0:] would return the whole list when
        # the context length is zero.
        current_words = tuple(summary[len(summary) - context_size:])
    return ' '.join(summary)

In [8]:
# Example usage: generate text from a seed phrase with the trigram model.
start_words = 'daily mail reporter'
summary = generate_summary(ngram_model, start_words)
print(summary)

daily mail angry hurt jury discharged burst tear whole thing difficult frightening recognising bellfield court giving evidence dewani middle man tasked tongo recruiting killer spared prosecution return giving evidence mcmurray senior told jury snaresbrook crown court august driver bus told


In [9]:
pip install rouge

Note: you may need to restart the kernel to use updated packages.


In [10]:
from rouge import Rouge
def evaluate_summary(reference, generated):
    """Score a generated summary against its reference with ROUGE.

    Args:
        reference: The reference summary text.
        generated: The generated summary text (passed to ROUGE as the hypothesis).

    Returns:
        Whatever ``Rouge.get_scores`` returns with ``avg=True``.
    """
    return Rouge().get_scores(generated, reference, avg=True)

# Generate and evaluate summaries for the sampled data
# NOTE(review): ngram_model was built from these same sampled articles, so the
# scores below measure in-sample overlap rather than performance on unseen text.
results = []
for index, row in sampled_data.iterrows():
    # Seed generation with the first three words of the article.
    start_words = ' '.join(row['cleaned_article'].split()[:3])
    generated_summary = generate_summary(ngram_model, start_words)
    reference_summary = row['cleaned_highlights']
    scores = evaluate_summary(reference_summary, generated_summary)
    # Keep everything needed to inspect this article's result later.
    results.append({
        'id': row['id'],
        'generated_summary': generated_summary,
        'reference_summary': reference_summary,
        'rouge_scores': scores
    })

# Sort results by ROUGE-1 F1 score in descending order
sorted_results = sorted(results, key=lambda x: x['rouge_scores']['rouge-1']['f'], reverse=True)

In [11]:
# Show the ten best-scoring results
top_results = sorted_results[:10]

for result in top_results:
    # One field per line, followed by the same blank spacing between entries.
    print(
        f"Article ID: {result['id']}",
        f"Generated Summary: {result['generated_summary']}",
        f"Reference Summary: {result['reference_summary']}",
        f"ROUGE Scores: {result['rouge_scores']}",
        "\n",
        sep="\n",
    )

# Legend for the ROUGE metrics printed above
print(
    "Explanation of ROUGE scores:",
    "ROUGE-1, ROUGE-2, and ROUGE-L measure different aspects of overlap between the generated and reference summaries.",
    " - 'r' (Recall): Measures how much of the reference summary is captured by the generated summary.",
    " - 'p' (Precision): Measures how much of the generated summary is relevant compared to the reference summary.",
    " - 'f' (F1 Score): Harmonic mean of precision and recall, providing a balance between the two.",
    sep="\n",
)


Article ID: 10886eec4f22bc5366ff9653746dd67b5d8ec874
Generated Summary: spelling error including trail instead trial sited rather cited manger instead manager yearold taught science served freshman girl basketball coach academy school student said hoped case would accelerate slow slow said start one omarska guard testified released served twothirds sevenyear
Reference Summary: spelling error including trail instead trial sited rather cited manger instead manager yearold taught science new york year
ROUGE Scores: {'rouge-1': {'r': 0.8235294117647058, 'p': 0.3888888888888889, 'f': 0.5283018824350303}, 'rouge-2': {'r': 0.8235294117647058, 'p': 0.358974358974359, 'f': 0.49999999577168375}, 'rouge-l': {'r': 0.8235294117647058, 'p': 0.3888888888888889, 'f': 0.5283018824350303}}


Article ID: 6cd1aa494e1c18d98eba4ddb2fa68c6557858479
Generated Summary: cnn peter hedblom overcame finalround charge fellow swedish golfer martin erlandsson win johnnie walker championship one shot adrift thirdround