In [87]:
# ---------------------------------------------------------
# IMPORTS
# ---------------------------------------------------------
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import joblib

In [88]:
# ---------------------------------------------------------
# TEXT CLEANING FUNCTION
# ---------------------------------------------------------
import string
def clean_text(text):

    if not isinstance(text, str):
        return ""

    # Remove HTML
    text = re.sub(r"<.*?>", " ", text)

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", " ", text)

    # Remove emojis
    text = re.sub(
        "[" 
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        "]+", 
        "", 
        text
    )

    # Remove special symbols
    text = re.sub(r"[^a-zA-Z0-9\s.,!?]", " ", text)

    # Keep only allowed characters
    allowed = set(string.ascii_letters + string.digits + " .,!?")
    text = "".join(ch for ch in text if ch in allowed)

    # Lowercase
    text = text.lower()

    # Normalize whitespace
    text = " ".join(text.split())

    return text

In [89]:
# ---------------------------------------------------------
# LOAD DATA
# ---------------------------------------------------------
df = pd.read_csv("D:/Python_WC/Final_project/Multi-Task_News_Intelligence_System/Data/news.tsv", sep="\t")

df["text"] = df["Headline"].fillna("") + " " + df["News body"].fillna("")
df = df.rename(columns={"Category": "label"}).dropna()

# Clean text
df["clean_text"] = df["text"].apply(clean_text)

# This is required for evaluation
df["full_text"] = df["text"]   # FIXED

In [90]:
#Tokenization & Stopword Handling
import nltk
nltk.download("punkt")
nltk.download("stopwords")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dhiya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhiya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [91]:
#Sentence & word tokenization
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

def tokenize_and_remove_stopwords(text):
    words = word_tokenize(text)
    words = [w for w in words if w not in stop_words and w.isalpha()]
    return words


In [92]:
#Apply tokenization (optional column)
df["tokens"] = df["clean_text"].apply(tokenize_and_remove_stopwords)


In [93]:
#Truncate text lengths
MAX_ARTICLE_TOKENS = 512
MAX_SUMMARY_TOKENS = 128

def truncate_text(text, max_tokens):
    tokens = word_tokenize(text)
    tokens = tokens[:max_tokens]
    return " ".join(tokens)

df["article_trunc"] = df["clean_text"].apply(
    lambda x: truncate_text(x, MAX_ARTICLE_TOKENS)
)



In [94]:
df["article_trunc"]

0         predicting atlanta united s lineup against col...
1         mitch mcconnell dc statehood push is full bore...
2         home in north highlands damaged by fire north ...
3         meghan mccain blames liberal media and third w...
4         today in history aug 1 1714 george i becomes k...
                                ...                        
113757    hope who ? alyssa naeher s penalty save sends ...
113758    chris sale explains what specifically has gone...
113759    raptor fans jam streets to celebrate 1st nba t...
113760    judge won t allow flynn to fire his attorneys ...
113761    worley thinks he and conley will rival greates...
Name: article_trunc, Length: 113704, dtype: object

In [95]:
#Vectorization / Feature Representations
#TF-IDF (for extractive summarization)
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words="english"
)

tfidf_matrix = tfidf_vectorizer.fit_transform(df["clean_text"])



In [96]:
#Extractive Summarization (TF-IDF Baseline)
#TF-IDF sentence scorer
import numpy as np

def tfidf_extractive_summary(text, num_sentences=3):
    sentences = sent_tokenize(text)

    if len(sentences) <= num_sentences:
        return text

    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf = vectorizer.fit_transform(sentences)

    sentence_scores = np.array(tfidf.sum(axis=1)).flatten()
    top_sentence_indices = sentence_scores.argsort()[-num_sentences:]
    top_sentence_indices.sort()

    summary = " ".join([sentences[i] for i in top_sentence_indices])
    return summary


In [97]:
#Generate summaries
df["tfidf_summary"] = df["full_text"].apply(
    lambda x: tfidf_extractive_summary(x, num_sentences=3)
)


In [98]:
#Word2Vec (Static Embeddings)
#Train Word2Vec
from gensim.models import Word2Vec

sentences = df["tokens"].tolist()

w2v_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4
)


In [99]:
#Create sentence vectors (average embeddings)
import numpy as np

def sentence_vector(sentence_tokens, model, vector_size=100):
    vectors = [
        model.wv[word]
        for word in sentence_tokens
        if word in model.wv
    ]

    if len(vectors) == 0:
        return np.zeros(vector_size)

    return np.mean(vectors, axis=0)

df["w2v_vector"] = df["tokens"].apply(
    lambda x: sentence_vector(x, w2v_model)
)


In [100]:
#Transformer Summarization (BART)
#Load BART summarizer
from transformers import pipeline

summarizer = pipeline(
    "summarization",
    model="facebook/bart-base",
    tokenizer="facebook/bart-base"
)




In [102]:
#Extractive Baseline (TF-IDF-based)
from nltk.tokenize import sent_tokenize

#TF-IDF Sentence Scoring
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def tfidf_extractive_summary(text, top_k=3):
    sentences = sent_tokenize(text)

    if len(sentences) <= top_k:
        return text

    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(sentences)

    scores = tfidf_matrix.sum(axis=1).A1
    top_sentence_ids = scores.argsort()[-top_k:]
    top_sentence_ids.sort()

    summary = " ".join([sentences[i] for i in top_sentence_ids])
    return summary



In [103]:
#Generate Extractive Summaries
df["tfidf_summary"] = df["full_text"].apply(
    lambda x: tfidf_extractive_summary(x, top_k=3)
)


In [104]:
df["tfidf_summary"]

0         We've seen how he rotates (or doesn't rotate) ...
1         Mitch McConnell: DC statehood push is 'full bo...
2         Home In North Highlands Damaged By Fire NORTH ...
3         Meghan McCain blames 'liberal media' and 'thir...
4         1798: Battle of Nile begins Battle of Nile, al...
                                ...                        
113757    No, when the final whistle sounded, the entire...
113758    In his last start before the All-Star break, S...
113759    Raptor fans jam streets to celebrate 1st NBA t...
113760    Attorneys for President Trump's former nationa...
113761    The kind of season they had overall lends litt...
Name: tfidf_summary, Length: 113704, dtype: object

In [105]:
# ---------------------------------------------------------
# TEXT RANK SUMMARIZER
# ---------------------------------------------------------
def textrank_summarize(text, top_n=3):
    cleaned = clean_text(text)
    sentences = sent_tokenize(cleaned)

    if len(sentences) <= top_n:
        return " ".join(sentences)

    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(sentences).toarray()

    sim_matrix = cosine_similarity(vectors)

    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    ranked = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    summary = " ".join([s for _, s in ranked[:top_n]])

    return summary

In [106]:
# ---------------------------------------------------------
# TF-IDF SENTENCE SCORING
# ---------------------------------------------------------
def tfidf_summarize(text, top_n=3):
    cleaned = clean_text(text)
    sentences = sent_tokenize(cleaned)

    if len(sentences) <= top_n:
        return " ".join(sentences)

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences)

    scores = tfidf_matrix.mean(axis=1).A.flatten()

    ranked_idx = np.argsort(scores)[::-1]
    selected = [sentences[i] for i in ranked_idx[:top_n]]

    return " ".join(selected)

# ---------------------------------------------------------
# REFERENCE SUMMARY (WEAK BASELINE)
# ---------------------------------------------------------
def reference_summary(text):
    sents = sent_tokenize(clean_text(text))
    return " ".join(sents[:2])

In [107]:
# ---------------------------------------------------------
# ROUGE EVALUATION
# ---------------------------------------------------------
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

def evaluate_model(summarizer_fn, df, samples=50):

    rouge1_scores, rouge2_scores, rougeL_scores = [], [], []

    for i in range(min(samples, len(df))):

        text = df.iloc[i]["full_text"]
        ref = reference_summary(text)
        pred = summarizer_fn(text)

        scores = scorer.score(ref, pred)

        rouge1_scores.append(scores["rouge1"].fmeasure)
        rouge2_scores.append(scores["rouge2"].fmeasure)
        rougeL_scores.append(scores["rougeL"].fmeasure)

    return {
        "rouge1": np.mean(rouge1_scores),
        "rouge2": np.mean(rouge2_scores),
        "rougeL": np.mean(rougeL_scores),
    }

In [108]:
# ---------------------------------------------------------
# RUN EVALUATION
# ---------------------------------------------------------
print("Evaluating TextRank...")
textrank_scores = evaluate_model(textrank_summarize, df)

print("Evaluating TF-IDF...")
tfidf_scores = evaluate_model(tfidf_summarize, df)


# ---------------------------------------------------------
# SAVE RESULTS TO CSV
# ---------------------------------------------------------
OUTPUT_CSV = "rouge_eval_results.csv"  # change if needed

new_results = pd.DataFrame([
    {
        "Model": "TextRank",
        "rouge1": textrank_scores["rouge1"],
        "rouge2": textrank_scores["rouge2"],
        "rougeL": textrank_scores["rougeL"],
        "rougeLsum": textrank_scores["rougeL"],
        "Average Score": np.mean([
            textrank_scores["rouge1"],
            textrank_scores["rouge2"],
            textrank_scores["rougeL"],
        ])
    },
    {
        "Model": "TF-IDF",
        "rouge1": tfidf_scores["rouge1"],
        "rouge2": tfidf_scores["rouge2"],
        "rougeL": tfidf_scores["rougeL"],
        "rougeLsum": tfidf_scores["rougeL"],
        "Average Score": np.mean([
            tfidf_scores["rouge1"],
            tfidf_scores["rouge2"],
            tfidf_scores["rougeL"],
        ])
    }
])

new_results = new_results.round(4)

# If CSV exists → append, else create new
try:
    old_df = pd.read_csv(OUTPUT_CSV)
    final_df = pd.concat([old_df, new_results], ignore_index=True)
except FileNotFoundError:
    final_df = new_results

final_df.to_csv(OUTPUT_CSV, index=False)

print("\nEvaluation Completed.\nSaved to:", OUTPUT_CSV)

Evaluating TextRank...
Evaluating TF-IDF...

Evaluation Completed.
Saved to: rouge_eval_results.csv


In [109]:
import pandas as pd
df1=pd.read_csv("D:/Python_WC/Final_project/Multi-Task_News_Intelligence_System/Summarization/rouge_eval_results.csv")
df1

Unnamed: 0,Model,rouge1,rouge2,rougeL,rougeLsum,Average Score
0,TextRank,0.5028,0.3924,0.4402,0.4402,0.4451
1,TF-IDF,0.4836,0.3779,0.4144,0.4144,0.4253


*DL*

In [25]:
# ---------------------------------------------------------
# TEXT CLEANING FUNCTION
# ---------------------------------------------------------
def dl_clean_text(text):
    if not isinstance(text, str):
        return ""

    text = re.sub(r"<.*?>", " ", text)                      # Remove HTML
    text = re.sub(r"http\S+|www\S+|https\S+", " ", text)   # Remove URLs

    # Remove emojis
    text = re.sub("[" 
                  u"\U0001F600-\U0001F64F"
                  u"\U0001F300-\U0001F5FF"
                  u"\U0001F680-\U0001F6FF"
                  u"\U0001F1E0-\U0001F1FF"
                  "]+", "", text)

    text = re.sub(r"[^a-zA-Z0-9\s.,!?]", " ", text)        # Special chars
    allowed = set(string.ascii_letters + string.digits + " .,!?")
    text = "".join(ch for ch in text if ch in allowed)

    text = text.lower()
    text = " ".join(text.split())
    return text

In [26]:
# ---------------------------------------------------------
# LOAD DATA (PENS)
# ---------------------------------------------------------
df = pd.read_csv("../data/news.tsv", sep="\t")

df["article"] = df["News body"].fillna("").apply(dl_clean_text)
df["summary"] = df["Headline"].fillna("").apply(dl_clean_text)

df = df[(df["article"].str.len() > 0) & (df["summary"].str.len() > 0)]

# Add special tokens
df["summary_in"]  = "<sos> " + df["summary"]
df["summary_out"] = df["summary"] + " <eos>"

In [27]:
#Prepare Tokenizer (Seq2Seq)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
MAX_ART_LEN = 500     # encoder input length
MAX_SUM_LEN = 50      # decoder input/output length
VOCAB_SIZE = 30000
tokenizer = Tokenizer(
    num_words=VOCAB_SIZE,
    oov_token="<unk>",
    filters=""   # VERY IMPORTANT: keeps <sos> and <eos>
)

tokenizer.fit_on_texts(
    df["article"].tolist() +
    df["summary_in"].tolist() +
    df["summary_out"].tolist()
)
encoder_input_seq = tokenizer.texts_to_sequences(df["article"])
decoder_input_seq = tokenizer.texts_to_sequences(df["summary_in"])
decoder_output_seq = tokenizer.texts_to_sequences(df["summary_out"])
encoder_input_seq = pad_sequences(
    encoder_input_seq,
    maxlen=MAX_ART_LEN,
    padding="post",
    truncating="post"
)

decoder_input_seq = pad_sequences(
    decoder_input_seq,
    maxlen=MAX_SUM_LEN,
    padding="post",
    truncating="post"
)

decoder_output_seq = pad_sequences(
    decoder_output_seq,
    maxlen=MAX_SUM_LEN,
    padding="post",
    truncating="post"
)
vocab_size = min(VOCAB_SIZE, len(tokenizer.word_index) + 1)
print("Vocabulary Size:", vocab_size)


Vocabulary Size: 30000


In [28]:
#Build Encoder–Decoder with Attention (LSTM)
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Embedding, LSTM, Dense, Attention, Concatenate
)
from tensorflow.keras.models import Model

#Model hyperparameters
EMB_DIM = 128
LATENT_DIM = 256

#Encoder
# Encoder input
encoder_inputs = Input(shape=(MAX_ART_LEN,), name="encoder_inputs")

# Encoder embedding
encoder_embedding = Embedding(
    input_dim=vocab_size,
    output_dim=EMB_DIM,
    mask_zero=True,
    name="encoder_embedding"
)(encoder_inputs)

# Encoder LSTM
encoder_lstm = LSTM(
    LATENT_DIM,
    return_sequences=True,
    return_state=True,
    name="encoder_lstm"
)

encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

#Decoder
# Decoder input
decoder_inputs = Input(shape=(MAX_SUM_LEN,), name="decoder_inputs")

# Decoder embedding
decoder_embedding = Embedding(
    input_dim=vocab_size,
    output_dim=EMB_DIM,
    mask_zero=True,
    name="decoder_embedding"
)(decoder_inputs)

# Decoder LSTM
decoder_lstm = LSTM(
    LATENT_DIM,
    return_sequences=True,
    return_state=True,
    name="decoder_lstm"
)

decoder_outputs, _, _ = decoder_lstm(
    decoder_embedding,
    initial_state=[state_h, state_c]
)

#Attention Layer
attention = Attention(name="attention_layer")

context_vector = attention(
    [decoder_outputs, encoder_outputs]
)
#Concatenate + Output
decoder_concat = Concatenate(axis=-1, name="concat_layer")(
    [decoder_outputs, context_vector]
)

decoder_dense = Dense(
    vocab_size,
    activation="softmax",
    name="output_layer"
)

decoder_outputs = decoder_dense(decoder_concat)



In [29]:
#Build & compile model
model = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs
)

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()


In [30]:
#Training with Teacher Forcing
#Shift decoder output
decoder_target_seq = decoder_output_seq[..., None]
#Train model
history = model.fit(
    [encoder_input_seq, decoder_input_seq],
    decoder_target_seq,
    batch_size=32,
    epochs=5,
    validation_split=0.1
)


Epoch 1/5
[1m3198/3198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2617s[0m 817ms/step - accuracy: 0.1572 - loss: 6.5085 - val_accuracy: 0.1935 - val_loss: 5.7941
Epoch 2/5
[1m3198/3198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2234s[0m 698ms/step - accuracy: 0.2205 - loss: 5.3112 - val_accuracy: 0.2331 - val_loss: 5.2236
Epoch 3/5
[1m3198/3198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2444s[0m 764ms/step - accuracy: 0.2618 - loss: 4.6473 - val_accuracy: 0.2542 - val_loss: 4.9789
Epoch 4/5
[1m3198/3198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2341s[0m 732ms/step - accuracy: 0.2975 - loss: 4.1499 - val_accuracy: 0.2664 - val_loss: 4.8762
Epoch 5/5
[1m3198/3198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2419s[0m 756ms/step - accuracy: 0.3330 - loss: 3.7350 - val_accuracy: 0.2738 - val_loss: 4.8572


In [32]:
from tensorflow.keras.models import Model
#Save the trained Seq2Seq model
model.save('seq2seq_lstm_model.h5')
print("✅ Saved seq2seq_lstm_model.h5")



✅ Saved seq2seq_lstm_model.h5


In [33]:
#Save the tokenizer
import pickle

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [34]:
#Save training history (loss & accuracy)
import pandas as pd

history_df = pd.DataFrame(history.history)
history_df.to_csv("training_history_DL.csv", index=False)


In [36]:
#Save model config info
config = {
    "MAX_ART_LEN": MAX_ART_LEN,
    "MAX_SUM_LEN": MAX_SUM_LEN,
    "VOCAB_SIZE": vocab_size,
    "EMB_DIM": 128,
    "LATENT_DIM": 256
}

with open("model_config_DL.json", "w") as f:
    import json
    json.dump(config, f)

In [56]:
#Seq2Seq summary generation function
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle

# ARTICLE TOKENIZER
article_tokenizer = Tokenizer(
    num_words=30000,
    oov_token="<unk>"
)
article_tokenizer.fit_on_texts(df["article"])


# SUMMARY TOKENIZER (IMPORTANT FIX)
summary_tokenizer = Tokenizer(
    num_words=10000,
    filters=""   # do NOT remove <sos> or <eos>
)

# Fit on BOTH summary_in and summary_out
summary_tokenizer.fit_on_texts(
    df["summary_in"].tolist() + df["summary_out"].tolist()
)

# Save tokenizers
with open("article_tokenizer.pkl", "wb") as f:
    pickle.dump(article_tokenizer, f)

with open("summary_tokenizer.pkl", "wb") as f:
    pickle.dump(summary_tokenizer, f)


In [57]:
print("<sos>" in summary_tokenizer.word_index)
print("<eos>" in summary_tokenizer.word_index)


True
True


In [63]:
#Encoder Inference Model
from tensorflow.keras.models import Model

# Encoder inference model
encoder_model = Model(
    inputs=model.input[0],                    # encoder_inputs
    outputs=model.get_layer("encoder_lstm").output
)


encoder_model.summary()

In [66]:
seq = article_tokenizer.texts_to_sequences([df["article"].iloc[0]])
seq = pad_sequences(seq, maxlen=MAX_ART_LEN, padding="post")

encoder_outputs, h, c = encoder_model.predict(seq)

print(encoder_outputs.shape)  # MUST be (1, 50, 256)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
(1, 500, 256)


In [67]:
#Decoder Inference Model
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

# Decoder inputs
decoder_inputs = Input(shape=(1,), name="decoder_input_token")
decoder_state_input_h = Input(shape=(256,), name="decoder_h")
decoder_state_input_c = Input(shape=(256,), name="decoder_c")

# ✅ FIXED HERE
encoder_outputs_input = Input(shape=(None, 256), name="encoder_outputs")

# Layers from trained model
decoder_embedding = model.get_layer("decoder_embedding")
decoder_lstm = model.get_layer("decoder_lstm")
attention_layer = model.get_layer("attention_layer")
concat_layer = model.get_layer("concat_layer")
output_layer = model.get_layer("output_layer")

# Forward pass
decoder_embedded = decoder_embedding(decoder_inputs)

decoder_outputs, h, c = decoder_lstm(
    decoder_embedded,
    initial_state=[decoder_state_input_h, decoder_state_input_c]
)

attention_output = attention_layer(
    [decoder_outputs, encoder_outputs_input]
)

decoder_concat = concat_layer([decoder_outputs, attention_output])
decoder_outputs = output_layer(decoder_concat)

decoder_model = Model(
    [
        decoder_inputs,
        encoder_outputs_input,
        decoder_state_input_h,
        decoder_state_input_c
    ],
    [decoder_outputs, h, c]
)



decoder_model.summary()





In [68]:
#Seq2Seq Summary Generation
def seq2seq_generate_summary(text):
    seq = article_tokenizer.texts_to_sequences([text])
    seq = pad_sequences(seq, maxlen=MAX_ART_LEN, padding="post")

    encoder_outputs, h, c = encoder_model.predict(seq, verbose=0)

    start_token = summary_tokenizer.word_index["<sos>"]
    end_token   = summary_tokenizer.word_index["<eos>"]

    target_seq = np.array([[start_token]])
    decoded_words = []

    for _ in range(MAX_SUM_LEN):
        output_tokens, h, c = decoder_model.predict(
            [target_seq, encoder_outputs, h, c],
            verbose=0
        )

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = summary_tokenizer.index_word.get(sampled_token_index, "")

        if sampled_word == "<eos>" or sampled_word == "":
            break

        decoded_words.append(sampled_word)
        target_seq = np.array([[sampled_token_index]])

    return " ".join(decoded_words)

In [69]:
#Test It
print("ARTICLE:")
print(df["article"].iloc[0][:300])

print("\nREFERENCE SUMMARY:")
print(df["summary"].iloc[0])

print("\nSEQ2SEQ SUMMARY:")
print(seq2seq_generate_summary(df["article"].iloc[0]))



ARTICLE:
only five internationals allowed, count em, five! so first off we should say, per our usual atlanta united lineup predictions, this will be wrong. why will it be wrong? well, aside from the obvious, we still don t have a ton of data points from frank de boer in how he prefers to rotate his team for 

REFERENCE SUMMARY:
predicting atlanta united s lineup against columbus crew in the u.s. open cup

SEQ2SEQ SUMMARY:
the still after


In [70]:
#ROUGE evaluation
#Generate Seq2Seq summaries for your test set
import numpy as np
from tqdm import tqdm

MAX_TEST = 100  # how many examples to evaluate (for speed; use all if you want)

seq2seq_summaries = []

for i in tqdm(range(min(MAX_TEST, len(df)))):
    article = df["article"].iloc[i]
    summary_pred = seq2seq_generate_summary(article)
    seq2seq_summaries.append(summary_pred)


100%|██████████| 100/100 [01:52<00:00,  1.13s/it]


In [71]:
#Prepare reference summaries
references = df["summary"].iloc[:MAX_TEST].tolist()


In [110]:
#ROUGE evaluation
from rouge_score import rouge_scorer, scoring

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
aggregator = scoring.BootstrapAggregator()

# Score each prediction
for ref, pred in zip(references, seq2seq_summaries):
    scores = scorer.score(ref, pred)
    aggregator.add_scores(scores)

# Get aggregate scores
result = aggregator.aggregate()

print("ROUGE-1   : {:.4f}".format(result['rouge1'].mid.fmeasure))
print("ROUGE-2   : {:.4f}".format(result['rouge2'].mid.fmeasure))
print("ROUGE-L   : {:.4f}".format(result['rougeL'].mid.fmeasure))


ROUGE-1   : 0.0276
ROUGE-2   : 0.0000
ROUGE-L   : 0.0264


In [111]:
# ---------------------------------------------------------
# SAVE RESULTS TO CSV
# ---------------------------------------------------------
OUTPUT_CSV = "rouge_eval_results.csv"  # change if needed

new_results = pd.DataFrame([
    {
        "Model": "Seq2Seq LSTM",
        "rouge1":result["rouge1"].mid.fmeasure,
        "rouge2": result["rouge2"].mid.fmeasure,
        "rougeL": result["rougeL"].mid.fmeasure,
        "rougeLsum": result["rougeL"].mid.fmeasure,
        "Average Score": np.mean([
            result["rouge1"].mid.fmeasure,
            result["rouge2"].mid.fmeasure,
            result["rougeL"].mid.fmeasure,
        ])
    } 
])

new_results = new_results.round(4)

# If CSV exists → append, else create new
try:
    old_df = pd.read_csv(OUTPUT_CSV)
    final_df = pd.concat([old_df, new_results], ignore_index=True)
except FileNotFoundError:
    final_df = new_results

final_df.to_csv(OUTPUT_CSV, index=False)

print("\nEvaluation Completed.\nSaved to:", OUTPUT_CSV)


Evaluation Completed.
Saved to: rouge_eval_results.csv
