In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import random
from sklearn.model_selection import train_test_split
import torch

In [2]:
df = pd.read_csv("test.csv")
df.head()

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [3]:
df.shape

(11490, 3)

In [4]:
df.drop("id", inplace=True, axis=1)

In [5]:
def segment_production_runs(data, sample_size, n=3):
    production_data = dict()
    for i in range(n):
        production_data[i+1] = data.sample(sample_size, random_state=42)
        remove_data = production_data[i+1].index.tolist()
        data = data.drop(remove_data,axis=0)
    return data, production_data

In [6]:
baseline, production_data = segment_production_runs(df, 50)

In [7]:
baseline.shape

(11340, 2)

In [8]:
baseline = baseline.iloc[:100]

In [9]:
baseline.shape

(100, 2)

In [10]:
def text_cleaning(text):

    text = text.lower()
    contractions = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have", "you'd": "you would",
                "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}
    text = ' '.join([contractions[word] if word in contractions else word for word in text.split()])
    text = re.sub(r'[^0-9a-zA-Z\s]+','',text)
    return text

In [11]:
baseline['preprocessed_article'] = df['article'].apply(text_cleaning)

In [12]:
baseline['preprocessed_summary'] = df['highlights'].apply(text_cleaning)

In [13]:
df.head()

Unnamed: 0,article,highlights
0,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [14]:
baseline.head()

Unnamed: 0,article,highlights,preprocessed_article,preprocessed_summary
0,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...,ever noticed how plane seats appear to be gett...,experts question if packed out planes are putt...
1,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...,a drunk teenage boy had to be rescued by secur...,drunk teenage boy climbed into lion enclosure ...
2,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...,dougie freedman is on the verge of agreeing a ...,nottingham forest are close to extending dougi...
3,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...,liverpool target neto is also wanted by psg an...,fiorentina goalkeeper neto has been linked wit...
4,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6...",bruce jenner will break his silence in a twoho...,tellall interview with the reality tv star 69 ...


In [15]:
max([len(i.split()) for i in baseline['preprocessed_article']]), max([len(i.split()) for i in baseline['preprocessed_summary']])

(1525, 114)

In [16]:
np.mean([len(i.split()) for i in baseline['preprocessed_article']]), np.mean([len(i.split()) for i in baseline['preprocessed_summary']])

(626.08, 48.39)

In [17]:
random_number = random.randint(0,len(df))
for i in df[['article','highlights']][random_number:random_number+1].itertuples():
    _,q,a = i
    print('article: ',q)
    print('highlights: ',a)
    print()

article:  (CNN)It's obvious that Tom Brady's love for his wife, model Gisele Bundchen, will never go out of fashion. Bundchen walked the runway for the last time Wednesday, and the New England Patriots quarterback wasn't just there to support her in person, he expressed his emotions to the world on Facebook. "Congratulations Love of my Life," Brady wrote. "You inspire me every day to be a better person. I am so proud of you and everything you have accomplished on the runway. I have never met someone with more of a will to succeed and determination to overcome any obstacle in the way. You never cease to amaze me. Nobody loves life more than you and your beauty runs much deeper than what the eye can see. I can't wait to see what's next. I love you." He followed the text with two hashtags, #GOAT ("greatest of all time") and #thebestisyettocome. Bundchen, 34, announced her retirement from the catwalk last weekend. "I am grateful that at 14, I was given the opportunity to start this journey

In [18]:
# baseline['preprocessed_article'] = baseline['preprocessed_article'] + ' <E>'
# baseline['preprocessed_summary'] = '<S> ' + baseline['preprocessed_summary'] + ' <E>'
# baseline.head()

Unnamed: 0,article,highlights,preprocessed_article,preprocessed_summary
0,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...,ever noticed how plane seats appear to be gett...,<S> experts question if packed out planes are ...
1,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...,a drunk teenage boy had to be rescued by secur...,<S> drunk teenage boy climbed into lion enclos...
2,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...,dougie freedman is on the verge of agreeing a ...,<S> nottingham forest are close to extending d...
3,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...,liverpool target neto is also wanted by psg an...,<S> fiorentina goalkeeper neto has been linked...
4,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6...",bruce jenner will break his silence in a twoho...,<S> tellall interview with the reality tv star...


In [41]:
X = baseline['article']
y = baseline['highlights']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
pip install simplet5

In [21]:
import tensorflow as tf

In [42]:
X_train.head()

30    Jack Nicklaus recorded his first ever hole-in-...
95    As Zlatan Ibrahimovic famously believes the Wo...
5     This is the moment that a crew of firefighters...
68    A high-level ministerial briefing exclusively ...
67    Lazio replaced their fierce city rivals Roma a...
Name: article, dtype: object

In [43]:
from simplet5 import SimpleT5
model = SimpleT5()

In [44]:
train_df = pd.concat([X_train,y_train],axis=1)

In [45]:
train_df.columns = ['source_text','target_text']
train_df.head()

Unnamed: 0,source_text,target_text
30,Jack Nicklaus recorded his first ever hole-in-...,Kevin Streelman beat Camilo Villegas on third ...
95,As Zlatan Ibrahimovic famously believes the Wo...,Zlatan Ibrahimovic will line up against former...
5,This is the moment that a crew of firefighters...,Giant pig fell into the swimming pool at his h...
68,A high-level ministerial briefing exclusively ...,Number and seriousness of Australians facing d...
67,Lazio replaced their fierce city rivals Roma a...,Lazio closed the gap on leaders Juventus with ...


In [46]:
test_df = pd.concat([X_test,y_test],axis=1)

In [47]:
test_df.columns = ['source_text','target_text']
test_df.head()

Unnamed: 0,source_text,target_text
85,West Ham are discussing a deal for Jamaican st...,West Ham are keen on concluding a deal for 17-...
55,"Danny Willett gave a rules official, who had b...",Englishman Danny Willett blasts timing referee...
72,While Robin van Persie's career appears to be ...,Robin van Persie has been linked with a move f...
47,Steven Finn believes he has rediscovered the f...,Steven Finn was left out of the England squad ...
46,Jeremy Clarkson and his fellow Top Gear presen...,James May reveals he celebrated prematurely by...


In [49]:
model.from_pretrained("t5","t5-base")
model.train(train_df=train_df,
            eval_df=test_df,
            source_max_token_len = 900,
            target_max_token_len = 100,
            batch_size = 2,
            max_epochs = 5,
            use_gpu = True,
            outputdir = "outputs_new",
            early_stopping_patience_epochs = 0,
            precision = 32
            )

INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.seed:Global seed set to 42


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [50]:
ls outputs_new

[0m[01;34msimplet5-epoch-0-train-loss-1.8409-val-loss-1.4926[0m/
[01;34msimplet5-epoch-1-train-loss-1.3333-val-loss-1.5028[0m/
[01;34msimplet5-epoch-2-train-loss-1.0262-val-loss-1.5438[0m/
[01;34msimplet5-epoch-3-train-loss-0.8131-val-loss-1.6049[0m/
[01;34msimplet5-epoch-4-train-loss-0.6763-val-loss-1.6723[0m/


In [51]:
model.load_model("simplet5-epoch-1-train-loss-1.3333-val-loss-1.5028/", use_gpu=True)

In [30]:
production_data[1].head()

Unnamed: 0,article,highlights
1516,Comedian Jenny Eclair travelled with her other...,The comedian stayed with Flavours who offer a ...
1393,A woman of Arab and Jewish descent who was str...,The federal government will give Shoshana Hebs...
10560,World No 1 Novak Djokovic has apologised to th...,Novak Djokovic beat Andy Murray 7-6 4-6 6-0 in...
11457,(CNN)ISIS on Wednesday released more than 200 ...,Most of those released were women and children...
647,Hillary Clinton’s security detail arrived at a...,"Second modified, armored van spotted near Des ..."


In [31]:
for i in production_data:
  production_data[i].to_csv(f"ground_truths{i}.csv")

In [32]:
baseline.to_csv("baseline.csv")

In [52]:
production_data[1]['article'].iloc[1]

"A woman of Arab and Jewish descent who was strip-searched at a Detroit-area airport has reached a settlement in a lawsuit filed on her behalf, the American Civil Liberties Union said Tuesday. The federal government will give Shoshana Hebshi $40,000 as compensation for being humiliated on the 10th anniversary of the 9/11 terrorist attacks after armed agents forced her from a plane at Detroit Metropolitan Airport, made her undress during a search and held her for hours. Frontier Airlines, the Transportation Security Administration and Wayne County Airport Authority were named in the federal lawsuit. Won her case: Shoshana Hebshi, of Sylvania, Ohio, who was strip-searched at a Detroit-area airport, will gain $40,000 as compensation for being humiliated . Yoga instructor: Hebshi works as a freelance writer, yoga instructor, and is a mother to two twins . Hebshi, who has a Jewish mother and Saudi Arabian father, has said she was ethnically profiled based on her dark complexion. 'I filed th

In [56]:
model.predict(production_data[1]['article'].iloc[0])

['Jenny and her other half went on a Painting In Venus break with Flavours. They stayed at the Villa Bianchi, a farmhouse in Treviso, near Venice. We were all middle-aged women to the core of the group. British Airways offers a seven-night full-board Painting In Venice holiday from £1,599pp.']

In [62]:
target = []
for i in range(3):
  target.append(model.predict(production_data[1]['article'].iloc[i])[0])

In [59]:
baseline.shape, production_data[1].shape

((100, 4), (50, 2))

In [63]:
target

['Jenny Eclair stayed with nine-strong crew of wannabe artists and keen cooks in Treviso, just outside Venice. The holiday was all about painting and drawing, but Jenny’s other half did the cooking for us. Ibiza is a popular destination for women who want to be creative while on holiday in Europe.',
 "Shoshana Hebshi, of Sylvania, Ohio, was removed from a plane at Detroit Metropolitan Airport on the 10th anniversary of 9/11. She was seated next to two Indian-American men who said they spent a lot of time in the plane's bathroom. She claims she was ethnically profiled based on her dark complexion. Frontier Airlines, the Transportation Security Administration and Wayne County Airport Authority were named in the lawsuit.",
 "Andy Murray beat Novak Djokovic 7-6 4-6 6-0 in the Miami Open final. Djokovic snatched a towel from a ball boy during the crossfire. The youngster was standing between Djokovic and his backroom team. Djokovic has since issued an apology via Facebook to the boy's paren

In [65]:
len(target), len(production_data[1])

(3, 50)

In [66]:
def get_preds(df):
  target = []

  for i in range(len(df)):
    target.append(model.predict(df['article'].iloc[i])[0])

  df['target'] = target
  return df

In [77]:
prod_run3 = get_preds(production_data[3])

In [78]:
prod_run3.head()

Unnamed: 0,article,highlights,target
3786,Wildlife officials in Alaska are going to kill...,State's Department of Fish and Game ruled that...,"Wildlife officials in Anchorage, Alaska, are g..."
1612,Gemma Redhead was left terrified after her for...,Philip Kirby was jailed for eight years in 201...,"Philip Kirby, 32, attacked Gemma Redhead at kn..."
1472,Three men accused of being agents of the Syria...,Men were executed on a dusty road in Deir ez-Z...,Islamic State has claimed three men are workin...
876,A pound will buy you one and a half tins of be...,Ciudad Real airport was built in 2009 at a cos...,Ciudad Real was built at a cost of more than €...
8011,Liverpool striker Daniel Sturridge believes Ra...,Liverpool midfielder Raheem Sterling is stalli...,Daniel Sturridge believes Raheem Sterling will...


In [79]:
prod_run3.drop("highlights",axis=1, inplace=True)

In [80]:
prod_run3.head()

Unnamed: 0,article,target
3786,Wildlife officials in Alaska are going to kill...,"Wildlife officials in Anchorage, Alaska, are g..."
1612,Gemma Redhead was left terrified after her for...,"Philip Kirby, 32, attacked Gemma Redhead at kn..."
1472,Three men accused of being agents of the Syria...,Islamic State has claimed three men are workin...
876,A pound will buy you one and a half tins of be...,Ciudad Real was built at a cost of more than €...
8011,Liverpool striker Daniel Sturridge believes Ra...,Daniel Sturridge believes Raheem Sterling will...


In [81]:
prod_run3.to_csv("production_run3.csv")

In [82]:
pip install shap

Collecting shap
  Downloading shap-0.44.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (535 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m535.7/535.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.7 (from shap)
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.44.1 slicer-0.0.7


In [84]:
import shap

In [136]:
def predict(text):
  inputs = model.tokenizer(text.tolist(), return_tensors="pt", padding=True).to("cuda")
  return np.array(model.predict(inputs))

In [98]:
predict("Hey there")

'Hey there, Hey there! Hey there!'

In [103]:
explainer = shap.Explainer(predict, model.tokenizer)

In [106]:
shap_values = explainer(baseline['article'].iloc[0][0])

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [90]:
baseline['article'].iloc[0]

"Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee.\xa0'It is time that the DOT and FAA take a stand for humane treatment of passengers.' But could crowding on planes lead to more serious issues than fighting for sp

In [96]:
baseline['article']

0      Ever noticed how plane seats appear to be gett...
1      A drunk teenage boy had to be rescued by secur...
2      Dougie Freedman is on the verge of agreeing a ...
3      Liverpool target Neto is also wanted by PSG an...
4      Bruce Jenner will break his silence in a two-h...
                             ...                        
97     A paramedic who pretended he was gay to get cl...
98     Paris Saint-Germain face Nice on Saturday, hop...
99     (CNN)You know the phrase "dodging a bullet"? F...
100    A mother-of-two lost more than ten stone after...
102    The records just keep tumbling for Cristiano R...
Name: article, Length: 100, dtype: object

In [105]:
model.predict(baseline['article'].iloc[0])

["With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that shrinking space on planes is not only uncomfortable - it's putting our health and safety in danger."]

In [124]:
def f(x):
    inputs = model.tokenizer(x.tolist(), return_tensors="pt", padding=True).to("cuda")
    with torch.no_grad():
        out = model.predict(inputs)
    # sentence = [model.tokenizer.decode(g, skip_special_tokens=True) for g in out]
    return np.array(out)

In [137]:
teacher_forcing_model = shap.models.TeacherForcing(
    predict, similarity_model=model, similarity_tokenizer=model.tokenizer, device=model.device
)
# create a Text masker
masker = shap.maskers.Text(model.tokenizer, mask_token="...", collapse_mask_token=True)

In [138]:
explainer_model_agnostic = shap.Explainer(teacher_forcing_model, masker)

In [139]:
shap_values_model_agnostic = explainer_model_agnostic(baseline['article'].iloc[0])

AttributeError: 'SimpleT5' object has no attribute 'config'

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
baseline = pd.read_csv("baseline.csv")

In [3]:
baseline.head()

Unnamed: 0.1,Unnamed: 0,article,highlights,preprocessed_article,preprocessed_summary
0,0,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...,ever noticed how plane seats appear to be gett...,<S> experts question if packed out planes are ...
1,1,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...,a drunk teenage boy had to be rescued by secur...,<S> drunk teenage boy climbed into lion enclos...
2,2,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...,dougie freedman is on the verge of agreeing a ...,<S> nottingham forest are close to extending d...
3,3,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...,liverpool target neto is also wanted by psg an...,<S> fiorentina goalkeeper neto has been linked...
4,4,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6...",bruce jenner will break his silence in a twoho...,<S> tellall interview with the reality tv star...


In [5]:
baseline.drop(['Unnamed: 0', "preprocessed_article", 'preprocessed_summary'], axis=1, inplace=True)

In [6]:
baseline.head()

Unnamed: 0,article,highlights
0,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [7]:
gt = pd.read_csv("ground_truths1.csv")
gt.head()

Unnamed: 0.1,Unnamed: 0,article,highlights
0,1516,Comedian Jenny Eclair travelled with her other...,The comedian stayed with Flavours who offer a ...
1,1393,A woman of Arab and Jewish descent who was str...,The federal government will give Shoshana Hebs...
2,10560,World No 1 Novak Djokovic has apologised to th...,Novak Djokovic beat Andy Murray 7-6 4-6 6-0 in...
3,11457,(CNN)ISIS on Wednesday released more than 200 ...,Most of those released were women and children...
4,647,Hillary Clinton’s security detail arrived at a...,"Second modified, armored van spotted near Des ..."


In [8]:
gt.drop("Unnamed: 0", axis=1, inplace=True)

In [9]:
prod = pd.read_csv("production_run1.csv")
prod.drop("Unnamed: 0", axis=1, inplace=True)
prod.head()

Unnamed: 0,article,target
0,Comedian Jenny Eclair travelled with her other...,Comedian Jenny Eclair travelled with her other...
1,A woman of Arab and Jewish descent who was str...,"Shoshana Hebshi, of Sylvania, Ohio, was forced..."
2,World No 1 Novak Djokovic has apologised to th...,Novak Djokovic lost his cool during the Miami ...
3,(CNN)ISIS on Wednesday released more than 200 ...,"ISIS released more than 200 Yazidis, a minorit..."
4,Hillary Clinton’s security detail arrived at a...,Two Scooby vans arrived at Capitol Fruit Compa...


In [None]:
def metrics_for_seq2seq():
    pass

In [10]:
import nltk
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.rouge import Rouge

ModuleNotFoundError: No module named 'nltk.translate.rouge'

In [11]:
pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py): started
  Building wheel for rouge-score (setup.py): finished with status 'done'
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24956 sha256=c31db142681e9fcc1831f60f695494568bcd3956d2bfa8bab5a2b3373ed8d269
  Stored in directory: c:\users\akshat mittu\appdata\local\pip\cache\wheels\24\55\6f\ebfc4cb176d1c9665da4e306e1705496206d08215c1acd9dde
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [15]:
from rouge_score import rouge_scorer

In [16]:
pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [17]:
from rouge import Rouge

In [22]:
rouge = Rouge()

scores = rouge.get_scores(gt['highlights'], prod['target'], avg=True)

In [23]:
scores

{'rouge-1': {'r': 0.38252274360935634,
  'p': 0.3713916596844363,
  'f': 0.3625988351191481},
 'rouge-2': {'r': 0.158827124643509,
  'p': 0.14991241474983932,
  'f': 0.14610687756063304},
 'rouge-l': {'r': 0.35804254120081536,
  'p': 0.34623114775340147,
  'f': 0.33850226043512144}}

In [29]:
bleu_score = sentence_bleu(gt['highlights'].iloc[0].split(), prod['target'].iloc[0].split())

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [27]:
gt['highlights'].iloc[0].split()

['The',
 'comedian',
 'stayed',
 'with',
 'Flavours',
 'who',
 'offer',
 'a',
 'Painting',
 'In',
 'Venice',
 'break',
 '.',
 'Jenny',
 'and',
 'her',
 'partner',
 'Geof',
 'stayed',
 'at',
 'the',
 'farmhouse',
 'Villa',
 'Bianchi',
 '.',
 'Days',
 'involved',
 'sitting',
 'in',
 'medieval',
 'market',
 'towns',
 'with',
 'a',
 'brush',
 'and',
 'prosecco',
 '.']

In [28]:
prod['target'].iloc[0].split()

['Comedian',
 'Jenny',
 'Eclair',
 'travelled',
 'with',
 'her',
 'other',
 'half',
 'on',
 'a',
 'Painting',
 'In',
 'Venus',
 'break.',
 'We',
 'stayed',
 'in',
 'a',
 'nine-strong',
 'crew',
 'of',
 'wannabe',
 'artists',
 'and',
 'keen',
 'cooks.',
 'The',
 'holidays',
 'included',
 'wine',
 'with',
 'meals,',
 'five',
 'days',
 'of',
 'tuition',
 'and',
 '£100',
 'towards',
 'flight',
 'costs.']

In [30]:
bleu_score

7.199666163340923e-232

In [None]:
references = [['this', 'is', 'a', 'test'], ['another', 'test']]
candidate = ['this', 'is', 'a', 'test']

In [31]:
bleu_score = corpus_bleu(gt['highlights'], prod['target'])

In [32]:
bleu_score

1.0530448927191171e-231

In [33]:
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
sim = cosine_similarity(X=gt['highlights'].iloc[0], Y=prod['target'].iloc[0])

ValueError: could not convert string to float: 'The comedian stayed with Flavours who offer a Painting In Venice break .\nJenny and her partner Geof stayed at the farmhouse Villa Bianchi .\nDays involved sitting in medieval market towns with a brush and prosecco .'

In [39]:
import tensorflow_hub as hub

TypeError: Descriptors cannot be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates

In [37]:
def create_and_save_embeddings(text_df, text_col):
    
    module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
    use = hub.load(module_url)
    
    text_df['embeds'] = text_df[text_col].apply(preprocess_for_embedding) 
    
    embed_df = dict()
    for i in range(len(text_df)):
        embed_df[i] = creat_embed(text_df.iloc[i]['embeds'], use).tolist()
    
    df = pd.DataFrame(embed_df).T
    
    return df

In [38]:
gt_embed = create_and_save_embeddings(gt, 'highlights')

NameError: name 'hub' is not defined