In [1]:
import pandas as pd
import numpy as np
import calendar
import math
import re
import string

In [2]:
import segmentation
import utils
import data2graph
from finetuned import T5FineTuner, BARTFineTuner, generate, generate_beam, graph2text_nobeam, graph2text_nobeam_ngram_es, graph2text_nobeam_topk, graph2text_nobeam_topp

In [3]:
import textstat
import language_tool_python
from lexical_diversity import lex_div as ld

In [4]:
# Shared LanguageTool checker for US English; created once here and used by
# grammar_score below.
GRAMMAR_LANGUAGE = 'en-US'
tool = language_tool_python.LanguageTool(GRAMMAR_LANGUAGE)

def grammar_score(input_text, checker=None):
    """Return the share of words in ``input_text`` not flagged by a grammar checker.

    The score is ``1 - issues / words`` where ``issues`` is the number of
    items returned by the checker's ``check`` method and ``words`` is the
    number of whitespace-separated tokens left after ASCII punctuation
    (``string.punctuation``) has been stripped.

    Args:
        input_text: Text to score.
        checker: Optional object exposing ``check(text)`` that returns a sized
            collection of issues. Defaults to the module-level ``tool``.

    Returns:
        float: 1.0 when nothing is flagged. The value can fall below 0.0 when
        the checker reports more issues than there are words. Text that
        contains no words at all (empty or punctuation only) scores 0.0, so it
        is flagged by ``score != 1.0`` checks instead of raising
        ``ZeroDivisionError`` and aborting a long evaluation run.
    """
    # Split on any whitespace (not just ' ') so newlines and tabs between
    # words do not glue several words into one token.
    words = input_text.translate(str.maketrans('', '', string.punctuation)).split()
    if not words:
        return 0.0

    if checker is None:
        checker = tool
    errors = len(checker.check(input_text))
    return float(1 - (errors / len(words)))

### Loading Fine-Tuned PLMs

In [5]:
import torch

# Device shared by both models and by the generation calls in later cells.
# The name `cuda3` is kept because those cells reference it. GPU 3 is still
# preferred, but the notebook no longer fails in `.to(...)` on hosts with fewer
# than four GPUs: it falls back to the first GPU, then to the CPU.
if torch.cuda.device_count() > 3:
    cuda3 = torch.device("cuda:3")
elif torch.cuda.is_available():
    cuda3 = torch.device("cuda:0")
else:
    cuda3 = torch.device("cpu")

# Restore the fine-tuned T5 and BART wrappers from their checkpoints.
t5 = T5FineTuner.load_from_checkpoint("T5Models/T5Both.ckpt")
bart = BARTFineTuner.load_from_checkpoint("BARTModels/BARTBoth.ckpt")

# Move both models to the selected device; the last expression is displayed
# as the cell output.
t5.to(cuda3)
bart.to(cuda3)

BARTFineTuner(
  (model): BartForConditionalGeneration(
    (model): BartModel(
      (shared): Embedding(50268, 1024)
      (encoder): BartEncoder(
        (embed_tokens): Embedding(50268, 1024)
        (embed_positions): LearnedPositionalEmbedding(1026, 1024, padding_idx=1)
        (layers): ModuleList(
          (0): EncoderLayer(
            (self_attn): Attention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)
            (final_layer_norm): LayerNorm((1024,), eps=1e

### COVID19

In [6]:
# Load the COVID dataset, zero-fill missing cells, parse the dates, derive the
# month-name and year columns, and index the frame by date.
# Alternative for missing values: interpolate with
#   ds_covid = ds_covid.interpolate(method='nearest')
ds_covid = (
    pd.read_csv("Data/COVID/owid-covid-data.csv")
    .fillna(0)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(
        month=lambda frame: frame['date'].dt.month.apply(
            lambda month_number: calendar.month_name[month_number]
        ),
        year=lambda frame: frame['date'].dt.year,
    )
    .set_index(['date'])
)

In [7]:
countries = ['USA', 'IND', 'BRA', 'RUS', 'GBR', 'FRA', 'ESP', 'ITA' , 'TUR',  'DEU']

# Seed torch's RNG so the sampled narratives (top-k / top-p) can be reproduced
# on a re-run. `torch` is imported in the model-loading cell.
# NOTE(review): assumes the graph2text_* helpers sample through torch's global
# generator -- confirm in finetuned.py.
SEED = 42
torch.manual_seed(SEED)

# Task prefix passed to every T5 generation call; BART is called with "".
t5_prefix = 'translate Graph to English: '

# One result list per (metric, system, decoding strategy). Each list receives
# one entry per country. The names are kept as-is because the summary cell
# reads them directly.

#RE Scores (Flesch reading ease)
template_re_scores = []
t5_re_scores = []
t5_re_scores_topk = []
t5_re_scores_topp = []
bart_re_scores = []
bart_re_scores_topk = []
bart_re_scores_topp = []

#Diversity Scores (type-token ratio over lemmatized tokens)
template_tte_scores = []
t5_tte_scores = []
t5_tte_scores_topk = []
t5_tte_scores_topp = []
bart_tte_scores = []
bart_tte_scores_topk = []
bart_tte_scores_topp = []

#Grammar Scores
t5_g_scores = []
t5_g_scores_topk = []
t5_g_scores_topp = []
bart_g_scores = []
bart_g_scores_topk = []
bart_g_scores_topp = []

#Grammar Mistakes: (graph, narrative) pairs whose grammar score is not 1.0
t5_g_mistake = []
t5_g_mistake_topk = []
t5_g_mistake_topp = []
bart_g_mistake = []
bart_g_mistake_topk = []
bart_g_mistake_topp = []


def _strip_eos(text):
    """Remove every literal '</s>' marker from a generated narrative."""
    return re.sub('</s>' , '', text)


for iso in countries:

    print("Processing Country: ", iso)

    # Filter the country's rows once; the case series and the display name
    # are both taken from this slice.
    country_rows = ds_covid[ds_covid['iso_code'] == iso]
    country = country_rows[['new_cases', 'month', 'year']].reset_index(drop=True)
    country_cases_raw = country['new_cases'].tolist()

    # Read the name from the first row (previously the second), so a slice
    # with a single row no longer raises IndexError.
    # NOTE(review): presumably `location` is constant per iso_code -- confirm.
    location = country_rows.iloc[0]['location']

    #Log-normalize data: the masked log masks values outside log's domain
    #(e.g. zero counts) and those entries are then filled with 0.
    trans = np.ma.log(country_cases_raw)
    country_cases = trans.filled(0)

    #Detecting Waves (on the log series; find_waves gets the raw counts)
    embeds, cluster_labels = segmentation.tslr_rep(country_cases)
    cluster_arrangement = utils.find_contiguous(cluster_labels)
    indices = utils.find_indices(cluster_arrangement)
    wave_indices = utils.find_waves(country_cases_raw, indices, tolerance=7)

    print("Waves Detected: ", iso)

    #Detecting Trends
    segmentation_results = segmentation.swab(country_cases, 0.1, 3, 3)
    filtered_results = segmentation.re_segment(segmentation_results, country_cases)
    trends = segmentation.find_trend(filtered_results, country_cases)

    print("Trends Detected: ", iso)

    graph, essentials = data2graph.build_graph_covid_form1("Coronavirus cases", location, wave_indices, trends, country, country_cases_raw )

    #Template Narrative
    template_text = data2graph.build_template_covid_nums("Coronavirus cases", location, wave_indices, trends, country, country_cases_raw )

    # The generation calls below keep their original order (T5 then BART for
    # each strategy) so a seeded run consumes random numbers in a fixed order.

    #Simple PLM Generation
    t5_narrative = graph2text_nobeam(t5, graph, t5_prefix, 512, cuda3)
    bart_narrative = _strip_eos(graph2text_nobeam(bart , graph, "", 512, cuda3))

    print("Simple Generation Complete: ", iso)

    #Top-k at 50
    t5_narrative_topk = graph2text_nobeam_topk(t5, graph, t5_prefix, 50, 512, cuda3)
    bart_narrative_topk = _strip_eos(graph2text_nobeam_topk(bart, graph, "", 50, 512, cuda3))

    print("Top-k Complete: ", iso)

    #Top-p at 0.92
    t5_narrative_topp = graph2text_nobeam_topp(t5, graph, t5_prefix, 0.92, 512, cuda3)
    bart_narrative_topp = _strip_eos(graph2text_nobeam_topp(bart, graph, "", 0.92, 512, cuda3))

    print("Top-p Complete: ", iso)

    # Each narrative paired with the readability and diversity lists it feeds.
    scored_texts = (
        (template_text, template_re_scores, template_tte_scores),
        (t5_narrative, t5_re_scores, t5_tte_scores),
        (t5_narrative_topk, t5_re_scores_topk, t5_tte_scores_topk),
        (t5_narrative_topp, t5_re_scores_topp, t5_tte_scores_topp),
        (bart_narrative, bart_re_scores, bart_tte_scores),
        (bart_narrative_topk, bart_re_scores_topk, bart_tte_scores_topk),
        (bart_narrative_topp, bart_re_scores_topp, bart_tte_scores_topp),
    )

    #RE Scores
    for text, re_scores, tte_scores in scored_texts:
        re_scores.append(textstat.flesch_reading_ease(text))

    print("RE Scores Computed: ", iso)

    #Diversity Scores
    for text, re_scores, tte_scores in scored_texts:
        tte_scores.append(ld.ttr(ld.flemmatize(text)))

    print("TTE Scores Computed: ", iso)

    #Grammar Scores: only the model outputs are checked, not the template.
    grammar_targets = (
        (t5_narrative, t5_g_scores, t5_g_mistake),
        (t5_narrative_topk, t5_g_scores_topk, t5_g_mistake_topk),
        (t5_narrative_topp, t5_g_scores_topp, t5_g_mistake_topp),
        (bart_narrative, bart_g_scores, bart_g_mistake),
        (bart_narrative_topk, bart_g_scores_topk, bart_g_mistake_topk),
        (bart_narrative_topp, bart_g_scores_topp, bart_g_mistake_topp),
    )
    for text, g_scores, g_mistakes in grammar_targets:
        gs = grammar_score(text)
        g_scores.append(gs)
        # Keep the input graph next to any imperfect narrative for inspection.
        if gs != 1.0:
            g_mistakes.append((graph, text))

    print("Grammar Scores Computed: ", iso)

Processing Country:  USA
Waves Detected:  USA


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)


Trends Detected:  USA
Simple Generation Complete:  USA
Top-k Complete:  USA
Top-p Complete:  USA
RE Scores Computed:  USA
TTE Scores Computed:  USA
Grammar Scores Computed:  USA
Processing Country:  IND
Waves Detected:  IND


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)
  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Trends Detected:  IND
Simple Generation Complete:  IND
Top-k Complete:  IND
Top-p Complete:  IND
RE Scores Computed:  IND
TTE Scores Computed:  IND
Grammar Scores Computed:  IND
Processing Country:  BRA
Waves Detected:  BRA


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)
  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Trends Detected:  BRA
Simple Generation Complete:  BRA
Top-k Complete:  BRA
Top-p Complete:  BRA
RE Scores Computed:  BRA
TTE Scores Computed:  BRA
Grammar Scores Computed:  BRA
Processing Country:  RUS
Waves Detected:  RUS


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)
  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Trends Detected:  RUS
Simple Generation Complete:  RUS
Top-k Complete:  RUS
Top-p Complete:  RUS
RE Scores Computed:  RUS
TTE Scores Computed:  RUS
Grammar Scores Computed:  RUS
Processing Country:  GBR
Waves Detected:  GBR


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)
  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Trends Detected:  GBR
Simple Generation Complete:  GBR
Top-k Complete:  GBR
Top-p Complete:  GBR
RE Scores Computed:  GBR
TTE Scores Computed:  GBR
Grammar Scores Computed:  GBR
Processing Country:  FRA
Waves Detected:  FRA


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)
  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Trends Detected:  FRA
Simple Generation Complete:  FRA
Top-k Complete:  FRA
Top-p Complete:  FRA
RE Scores Computed:  FRA
TTE Scores Computed:  FRA
Grammar Scores Computed:  FRA
Processing Country:  ESP
Waves Detected:  ESP


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)
  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Trends Detected:  ESP
Simple Generation Complete:  ESP
Top-k Complete:  ESP
Top-p Complete:  ESP
RE Scores Computed:  ESP
TTE Scores Computed:  ESP
Grammar Scores Computed:  ESP
Processing Country:  ITA
Waves Detected:  ITA


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)
  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Trends Detected:  ITA
Simple Generation Complete:  ITA
Top-k Complete:  ITA
Top-p Complete:  ITA
RE Scores Computed:  ITA
TTE Scores Computed:  ITA
Grammar Scores Computed:  ITA
Processing Country:  TUR
Waves Detected:  TUR


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)
  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Trends Detected:  TUR
Simple Generation Complete:  TUR
Top-k Complete:  TUR
Top-p Complete:  TUR
RE Scores Computed:  TUR
TTE Scores Computed:  TUR
Grammar Scores Computed:  TUR
Processing Country:  DEU
Waves Detected:  DEU


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)
  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Trends Detected:  DEU
Simple Generation Complete:  DEU
Top-k Complete:  DEU
Top-p Complete:  DEU
RE Scores Computed:  DEU
TTE Scores Computed:  DEU
Grammar Scores Computed:  DEU


In [8]:
# Print the mean of every per-country score list, grouped by metric.
# Each section is a banner followed by (label, scores) rows.
report_sections = [
    # RE Scores
    ("*** RE Scores ***", [
        ("template_re_scores: ", template_re_scores),
        ("t5_re_scores: ", t5_re_scores),
        ("t5_re_scores_topk: ", t5_re_scores_topk),
        ("t5_re_scores_topp: ", t5_re_scores_topp),
        ("bart_re_scores: ", bart_re_scores),
        ("bart_re_scores_topk: ", bart_re_scores_topk),
        ("bart_re_scores_topp: ", bart_re_scores_topp),
    ]),
    # Diversity Scores
    ("*** Diversity Scores ***", [
        ("template_tte_scores: ", template_tte_scores),
        ("t5_tte_scores: ", t5_tte_scores),
        ("t5_tte_scores_topk: ", t5_tte_scores_topk),
        ("t5_tte_scores_topp: ", t5_tte_scores_topp),
        ("bart_tte_scores: ", bart_tte_scores),
        ("bart_tte_scores_topk: ", bart_tte_scores_topk),
        ("bart_tte_scores_topp: ", bart_tte_scores_topp),
    ]),
    # Grammar Scores
    ("*** Grammar Scores ***", [
        ("t5_g_scores: ", t5_g_scores),
        ("t5_g_scores_topk: ", t5_g_scores_topk),
        ("t5_g_scores_topp: ", t5_g_scores_topp),
        ("bart_g_scores: ", bart_g_scores),
        ("bart_g_scores_topk: ", bart_g_scores_topk),
        ("bart_g_scores_topp: ", bart_g_scores_topp),
    ]),
]

for section_number, (banner, rows) in enumerate(report_sections):
    # Sections after the first are separated by the same "\n" print as before.
    if section_number > 0:
        print("\n")
    print(banner)
    for label, scores in rows:
        print(label, np.mean(scores))

*** RE Scores ***
template_re_scores:  17.794000000000004
t5_re_scores:  64.486
t5_re_scores_topk:  65.678
t5_re_scores_topp:  68.024
bart_re_scores:  70.71000000000001
bart_re_scores_topk:  69.608
bart_re_scores_topp:  67.532


*** Diversity Scores ***
template_tte_scores:  0.2662341503911566
t5_tte_scores:  0.3109472873333881
t5_tte_scores_topk:  0.38714269563265336
t5_tte_scores_topp:  0.3699915339415927
bart_tte_scores:  0.4250835914750163
bart_tte_scores_topk:  0.4294212038876563
bart_tte_scores_topp:  0.4099059446725753


*** Grammar Scores ***
t5_g_scores:  0.99434795361903
t5_g_scores_topk:  0.9899279793236557
t5_g_scores_topp:  0.9928866408072878
bart_g_scores:  0.9388890914596416
bart_g_scores_topk:  0.9398873855847419
bart_g_scores_topp:  0.9430885059553639
