In [1]:
import pandas as pd
import numpy as np
import calendar
import math
import re
import string

In [2]:
import segmentation
import utils
import data2graph
from finetuned import T5FineTuner, BARTFineTuner, generate, generate_beam, graph2text_nobeam, graph2text_nobeam_ngram_es, graph2text_nobeam_topk, graph2text_nobeam_topp

In [3]:
import textstat
import language_tool_python
from lexical_diversity import lex_div as ld

In [4]:
tool = language_tool_python.LanguageTool('en-US')

def grammar_score(input_text):
    errors = len(tool.check(input_text))
    clean_text = input_text.translate(str.maketrans('', '', string.punctuation))
    clean_text = list(filter(None, clean_text.split(' ')))
    num_words = len(clean_text)
    return float(1-(errors/num_words))

### Loading Fine-Tuned PLMs

In [5]:
import torch
#cuda0 = torch.device("cuda:0")
#cuda1 = torch.device("cuda:1")
cuda3 = torch.device("cuda:3")

t5 = T5FineTuner.load_from_checkpoint("T5Models/T5Both.ckpt")
bart = BARTFineTuner.load_from_checkpoint("BARTModels/BARTBoth.ckpt")

t5.to(cuda3)
bart.to(cuda3)

BARTFineTuner(
  (model): BartForConditionalGeneration(
    (model): BartModel(
      (shared): Embedding(50268, 1024)
      (encoder): BartEncoder(
        (embed_tokens): Embedding(50268, 1024)
        (embed_positions): LearnedPositionalEmbedding(1026, 1024, padding_idx=1)
        (layers): ModuleList(
          (0): EncoderLayer(
            (self_attn): Attention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)
            (final_layer_norm): LayerNorm((1024,), eps=1e

### Population

In [6]:
#Import Population Dataset
ds_pop = pd.read_csv("Data/Population/Pop.csv")
ds_pop = ds_pop.dropna()

In [11]:
#RE Scores
template_re_scores = []
t5_re_scores = []
t5_re_scores_topk = []
t5_re_scores_topp = []
bart_re_scores = []
bart_re_scores_topk = []
bart_re_scores_topp = []

#Diveristy Scores
template_tte_scores = []
t5_tte_scores = []
t5_tte_scores_topk = []
t5_tte_scores_topp = []
bart_tte_scores = []
bart_tte_scores_topk = []
bart_tte_scores_topp = []

#Grammar Scores
t5_g_scores = []
t5_g_scores_topk = []
t5_g_scores_topp = []
bart_g_scores = []
bart_g_scores_topk = []
bart_g_scores_topp = []

#Grammar Mistakes
t5_g_mistake = []
t5_g_mistake_topk = []
t5_g_mistake_topp = []
bart_g_mistake = []
bart_g_mistake_topk = []
bart_g_mistake_topp = []

countries = ['USA', 'IND', 'BRA', 'RUS', 'GBR', 'FRA', 'ESP', 'ITA' , 'TUR',  'DEU']

for c in countries:
    
    print("Processing Country: ", c)
    
    country = ds_pop[ds_pop['Code']==c][['Population by Country (Clio Infra (2016))','Year']].reset_index().drop(columns=['index'])
    country_pop_raw = country['Population by Country (Clio Infra (2016))'].tolist()

    #Log-normalize data
    trans = np.ma.log(country_pop_raw)
    country_pop = trans.filled(0)
    
    #Detecting Waves
    embeds, cluster_labels = segmentation.tslr_rep(country_pop)
    cluster_arrangement = utils.find_contiguous(cluster_labels)
    indices = utils.find_indices(cluster_arrangement)
    wave_indices = utils.find_waves(country_pop_raw, indices, tolerance=7)
    
    print("Waves Detected: ", c)

    #Detecting Trends
    if c != "RUS":
        segmentation_results = segmentation.swab(country_pop, 0.1, 3 ,3)
    else:
        segmentation_results = segmentation.sliding_window(country_pop, 1.5)
    filtered_results = segmentation.re_segment(segmentation_results, country_pop)
    trends = segmentation.find_trend(filtered_results, country_pop)
    
    print("Trends Detected: ", c)
    
    location = ds_pop[ds_pop['Code']==c]['Entity'].iloc[0]
    
    graph, essentials = data2graph.build_graph_pop_form1("Population data", location, wave_indices, trends, country, country_pop_raw )
    
    #Template Narrative
    template_text = data2graph.build_template_pop_nums("Population data", location, wave_indices, trends, country, country_pop_raw )
    
    t5_prefix = 'translate Graph to English: '
    
    iso = c
    
    #Simple PLM Generation
    t5_narrative = graph2text_nobeam(t5, graph, t5_prefix, 512, cuda3)
    bart_narrative = graph2text_nobeam(bart , graph, "", 512, cuda3)
    bart_narrative = re.sub('</s>' , '', bart_narrative)
    
    print("Simple Generation Complete: ", iso)
    
    #Top-k at 50
    t5_narrative_topk = graph2text_nobeam_topk(t5, graph, t5_prefix, 50, 512, cuda3)
    bart_narrative_topk = graph2text_nobeam_topk(bart, graph, "", 50, 512, cuda3)
    bart_narrative_topk = re.sub('</s>' , '', bart_narrative_topk)
    
    print("Top-k Complete: ", iso)
    
    #Top-p at 0.92
    t5_narrative_topp = graph2text_nobeam_topp(t5, graph, t5_prefix, 0.92, 512, cuda3)
    bart_narrative_topp = graph2text_nobeam_topp(bart, graph, "", 0.92, 512, cuda3)
    bart_narrative_topp = re.sub('</s>' , '', bart_narrative_topp)
    
    print("Top-p Complete: ", iso)
    
    #RE Scores
    template_re_scores.append(textstat.flesch_reading_ease(template_text))
    t5_re_scores.append(textstat.flesch_reading_ease(t5_narrative))
    t5_re_scores_topk.append(textstat.flesch_reading_ease(t5_narrative_topk))
    t5_re_scores_topp.append(textstat.flesch_reading_ease(t5_narrative_topp))
    bart_re_scores.append(textstat.flesch_reading_ease(bart_narrative))
    bart_re_scores_topk.append(textstat.flesch_reading_ease(bart_narrative_topk))
    bart_re_scores_topp.append(textstat.flesch_reading_ease(bart_narrative_topp))
    
    print("RE Scores Computed: ", iso)
    
    #Diveristy Scores
    template_tte_scores.append(ld.ttr(ld.flemmatize(template_text)))
    t5_tte_scores.append(ld.ttr(ld.flemmatize(t5_narrative)))
    t5_tte_scores_topk.append(ld.ttr(ld.flemmatize(t5_narrative_topk)))
    t5_tte_scores_topp.append(ld.ttr(ld.flemmatize(t5_narrative_topp)))
    bart_tte_scores.append(ld.ttr(ld.flemmatize(bart_narrative)))
    bart_tte_scores_topk.append(ld.ttr(ld.flemmatize(bart_narrative_topk)))
    bart_tte_scores_topp.append(ld.ttr(ld.flemmatize(bart_narrative_topp)))
    
    print("TTE Scores Computed: ", iso)
    
    #Grammar Scores
    gs = grammar_score(t5_narrative)
    t5_g_scores.append(gs)
    if gs != 1.0:
        t5_g_mistake.append((graph, t5_narrative))
    
    gs = grammar_score(t5_narrative_topk)
    t5_g_scores_topk.append(gs)
    if gs != 1.0:
        t5_g_mistake_topk.append((graph, t5_narrative_topk))
    
    gs = grammar_score(t5_narrative_topp)
    t5_g_scores_topp.append(gs)
    if gs != 1.0:
        t5_g_mistake_topp.append((graph, t5_narrative_topp))
    
    gs = grammar_score(bart_narrative)                          
    bart_g_scores.append(gs)
    if gs != 1.0:
        bart_g_mistake.append((graph, bart_narrative))
        
    gs = grammar_score(bart_narrative_topk)
    bart_g_scores_topk.append(gs)
    if gs != 1.0:
        bart_g_mistake_topk.append((graph, bart_narrative_topk))
    
    gs = grammar_score(bart_narrative_topp)
    bart_g_scores_topp.append(gs)
    if gs != 1.0:
        bart_g_mistake_topp.append((graph, bart_narrative_topp))
    
    print("Grammar Scores Computed: ", iso)

Processing Country:  USA
Waves Detected:  USA
Trends Detected:  USA


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)
  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Simple Generation Complete:  USA
Top-k Complete:  USA
Top-p Complete:  USA
RE Scores Computed:  USA
TTE Scores Computed:  USA
Grammar Scores Computed:  USA
Processing Country:  IND
Waves Detected:  IND
Trends Detected:  IND


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)
  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Simple Generation Complete:  IND
Top-k Complete:  IND
Top-p Complete:  IND
RE Scores Computed:  IND
TTE Scores Computed:  IND
Grammar Scores Computed:  IND
Processing Country:  BRA
Waves Detected:  BRA
Trends Detected:  BRA


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)
  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Simple Generation Complete:  BRA
Top-k Complete:  BRA
Top-p Complete:  BRA
RE Scores Computed:  BRA
TTE Scores Computed:  BRA
Grammar Scores Computed:  BRA
Processing Country:  RUS
Waves Detected:  RUS
Trends Detected:  RUS


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)


Simple Generation Complete:  RUS
Top-k Complete:  RUS
Top-p Complete:  RUS
RE Scores Computed:  RUS
TTE Scores Computed:  RUS
Grammar Scores Computed:  RUS
Processing Country:  GBR
Waves Detected:  GBR
Trends Detected:  GBR


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)
  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Simple Generation Complete:  GBR
Top-k Complete:  GBR
Top-p Complete:  GBR
RE Scores Computed:  GBR
TTE Scores Computed:  GBR
Grammar Scores Computed:  GBR
Processing Country:  FRA
Waves Detected:  FRA
Trends Detected:  FRA


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)
  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Simple Generation Complete:  FRA
Top-k Complete:  FRA
Top-p Complete:  FRA
RE Scores Computed:  FRA
TTE Scores Computed:  FRA
Grammar Scores Computed:  FRA
Processing Country:  ESP
Waves Detected:  ESP
Trends Detected:  ESP


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)
  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Simple Generation Complete:  ESP
Top-k Complete:  ESP
Top-p Complete:  ESP
RE Scores Computed:  ESP
TTE Scores Computed:  ESP
Grammar Scores Computed:  ESP
Processing Country:  ITA
Waves Detected:  ITA
Trends Detected:  ITA


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)
  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Simple Generation Complete:  ITA
Top-k Complete:  ITA
Top-p Complete:  ITA
RE Scores Computed:  ITA
TTE Scores Computed:  ITA
Grammar Scores Computed:  ITA
Processing Country:  TUR
Waves Detected:  TUR


IndexError: list index out of range

In [12]:
#RE Scores
print("*** RE Scores ***")
print("template_re_scores: ", np.mean(template_re_scores))
print("t5_re_scores: ", np.mean(t5_re_scores))
print("t5_re_scores_topk: ", np.mean(t5_re_scores_topk))
print("t5_re_scores_topp: ", np.mean(t5_re_scores_topp))
print("bart_re_scores: ", np.mean(bart_re_scores))
print("bart_re_scores_topk: ", np.mean(bart_re_scores_topk))
print("bart_re_scores_topp: ", np.mean(bart_re_scores_topp))

print("\n")
print("*** Diversity Scores ***")
#Diveristy Scores
print("template_tte_scores: ", np.mean(template_tte_scores))
print("t5_tte_scores: ", np.mean(t5_tte_scores))
print("t5_tte_scores_topk: ", np.mean(t5_tte_scores_topk))
print("t5_tte_scores_topp: ", np.mean(t5_tte_scores_topp))
print("bart_tte_scores: ", np.mean(bart_tte_scores))
print("bart_tte_scores_topk: ", np.mean(bart_tte_scores_topk))
print("bart_tte_scores_topp: ", np.mean(bart_tte_scores_topp))

print("\n")
print("*** Grammar Scores ***")
#Grammar Scores
print("t5_g_scores: ", np.mean(t5_g_scores))
print("t5_g_scores_topk: ", np.mean(t5_g_scores_topk))
print("t5_g_scores_topp: ", np.mean(t5_g_scores_topp))
print("bart_g_scores: ", np.mean(bart_g_scores))
print("bart_g_scores_topk: ", np.mean(bart_g_scores_topk))
print("bart_g_scores_topp: ", np.mean(bart_g_scores_topp))

*** RE Scores ***
template_re_scores:  66.28
t5_re_scores:  69.57624999999999
t5_re_scores_topk:  74.19375000000001
t5_re_scores_topp:  71.82624999999999
bart_re_scores:  75.04875
bart_re_scores_topk:  76.8175
bart_re_scores_topp:  76.58375000000001


*** Diversity Scores ***
template_tte_scores:  0.46777487189137357
t5_tte_scores:  0.49512116482339175
t5_tte_scores_topk:  0.5599216064094512
t5_tte_scores_topp:  0.5365553504968045
bart_tte_scores:  0.5486699256103651
bart_tte_scores_topk:  0.5572979177683555
bart_tte_scores_topp:  0.5698693064182194


*** Grammar Scores ***
t5_g_scores:  1.0
t5_g_scores_topk:  1.0
t5_g_scores_topp:  1.0
bart_g_scores:  0.998046875
bart_g_scores_topk:  0.9865077741407529
bart_g_scores_topp:  0.9955021902377972
