In [1]:
import pandas as pd
import numpy as np
import calendar
import math
import re
import string

In [2]:
import segmentation
import utils
import data2graph
from finetuned import T5FineTuner, BARTFineTuner, generate, generate_beam, graph2text_nobeam, graph2text_nobeam_ngram_es, graph2text_nobeam_topk, graph2text_nobeam_topp

In [3]:
import textstat
import language_tool_python
from lexical_diversity import lex_div as ld

In [4]:
tool = language_tool_python.LanguageTool('en-US')

def grammar_score(input_text):
    errors = len(tool.check(input_text))
    clean_text = input_text.translate(str.maketrans('', '', string.punctuation))
    clean_text = list(filter(None, clean_text.split(' ')))
    num_words = len(clean_text)
    return float(1-(errors/num_words))

### Loading Fine-Tuned PLMs

In [5]:
import torch
cuda0 = torch.device("cuda:0")
#cuda1 = torch.device("cuda:1")
#cuda3 = torch.device("cuda:3")

t5 = T5FineTuner.load_from_checkpoint("T5Models/T5Both.ckpt")
bart = BARTFineTuner.load_from_checkpoint("BARTModels/BARTBoth.ckpt")

t5.to(cuda0)
bart.to(cuda0)

BARTFineTuner(
  (model): BartForConditionalGeneration(
    (model): BartModel(
      (shared): Embedding(50268, 1024)
      (encoder): BartEncoder(
        (embed_tokens): Embedding(50268, 1024)
        (embed_positions): LearnedPositionalEmbedding(1026, 1024, padding_idx=1)
        (layers): ModuleList(
          (0): EncoderLayer(
            (self_attn): Attention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)
            (final_layer_norm): LayerNorm((1024,), eps=1e

### Global Temperature

In [6]:
#Import Land Temp Dataset
ds_gtemp = pd.read_csv("Data/GlobalTemperature/GlobalLandTemperaturesByCountry.csv")
ds_gtemp = ds_gtemp.dropna()
ds_gtemp['dt'] = pd.to_datetime(ds_gtemp['dt'])
ds_gtemp['month'] = pd.DatetimeIndex(ds_gtemp['dt']).month
ds_gtemp['month'] = ds_gtemp['month'].apply(lambda x: calendar.month_name[x])
ds_gtemp['year'] = pd.DatetimeIndex(ds_gtemp['dt']).year
ds_gtemp.set_index(['dt'],inplace=True)

In [8]:
#RE Scores
template_re_scores = []
t5_re_scores = []
t5_re_scores_topk = []
t5_re_scores_topp = []
bart_re_scores = []
bart_re_scores_topk = []
bart_re_scores_topp = []

#Diveristy Scores
template_tte_scores = []
t5_tte_scores = []
t5_tte_scores_topk = []
t5_tte_scores_topp = []
bart_tte_scores = []
bart_tte_scores_topk = []
bart_tte_scores_topp = []

#Grammar Scores
t5_g_scores = []
t5_g_scores_topk = []
t5_g_scores_topp = []
bart_g_scores = []
bart_g_scores_topk = []
bart_g_scores_topp = []

#Grammar Mistakes
t5_g_mistake = []
t5_g_mistake_topk = []
t5_g_mistake_topp = []
bart_g_mistake = []
bart_g_mistake_topk = []
bart_g_mistake_topp = []

countries = ['United States', 'India', 'Brazil', 'Russia', 'United Kingdom', 'France', 'Spain', 'Italy' , 'Turkey', 'Germany']

for c in countries:
    
    print("Processing Country: ", c)
    
    country = ds_gtemp[ds_gtemp['Country']==c][['AverageTemperature','month', 'year']].reset_index().drop(columns=['dt'])
    country_gtemp_raw = country['AverageTemperature'].tolist()
    
    #Log-normalize data
    trans = np.ma.log(country_gtemp_raw)
    country_gtemp = trans.filled(0)
    
    print("\n Data Loaded")
    
    #Detecting Waves
    embeds, cluster_labels = segmentation.tslr_rep(country_gtemp)
    cluster_arrangement = utils.find_contiguous(cluster_labels)
    indices = utils.find_indices(cluster_arrangement)
    wave_indices = utils.find_waves(country_gtemp_raw, indices, tolerance=7)
    
    print("\n Waves Detected")

    #Detecting Trends
    segmentation_results = segmentation.sliding_window(country_gtemp, 7)
    print("\n Segmentation Done")
    filtered_results = segmentation.re_segment(segmentation_results, country_gtemp)
    trends = segmentation.find_trend(filtered_results, country_gtemp)
    
    print("\n Trends Detected")
    
    location = c
    
    graph, essentials = data2graph.build_graph_gtemp_form1("Global Temperature", location, wave_indices, trends, country, country_gtemp_raw )
    
    print("\n Graph Calculated")
    
    #Template Narrative
    template_text = data2graph.build_template_gtemp_nums("Global Temperature", location, wave_indices, trends, country, country_gtemp_raw )
    
    print("\n Templated Computed")
    
    t5_prefix = 'translate Graph to English: '
    
    iso = c
    
    #Simple PLM Generation
    t5_narrative = graph2text_nobeam(t5, graph, t5_prefix, 512, cuda0)
    bart_narrative = graph2text_nobeam(bart , graph, "", 512, cuda0)
    bart_narrative = re.sub('</s>' , '', bart_narrative)
    
    print("Simple Generation Complete: ", iso)
    
    #Top-k at 50
    t5_narrative_topk = graph2text_nobeam_topk(t5, graph, t5_prefix, 50, 512, cuda0)
    bart_narrative_topk = graph2text_nobeam_topk(bart, graph, "", 50, 512, cuda0)
    bart_narrative_topk = re.sub('</s>' , '', bart_narrative_topk)
    
    print("Top-k Complete: ", iso)
    
    #Top-p at 0.92
    t5_narrative_topp = graph2text_nobeam_topp(t5, graph, t5_prefix, 0.92, 512, cuda0)
    bart_narrative_topp = graph2text_nobeam_topp(bart, graph, "", 0.92, 512, cuda0)
    bart_narrative_topp = re.sub('</s>' , '', bart_narrative_topp)
    
    print("Top-p Complete: ", iso)
    
    #RE Scores
    template_re_scores.append(textstat.flesch_reading_ease(template_text))
    t5_re_scores.append(textstat.flesch_reading_ease(t5_narrative))
    t5_re_scores_topk.append(textstat.flesch_reading_ease(t5_narrative_topk))
    t5_re_scores_topp.append(textstat.flesch_reading_ease(t5_narrative_topp))
    bart_re_scores.append(textstat.flesch_reading_ease(bart_narrative))
    bart_re_scores_topk.append(textstat.flesch_reading_ease(bart_narrative_topk))
    bart_re_scores_topp.append(textstat.flesch_reading_ease(bart_narrative_topp))
    
    print("RE Scores Computed: ", iso)
    
    #Diveristy Scores
    template_tte_scores.append(ld.ttr(ld.flemmatize(template_text)))
    t5_tte_scores.append(ld.ttr(ld.flemmatize(t5_narrative)))
    t5_tte_scores_topk.append(ld.ttr(ld.flemmatize(t5_narrative_topk)))
    t5_tte_scores_topp.append(ld.ttr(ld.flemmatize(t5_narrative_topp)))
    bart_tte_scores.append(ld.ttr(ld.flemmatize(bart_narrative)))
    bart_tte_scores_topk.append(ld.ttr(ld.flemmatize(bart_narrative_topk)))
    bart_tte_scores_topp.append(ld.ttr(ld.flemmatize(bart_narrative_topp)))
    
    print("TTE Scores Computed: ", iso)
    
    #Grammar Scores
    gs = grammar_score(t5_narrative)
    t5_g_scores.append(gs)
    if gs != 1.0:
        t5_g_mistake.append((graph, t5_narrative))
    
    gs = grammar_score(t5_narrative_topk)
    t5_g_scores_topk.append(gs)
    if gs != 1.0:
        t5_g_mistake_topk.append((graph, t5_narrative_topk))
    
    gs = grammar_score(t5_narrative_topp)
    t5_g_scores_topp.append(gs)
    if gs != 1.0:
        t5_g_mistake_topp.append((graph, t5_narrative_topp))
    
    gs = grammar_score(bart_narrative)                          
    bart_g_scores.append(gs)
    if gs != 1.0:
        bart_g_mistake.append((graph, bart_narrative))
        
    gs = grammar_score(bart_narrative_topk)
    bart_g_scores_topk.append(gs)
    if gs != 1.0:
        bart_g_mistake_topk.append((graph, bart_narrative_topk))
    
    gs = grammar_score(bart_narrative_topp)
    bart_g_scores_topp.append(gs)
    if gs != 1.0:
        bart_g_mistake_topp.append((graph, bart_narrative_topp))
    
    print("Grammar Scores Computed: ", iso)

Processing Country:  United States

 Data Loaded

 Waves Detected


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)



 Segmentation Done

 Trends Detected

 Graph Calculated

 Templated Computed
Simple Generation Complete:  United States
Top-k Complete:  United States
Top-p Complete:  United States
RE Scores Computed:  United States
TTE Scores Computed:  United States
Grammar Scores Computed:  United States
Processing Country:  India

 Data Loaded

 Waves Detected


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)



 Segmentation Done

 Trends Detected

 Graph Calculated

 Templated Computed
Simple Generation Complete:  India
Top-k Complete:  India
Top-p Complete:  India
RE Scores Computed:  India
TTE Scores Computed:  India
Grammar Scores Computed:  India
Processing Country:  Brazil

 Data Loaded

 Waves Detected


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)



 Segmentation Done

 Trends Detected

 Graph Calculated

 Templated Computed
Simple Generation Complete:  Brazil
Top-k Complete:  Brazil
Top-p Complete:  Brazil
RE Scores Computed:  Brazil
TTE Scores Computed:  Brazil
Grammar Scores Computed:  Brazil
Processing Country:  Russia

 Data Loaded

 Waves Detected


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)



 Segmentation Done

 Trends Detected

 Graph Calculated

 Templated Computed
Simple Generation Complete:  Russia
Top-k Complete:  Russia
Top-p Complete:  Russia
RE Scores Computed:  Russia
TTE Scores Computed:  Russia
Grammar Scores Computed:  Russia
Processing Country:  United Kingdom

 Data Loaded

 Waves Detected


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)



 Segmentation Done

 Trends Detected

 Graph Calculated

 Templated Computed
Simple Generation Complete:  United Kingdom
Top-k Complete:  United Kingdom
Top-p Complete:  United Kingdom
RE Scores Computed:  United Kingdom
TTE Scores Computed:  United Kingdom
Grammar Scores Computed:  United Kingdom
Processing Country:  France

 Data Loaded

 Waves Detected


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)



 Segmentation Done

 Trends Detected

 Graph Calculated

 Templated Computed
Simple Generation Complete:  France
Top-k Complete:  France
Top-p Complete:  France
RE Scores Computed:  France
TTE Scores Computed:  France
Grammar Scores Computed:  France
Processing Country:  Spain

 Data Loaded

 Waves Detected


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)



 Segmentation Done

 Trends Detected

 Graph Calculated

 Templated Computed
Simple Generation Complete:  Spain
Top-k Complete:  Spain
Top-p Complete:  Spain
RE Scores Computed:  Spain
TTE Scores Computed:  Spain
Grammar Scores Computed:  Spain
Processing Country:  Italy

 Data Loaded

 Waves Detected


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)



 Segmentation Done

 Trends Detected

 Graph Calculated

 Templated Computed
Simple Generation Complete:  Italy
Top-k Complete:  Italy
Top-p Complete:  Italy
RE Scores Computed:  Italy
TTE Scores Computed:  Italy
Grammar Scores Computed:  Italy
Processing Country:  Turkey

 Data Loaded

 Waves Detected


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)



 Segmentation Done

 Trends Detected

 Graph Calculated

 Templated Computed
Simple Generation Complete:  Turkey
Top-k Complete:  Turkey
Top-p Complete:  Turkey
RE Scores Computed:  Turkey
TTE Scores Computed:  Turkey
Grammar Scores Computed:  Turkey
Processing Country:  Germany

 Data Loaded

 Waves Detected


  (p,residuals,rank,s) = np.linalg.lstsq(A,y)



 Segmentation Done

 Trends Detected

 Graph Calculated

 Templated Computed
Simple Generation Complete:  Germany
Top-k Complete:  Germany
Top-p Complete:  Germany
RE Scores Computed:  Germany
TTE Scores Computed:  Germany
Grammar Scores Computed:  Germany


In [9]:
#RE Scores
print("*** RE Scores ***")
print("template_re_scores: ", np.mean(template_re_scores))
print("t5_re_scores: ", np.mean(t5_re_scores))
print("t5_re_scores_topk: ", np.mean(t5_re_scores_topk))
print("t5_re_scores_topp: ", np.mean(t5_re_scores_topp))
print("bart_re_scores: ", np.mean(bart_re_scores))
print("bart_re_scores_topk: ", np.mean(bart_re_scores_topk))
print("bart_re_scores_topp: ", np.mean(bart_re_scores_topp))

print("\n")
print("*** Diversity Scores ***")
#Diveristy Scores
print("template_tte_scores: ", np.mean(template_tte_scores))
print("t5_tte_scores: ", np.mean(t5_tte_scores))
print("t5_tte_scores_topk: ", np.mean(t5_tte_scores_topk))
print("t5_tte_scores_topp: ", np.mean(t5_tte_scores_topp))
print("bart_tte_scores: ", np.mean(bart_tte_scores))
print("bart_tte_scores_topk: ", np.mean(bart_tte_scores_topk))
print("bart_tte_scores_topp: ", np.mean(bart_tte_scores_topp))

print("\n")
print("*** Grammar Scores ***")
#Grammar Scores
print("t5_g_scores: ", np.mean(t5_g_scores))
print("t5_g_scores_topk: ", np.mean(t5_g_scores_topk))
print("t5_g_scores_topp: ", np.mean(t5_g_scores_topp))
print("bart_g_scores: ", np.mean(bart_g_scores))
print("bart_g_scores_topk: ", np.mean(bart_g_scores_topk))
print("bart_g_scores_topp: ", np.mean(bart_g_scores_topp))

*** RE Scores ***
template_re_scores:  -32.60000000000001
t5_re_scores:  67.479
t5_re_scores_topk:  66.064
t5_re_scores_topp:  66.98
bart_re_scores:  63.959
bart_re_scores_topk:  64.581
bart_re_scores_topp:  65.467


*** Diversity Scores ***
template_tte_scores:  0.37362918642999043
t5_tte_scores:  0.39800298942676315
t5_tte_scores_topk:  0.4648429017172216
t5_tte_scores_topp:  0.4459926517363096
bart_tte_scores:  0.4230103896435353
bart_tte_scores_topk:  0.40254574550059957
bart_tte_scores_topp:  0.4141072704373586


*** Grammar Scores ***
t5_g_scores:  0.9160556938702124
t5_g_scores_topk:  0.962284671126657
t5_g_scores_topp:  0.9597113481616681
bart_g_scores:  0.9200155183341259
bart_g_scores_topk:  0.9444883556410734
bart_g_scores_topp:  0.935676241115152
