# Pre-load + others..

- Calculate WER per row for pairs
- Average


In [1]:
# drive access
from google.colab import drive
drive.mount('/content/drive')

# standard library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# for evaluation metrics
!pip install jiwer -q
!pip install bert_score -q
!pip install evaluate -q
from evaluate import load
from jiwer import wer


# models
import tensorflow as tf
from tensorflow import keras
from keras.layers import Conv2D, Input, MaxPool2D,Flatten, Dense, Permute, GlobalAveragePooling2D
from keras.models import Model
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences #padding for GED

!pip install transformers -q
from transformers import TFRobertaModel, RobertaTokenizer
from transformers import AutoModel, AutoTokenizer
from transformers import TFAutoModelForSequenceClassification

# for confusion sets
import spacy

# to check differences between sentences
import math
from difflib import SequenceMatcher

#phonetics...
nlp = spacy.load("en_core_web_sm") #english language model

!pip install nltk
from nltk.corpus import wordnet

import nltk
nltk.download('words')

from nltk.corpus import words
from nltk.corpus import wordnet

!pip install pyphonetics

#https://github.com/Lilykos/pyphonetics
#https://pypi.org/project/pyphonetics/
import pyphonetics as pyph
from pyphonetics import Metaphone
from pyphonetics import Soundex
soundex = Soundex()
metaphone = Metaphone()

!pip install Levenshtein
import Levenshtein
#len(words.words())

!pip install pyinflect
import pyinflect

MessageError: ignored

In [None]:
# for proper casing function
# for POS NER
import spacy
NER = spacy.load("en_core_web_sm")

# for regex
import re


In [None]:
!pip install evaluate -q
from evaluate import load

Functions

In [None]:
def proper_casing(sentence):
  """Normalise casing and end punctuation of a sentence.

  Steps, in order:
    1. lower-case the text and capitalize the first letter of every
       sentence (sentences are split after '.', '!' or '?');
    2. append '.' when the text does not end in '.', '!' or '?';
    3. upper-case the standalone word 'i';
    4. capitalize every word of the named entities found by the
       module-level spaCy pipeline `NER` (PERSON, NORP, FAC, ORG, GPE,
       LOC and PRODUCT labels only).

  Args:
    sentence: the text to clean (str).

  Returns:
    The cleaned sentence. Empty or whitespace-only input returns ''
    (it used to raise IndexError on `sentence[-1]`).
  """
  # capitalize the first letter of the sentence (str.capitalize lower-cases the rest)
  sentence = sentence.capitalize()
  # if there is .!? in the middle of the sentence, also capitalize the first letter that follows .!?
  # (the split consumes any whitespace after the mark; pieces are re-joined with one space)
  re_sentence = re.split(r'(?<=[.!?])\s*', sentence)
  sentence = ' '.join([s.capitalize() for s in re_sentence])
  sentence = sentence.strip()
  # nothing left to punctuate or tag: return before indexing sentence[-1]
  if not sentence:
    return sentence
  # add . at the end of the sentence if not there already
  if sentence[-1] not in ['.', '!', '?']:
    sentence += '.'
  # if there is an 'i' character that's not in a word, capitalize 'i'
  sentence = re.sub(r'\bi\b', 'I', sentence)
  # proper casing the named entities
  candidate_ner = NER(sentence)
  for ent in candidate_ner.ents:
    if ent.label_ in ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT']:
      # local name chosen so it does not shadow the `words` corpus imported from nltk
      entity_words = ent.text.split()
      capitalized_entity = ' '.join([word.capitalize() for word in entity_words])
      sentence = sentence.replace(ent.text, capitalized_entity)
  return sentence

# Import results from GEC models

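|Model|Grammatical threshold|Improvement threshold|k-beams|
|-----|---------------------|---------------------|-------|
|SIMPLE|0.5|0.25|3|
| |0.75|0.25| |
| |0.90|0.1| |
|FINETUNED|0.5|0.25|3|
| |0.75|0.25| |
| |0.90|0.1| |
|PHONETIC|0.5|0.25|20|
| |0.75|0.25| |
| |0.90|0.1| |
|RAW|0|0.0025|20|
| |0|0.05| |
| |0|0.15| |

Note: the RawGEC loader below tags its three files with improvement thresholds 0.10, 0.15 and 0.25 (matching the file names), so the RAW rows of this table should be checked against those values.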




## Load RawGEC model

In [None]:
# Load the three RawGEC result files (a, b, c) and stack them into one frame.
# Each entry: (csv file name, model name, thres1 = grammatical threshold,
#              thres2 = improvement threshold)
csv_list = [("0a.RawGEC_reduced_____010_20.csv", "RawGEC", 0, 0.10),
            ("0b.RawGEC_reduced_____015_20.csv", "RawGEC", 0, 0.15),
            ("0c.RawGEC_reduced_____025_20.csv", "RawGEC", 0, 0.25)]


# columns kept from every file, in output order (`data` is kept as an alias)
column_names = data = ['csvfile', 'modelname', 'thres1', 'thres2',
                       'filename','clean_filename', 'actor','gender',
                       'emotion', 'auto_transcription','label' ,
                       'cleaned_auto_transcription','cleaned_label',
                       'base_transcription', 'base_score',
                       'GEC_transcription_dis0','GEC_score_dis0']


# Collect the per-file frames and concatenate once: growing a frame with
# pd.concat inside the loop is quadratic, and starting from an empty
# object-dtype frame can turn numeric columns into object dtype.
frames = []

for csvfile, modelname, thres1, thres2 in csv_list:
  print(csvfile, modelname, thres1, thres2 )
  aux_df = pd.read_csv('/content/drive/MyDrive/266/Data/GEC_Data/4.RawGEC/' + csvfile)
  # tag every row with the run it came from
  aux_df.insert(0, "thres2", thres2)
  aux_df.insert(0, "thres1", thres1)
  aux_df.insert(0, "modelname", modelname)
  aux_df.insert(0, "csvfile", csvfile)

  # the *_dis0 columns keep their names here; they are renamed when raw_df is built
  merge_df = aux_df[column_names]
  frames.append(merge_df)

df = pd.concat(frames, ignore_index=True)

print(f"concat df #rows: {len(df)}")


In [None]:
# Format RawGEC so that its columns match the other models.
# Raw GEC uses base_transcription, which has been cleaned by the proper_casing function.
# The other models get a base_transcription filler column so the frames can be stacked.
# label should be passed through Rachel's logic before calculating evaluation metrics.

# Rename both columns in ONE call: two consecutive `df.rename(...)` calls each
# start from `df`, so the second one would throw away the first rename.
raw_df = df.rename(columns={'GEC_score_dis0': 'GEC_score',
                            'GEC_transcription_dis0': 'GEC_transcription'})

# remove cleaned_label, because it was cleaned with out-of-the-box cleaning
raw_df = raw_df.drop(['clean_filename', 'cleaned_label'], axis=1)

# Blank out the two transcription columns RawGEC does not use.
# The column is called 'auto_transcription' (see column_names); writing to
# 'autotranscription' would only create an extra, unused column.
raw_df['auto_transcription'] = 'Not used'
raw_df['cleaned_auto_transcription'] = 'Not used'

# names in rawGEC
raw_df.head()



In [None]:
# Clean every raw label with proper_casing and store the result next to it.
cleaned_labels = [proper_casing(raw_label) for raw_label in raw_df['label']]

raw_df['cleaned_label'] = cleaned_labels
raw_df.head()

In [None]:
# `label` has no counterpart in the other models' dataframe, so remove it
raw_df = raw_df.drop(columns=['label'])
raw_df.head()

## Load other GEC models

In [None]:
# list of output datasets and characteristics
# Each entry: (path below GEC_Data/, model name, thres1 = grammatical threshold,
#              thres2 = improvement threshold)
csv_list = [("0.SimpleGEC/0e_simple_gec_09_01_data.csv", "SimpleGEC", 0.9, 0.1),
          ("0.SimpleGEC/0d_simple_gec_075_025_data.csv", "SimpleGEC", 0.75, 0.25),
          ("0.SimpleGEC/0b_simple_gec_05_025_data.csv", "SimpleGEC", 0.5, 0.25),
          ("1.FineTuneGEC/0e_finetune_gec_09_01_data.csv", "FineTunedGEC", 0.9, 0.1),
          ("1.FineTuneGEC/0d_finetune_gec_075_025_10_data.csv", "FineTunedGEC", 0.75, 0.25),
          ("1.FineTuneGEC/0b_FineTuneGEC_reduced_train_05_025.csv", "FineTunedGEC", 0.5, 0.25),
          ("3.PhoneticGEC/0e_phonetic_gec_09_01_20_data.csv", "PhoneticGEC", 0.9, 0.1),
          ("3.PhoneticGEC/0d_phoneticv2_gec_075_025_20_data.csv", "PhoneticGEC", 0.75, 0.25),
          ("3.PhoneticGEC/0b_phonetic_gec_05_25_20_data.csv", "PhoneticGEC", 0.5, 0.25)
          ]


# columns kept from every file, in output order (`data` is kept as an alias)
column_names = data = ['csvfile', 'modelname', 'thres1', 'thres2', 'filename', 'actor',
                       'gender', 'emotion', 'auto_transcription', 'cleaned_auto_transcription',
                       'cleaned_label', 'GEC_transcription', 'base_score', 'GEC_score']


# Collect the per-file frames and concatenate once: growing a frame with
# pd.concat inside the loop is quadratic, and starting from an empty
# object-dtype frame can turn numeric columns into object dtype.
frames = []

for csvfile, modelname, thres1, thres2 in csv_list:
  print(csvfile, modelname, thres1, thres2 )
  aux_df = pd.read_csv('/content/drive/MyDrive/266/Data/GEC_Data/' + csvfile)
  # tag every row with the run it came from
  aux_df.insert(0, "thres2", thres2)
  aux_df.insert(0, "thres1", thres1)
  aux_df.insert(0, "modelname", modelname)
  aux_df.insert(0, "csvfile", csvfile)

  # files that still use the simpleGEC_* column names are mapped to the common
  # names; rename ignores keys that are not present
  aux_df = aux_df.rename(columns={'simpleGEC_score': 'GEC_score',
                                  'simpleGEC_transcription': 'GEC_transcription'})

  merge_df = aux_df[column_names]
  frames.append(merge_df)

df = pd.concat(frames, ignore_index=True)

print(f"concat df #rows: {len(df)}")

In [None]:
# These models have no base_transcription column; add it with a constant
# filler value so df has the same columns as the RawGEC frame and can be stacked.
df['base_transcription'] = 'Not used'

## Full df of all GEC models

In [None]:
# Give raw_df exactly the columns of df, in the same order
# (columns that df does not have are dropped by the reindex).
raw_df = (
    raw_df
    .rename(columns={'GEC_score_dis0': 'GEC_score'})
    .reindex(columns=df.columns)
)

In [None]:
# Stack all GEC models into one frame.
# Original note: the plan was to work on the concatenated frame, but the
# metrics differ between RawGEC and the other models, so the metric cells
# below keep using df and raw_df separately.
all_gecs = pd.concat([df, raw_df])
# sanity check: no rows were lost or duplicated by the concat
assert len(all_gecs) == len(df) + len(raw_df)

In [None]:
# Distinct model names, sorted: a bare set has no stable order, which would
# make the model -> colour pairing in the plots change from run to run.
model_list = sorted(set(df['modelname'].to_list()))
model_list

In [None]:
# Distinct source files, sorted so the list is the same on every run.
file_list = sorted(set(df['csvfile'].to_list()))
file_list

## Load evaluation metrics

In [None]:
# NOTE(review): this rebinds the name `wer`, which the first cell imported
# from jiwer (`from jiwer import wer`). The WER cells below re-import jiwer's
# function for that reason; consider a distinct name for this metric object.
wer = load("wer")

In [None]:
# BLEU metric loaded through `evaluate`; used below via bleu.compute(...)
bleu = load("bleu")

In [None]:
# Google-BLEU (GLEU) metric loaded through `evaluate`; used below via gleu.compute(...)
gleu = load("google_bleu")

In [None]:
# BERTScore metric loaded through `evaluate`; used below via bertscore.compute(...)
bertscore = load("bertscore")

In [None]:
all_gecs.head()

# Calculate WER per row

For autotranscription-label and gectranscription-label pairs




### For all models except for RAW

- Labels = cleaned_labels (have passed through the proper_casing function)
- Transcriptions = cleaned_auto_transcription (have passed through the proper_casing function)
- GEC transcriptions: GEC_transcription

In [None]:
# plain Python lists of the three text columns, for the row-by-row loops below
labels, transcriptions, gecs = (
    df[column].to_list()
    for column in ('cleaned_label', 'cleaned_auto_transcription', 'GEC_transcription')
)

In [None]:
# `wer` was rebound to an `evaluate` metric object by `wer = load("wer")`,
# so jiwer's function has to be imported again in this cell.
from jiwer import wer

# jiwer's signature is wer(reference, hypothesis). The label is the ground
# truth (reference); the transcription / GEC output is the hypothesis.
# Passing them the other way round normalises by the wrong sentence length.

# calculate WER per row for label:transcription pairs
wer_base = [wer(label, transc) for label, transc in zip(labels, transcriptions)]

# calculate WER per row for label:gectranscription pairs
wer_gec = [wer(label, gec) for label, gec in zip(labels, gecs)]

# wer for label:transcription pairs
df['wer_base'] = wer_base

# wer for label:gectranscription pairs
df['wer_gec'] = wer_gec

# difference in wer between label:transcription pairs and label:gectranscription pairs
# (positive = the GEC output has the lower WER)
df['wer_dif'] = df['wer_base'] - df['wer_gec']

### For RAW

- Labels = cleaned_labels (have passed through the proper_casing function)
- Transcriptions = base_transcription (have passed through the proper_casing function)
- GEC transcriptions: GEC_transcription

In [None]:
# plain Python lists of the three RawGEC text columns, for the row-by-row loops below
raw_labels, raw_transcriptions, raw_gecs = (
    raw_df[column].to_list()
    for column in ('cleaned_label', 'base_transcription', 'GEC_transcription')
)

In [None]:
# `wer` was rebound to an `evaluate` metric object by `wer = load("wer")`,
# so jiwer's function has to be imported again in this cell.
from jiwer import wer

# jiwer's signature is wer(reference, hypothesis). The label is the ground
# truth (reference); the transcription / GEC output is the hypothesis.
# Passing them the other way round normalises by the wrong sentence length.

# calculate WER per row for label:transcription pairs
wer_base = [wer(label, transc) for label, transc in zip(raw_labels, raw_transcriptions)]

# calculate WER per row for label:gectranscription pairs
wer_gec = [wer(label, gec) for label, gec in zip(raw_labels, raw_gecs)]

# wer for label:transcription pairs
raw_df['wer_base'] = wer_base

# wer for label:gectranscription pairs
raw_df['wer_gec'] = wer_gec

# difference in wer between label:transcription pairs and label:gectranscription pairs
# (positive = the GEC output has the lower WER)
raw_df['wer_dif'] = raw_df['wer_base'] - raw_df['wer_gec']

# Average WER per model

## For all models except for RAW

In [None]:
df.columns

In [None]:
# mean WER of the GEC transcriptions, one row per (model, thres1, thres2)
mean_wer_gec_df = (
    df.groupby(['modelname','thres1', 'thres2'])['wer_gec']
      .mean()
      .reset_index(name='mean_wer_gec')
)
mean_wer_gec_df

In [None]:
# thres2 is not needed for the plot against thres1
mean_wer_gec_df = mean_wer_gec_df.drop(columns=['thres2'])
mean_wer_gec_df.head()

In [None]:
# one line per model: mean GEC WER against the grammatical threshold (thres1)
color_list = ['orange', 'purple', 'blue']

fig, ax = plt.subplots(figsize=(10, 5))
ax.set_facecolor('white')

for i, model_name in enumerate(model_list):
    # rows of the current model only
    df_model = mean_wer_gec_df.loc[mean_wer_gec_df['modelname'] == model_name]
    ax.plot(df_model['thres1'], df_model['mean_wer_gec'], color=color_list[i], label=model_name)

ax.set_xlabel("Grammatical threshold")
ax.set_ylabel("Mean WER")
ax.set_title("Mean WER vs. Grammatical Threshold")
ax.legend()
plt.show()

In [None]:
# mean WER improvement (wer_base - wer_gec), one row per (model, thres1, thres2).
# NOTE: the value column is named 'mean_wer_gec' although it holds the mean
# of wer_dif; the plot cell below reads it under that name.
mean_wer_dif_df = (
    df.groupby(['modelname','thres1', 'thres2'])['wer_dif']
      .mean()
      .reset_index(name='mean_wer_gec')
)
mean_wer_dif_df

In [None]:
# one line per model: mean WER difference against the grammatical threshold (thres1)
color_list = ['orange', 'purple', 'blue']

fig, ax = plt.subplots(figsize=(10, 5))
ax.set_facecolor('white')

for i, model_name in enumerate(model_list):
    # rows of the current model only; 'mean_wer_gec' holds the mean wer_dif here
    df_model = mean_wer_dif_df.loc[mean_wer_dif_df['modelname'] == model_name]
    ax.plot(df_model['thres1'], df_model['mean_wer_gec'], color=color_list[i], label=model_name)

ax.set_xlabel("Grammatical threshold")
ax.set_ylabel("Mean WER difference")
ax.set_title("Mean WER difference vs. Grammatical Threshold")
ax.legend()
plt.show()

## For raw

In [None]:
# mean WER of the RawGEC transcriptions, one row per (model, thres1, thres2)
mean_wer_raw_gec_df = (
    raw_df.groupby(['modelname','thres1', 'thres2'])['wer_gec']
          .mean()
          .reset_index(name='mean_wer_gec')
)
mean_wer_raw_gec_df


# BLEU, GLEU and BERT Scores per row

## Simple, FineTuned, Phonetic

In [None]:
# cased + punctuated texts as plain lists (one entry per row of df)
labels = df['cleaned_label'].to_list()
transcriptions = df['cleaned_auto_transcription'].to_list()
gecs = df['GEC_transcription'].to_list()

In [None]:
# sentence-level BLEU of each transcription against its label:
# one compute() call (and one result dict) per row
bleu_base = [
    bleu.compute(predictions=[hypothesis], references=[[reference]])
    for reference, hypothesis in zip(labels, transcriptions)
]

In [None]:
# sentence-level BLEU of each GEC output against its label:
# one compute() call (and one result dict) per row
bleu_gec = [
    bleu.compute(predictions=[hypothesis], references=[[reference]])
    for reference, hypothesis in zip(labels, gecs)
]

In [None]:
# sentence-level GLEU of each transcription against its label:
# one compute() call (and one result dict) per row
gleu_base = [
    gleu.compute(predictions=[hypothesis], references=[[reference]])
    for reference, hypothesis in zip(labels, transcriptions)
]

In [None]:
# sentence-level GLEU of each GEC output against its label:
# one compute() call (and one result dict) per row
gleu_gec = [
    gleu.compute(predictions=[hypothesis], references=[[reference]])
    for reference, hypothesis in zip(labels, gecs)
]

In [None]:
# BERTScore of all transcriptions against their labels in one batched call.
# The result is indexed by 'precision', 'recall' and 'f1' in the next cell.
bert_scores_base = bertscore.compute(predictions=transcriptions, references=labels, lang="en")

In [None]:
# split the BERTScore result into its three per-row score lists
bert_precision_base, bert_recall_base, bert_f1_base = (
    bert_scores_base[part] for part in ('precision', 'recall', 'f1')
)

In [None]:
# BERTScore of all GEC outputs against their labels in one batched call
bert_scores_gec = bertscore.compute(predictions=gecs, references=labels, lang="en")

# split the result into its three per-row score lists
bert_precision_gec, bert_recall_gec, bert_f1_gec = (
    bert_scores_gec[part] for part in ('precision', 'recall', 'f1')
)

In [None]:
# BLEU / GLEU results are one dict per row: pull the score out of each dict
for column, results, key in (
    ('bleu_base', bleu_base, 'bleu'),
    ('bleu_gec', bleu_gec, 'bleu'),
    ('gleu_base', gleu_base, 'google_bleu'),
    ('gleu_gec', gleu_gec, 'google_bleu'),
):
    df[column] = [result.get(key) for result in results]

# BERTScore lists are stored as they are
for column, scores in (
    ('bert_prec_base', bert_precision_base),
    ('bert_prec_gec', bert_precision_gec),
    ('bert_recall_base', bert_recall_base),
    ('bert_recall_gec', bert_recall_gec),
    ('bert_f1_base', bert_f1_base),
    ('bert_f1_gec', bert_f1_gec),
):
    df[column] = scores

In [None]:
df.head(50)

## Raw GEC

In [None]:
# Cased + punctuated RawGEC texts as plain lists (one string per row).
# The labels stay flat strings, like `labels` in the section above: the
# BLEU/GLEU cells below already wrap each label as references=[[label]],
# so wrapping it here as well would nest the reference one level too deep.
# A flat list of strings is what the non-raw BERTScore cells pass, too.
rlabels = raw_df['cleaned_label'].to_list()
rtranscriptions = raw_df['base_transcription'].to_list()
rgecs = raw_df['GEC_transcription'].to_list()

In [None]:
# sentence-level BLEU for RawGEC, one compute() call per row:
# first the base transcription, then the GEC output
bleu_base = [
    bleu.compute(predictions=[hypothesis], references=[[reference]])
    for reference, hypothesis in zip(rlabels, rtranscriptions)
]

bleu_gec = [
    bleu.compute(predictions=[hypothesis], references=[[reference]])
    for reference, hypothesis in zip(rlabels, rgecs)
]

In [None]:
# sentence-level GLEU for RawGEC, one compute() call per row:
# first the base transcription, then the GEC output
gleu_base = [
    gleu.compute(predictions=[hypothesis], references=[[reference]])
    for reference, hypothesis in zip(rlabels, rtranscriptions)
]

gleu_gec = [
    gleu.compute(predictions=[hypothesis], references=[[reference]])
    for reference, hypothesis in zip(rlabels, rgecs)
]

In [None]:
# BLEU / GLEU results are one dict per row: pull the score out of each dict
for column, results, key in (
    ('bleu_base', bleu_base, 'bleu'),
    ('bleu_gec', bleu_gec, 'bleu'),
    ('gleu_base', gleu_base, 'google_bleu'),
    ('gleu_gec', gleu_gec, 'google_bleu'),
):
    raw_df[column] = [result.get(key) for result in results]

In [None]:
raw_df.head(50)

In [None]:
# BERTScore of the RawGEC base transcriptions against the labels (one batched call)
bert_scores_base = bertscore.compute(predictions=rtranscriptions, references=rlabels, lang="en")
bert_precision_base, bert_recall_base, bert_f1_base = (
    bert_scores_base[part] for part in ('precision', 'recall', 'f1')
)

In [None]:
# BERTScore of the RawGEC outputs against the labels (one batched call)
bert_scores_gec = bertscore.compute(predictions=rgecs, references=rlabels, lang="en")

bert_precision_gec, bert_recall_gec, bert_f1_gec = (
    bert_scores_gec[part] for part in ('precision', 'recall', 'f1')
)

In [None]:
# store the per-row BERTScore lists as columns of raw_df
for column, scores in (
    ('bert_prec_base', bert_precision_base),
    ('bert_prec_gec', bert_precision_gec),
    ('bert_recall_base', bert_recall_base),
    ('bert_recall_gec', bert_recall_gec),
    ('bert_f1_base', bert_f1_base),
    ('bert_f1_gec', bert_f1_gec),
):
    raw_df[column] = scores

## Include on datasets columns without punct and all lower casing.

In [None]:
def nop_lc(label):
  ''' Remove punctuation from `label` and lower-case what is left.

      nop : no punctuation -- the characters ! , . ? ; : are dropped
      lc  : lower case
  '''
  dropped = frozenset('!,.?;:')
  kept_chars = filter(lambda ch: ch not in dropped, label)
  return ''.join(kept_chars).lower()

In [None]:
# punctuation-free, lower-cased copies of the three text columns
for target_col, source_col in (
    ('nop_lc_label', 'cleaned_label'),
    ('nop_lc_transcription', 'cleaned_auto_transcription'),
    ('nop_lc_gec_trans', 'GEC_transcription'),
):
    df[target_col] = df[source_col].apply(nop_lc)

In [None]:
df.head()

In [None]:
# punctuation-free, lower-cased copies of the three RawGEC text columns
for target_col, source_col in (
    ('nop_lc_label', 'cleaned_label'),
    ('nop_lc_transcription', 'base_transcription'),
    ('nop_lc_gec_trans', 'GEC_transcription'),
):
    raw_df[target_col] = raw_df[source_col].apply(nop_lc)

In [None]:
raw_df.head()

# Now calculate everything again.....

### Phonetics, Simple, Finetuned

In [None]:
# punctuation-free, lower-cased texts as plain lists (one entry per row of df)
labels = df['nop_lc_label'].to_list()
transcriptions = df['nop_lc_transcription'].to_list()
gecs = df['nop_lc_gec_trans'].to_list()

In [None]:
# sentence-level BLEU and GLEU on the normalised texts, one compute() call per row
bleu_base = [
    bleu.compute(predictions=[hypothesis], references=[[reference]])
    for reference, hypothesis in zip(labels, transcriptions)
]

bleu_gec = [
    bleu.compute(predictions=[hypothesis], references=[[reference]])
    for reference, hypothesis in zip(labels, gecs)
]

gleu_base = [
    gleu.compute(predictions=[hypothesis], references=[[reference]])
    for reference, hypothesis in zip(labels, transcriptions)
]

gleu_gec = [
    gleu.compute(predictions=[hypothesis], references=[[reference]])
    for reference, hypothesis in zip(labels, gecs)
]

In [None]:
# BERTScore of the normalised transcriptions against the normalised labels
bert_scores_base = bertscore.compute(predictions=transcriptions, references=labels, lang="en")
bert_precision_base, bert_recall_base, bert_f1_base = (
    bert_scores_base[part] for part in ('precision', 'recall', 'f1')
)

In [None]:
# BERTScore of the normalised GEC outputs against the normalised labels
bert_scores_gec = bertscore.compute(predictions=gecs, references=labels, lang="en")

bert_precision_gec, bert_recall_gec, bert_f1_gec = (
    bert_scores_gec[part] for part in ('precision', 'recall', 'f1')
)

In [None]:
# BLEU / GLEU results are one dict per row: pull the score out of each dict
for column, results, key in (
    ('nop_lc_bleu_base', bleu_base, 'bleu'),
    ('nop_lc_bleu_gec', bleu_gec, 'bleu'),
    ('nop_lc_gleu_base', gleu_base, 'google_bleu'),
    ('nop_lc_gleu_gec', gleu_gec, 'google_bleu'),
):
    df[column] = [result.get(key) for result in results]

# BERTScore lists are stored as they are
for column, scores in (
    ('nop_lc_bert_prec_base', bert_precision_base),
    ('nop_lc_bert_prec_gec', bert_precision_gec),
    ('nop_lc_bert_recall_base', bert_recall_base),
    ('nop_lc_bert_recall_gec', bert_recall_gec),
    ('nop_lc_bert_f1_base', bert_f1_base),
    ('nop_lc_bert_f1_gec', bert_f1_gec),
):
    df[column] = scores

### Raw :'(

In [None]:
# Punctuation-free, lower-cased RawGEC texts as plain lists (one string per row).
# The labels stay flat strings: the BLEU/GLEU cells below already wrap each
# label as references=[[label]], so wrapping it here as well would nest the
# reference one level too deep. A flat list of strings is what the non-raw
# BERTScore cells pass, too.
rlabels = raw_df['nop_lc_label'].to_list()
rtranscriptions = raw_df['nop_lc_transcription'].to_list()
rgecs = raw_df['nop_lc_gec_trans'].to_list()

In [None]:
# sentence-level BLEU on the normalised RawGEC texts, one compute() call per row
bleu_base = [
    bleu.compute(predictions=[hypothesis], references=[[reference]])
    for reference, hypothesis in zip(rlabels, rtranscriptions)
]

bleu_gec = [
    bleu.compute(predictions=[hypothesis], references=[[reference]])
    for reference, hypothesis in zip(rlabels, rgecs)
]

In [None]:
# sentence-level GLEU on the normalised RawGEC texts, one compute() call per row
gleu_base = [
    gleu.compute(predictions=[hypothesis], references=[[reference]])
    for reference, hypothesis in zip(rlabels, rtranscriptions)
]

gleu_gec = [
    gleu.compute(predictions=[hypothesis], references=[[reference]])
    for reference, hypothesis in zip(rlabels, rgecs)
]

In [None]:
# BLEU / GLEU results are one dict per row: pull the score out of each dict
for column, results, key in (
    ('nop_lc_bleu_base', bleu_base, 'bleu'),
    ('nop_lc_bleu_gec', bleu_gec, 'bleu'),
    ('nop_lc_gleu_base', gleu_base, 'google_bleu'),
    ('nop_lc_gleu_gec', gleu_gec, 'google_bleu'),
):
    raw_df[column] = [result.get(key) for result in results]

In [None]:
raw_df.head(50)

In [None]:
# BERTScore of the normalised RawGEC base transcriptions against the normalised labels
bert_scores_base = bertscore.compute(predictions=rtranscriptions, references=rlabels, lang="en")
bert_precision_base, bert_recall_base, bert_f1_base = (
    bert_scores_base[part] for part in ('precision', 'recall', 'f1')
)

In [None]:
# BERTScore of the normalised RawGEC outputs against the normalised labels
bert_scores_gec = bertscore.compute(predictions=rgecs, references=rlabels, lang="en")

bert_precision_gec, bert_recall_gec, bert_f1_gec = (
    bert_scores_gec[part] for part in ('precision', 'recall', 'f1')
)

In [None]:
# store the per-row BERTScore lists (normalised texts) as columns of raw_df
for column, scores in (
    ('nop_lc_bert_prec_base', bert_precision_base),
    ('nop_lc_bert_prec_gec', bert_precision_gec),
    ('nop_lc_bert_recall_base', bert_recall_base),
    ('nop_lc_bert_recall_gec', bert_recall_gec),
    ('nop_lc_bert_f1_base', bert_f1_base),
    ('nop_lc_bert_f1_gec', bert_f1_gec),
):
    raw_df[column] = scores

### WER no casing no punct

In [None]:
# re-import jiwer's function: the name `wer` was rebound by `wer = load("wer")`
from jiwer import wer

# punctuation-free, lower-cased texts of the Simple / FineTuned / Phonetic models
labels = df['nop_lc_label'].to_list()
transcriptions = df['nop_lc_transcription'].to_list()
gecs = df['nop_lc_gec_trans'].to_list()

# jiwer's signature is wer(reference, hypothesis): the label is the reference,
# the transcription / GEC output is the hypothesis
nop_lc_wer_base = [wer(label, transc) for label, transc in zip(labels, transcriptions)]
nop_lc_wer_gec = [wer(label, gec) for label, gec in zip(labels, gecs)]

df['nop_lc_wer_base'] = nop_lc_wer_base
df['nop_lc_wer_gec'] = nop_lc_wer_gec


In [None]:
# re-import jiwer's function: the name `wer` was rebound by `wer = load("wer")`
from jiwer import wer

# punctuation-free, lower-cased texts of the RawGEC model
labels = raw_df['nop_lc_label'].to_list()
transcriptions = raw_df['nop_lc_transcription'].to_list()
gecs = raw_df['nop_lc_gec_trans'].to_list()

# jiwer's signature is wer(reference, hypothesis): the label is the reference,
# the transcription / GEC output is the hypothesis
nop_lc_wer_base = [wer(label, transc) for label, transc in zip(labels, transcriptions)]
nop_lc_wer_gec = [wer(label, gec) for label, gec in zip(labels, gecs)]

raw_df['nop_lc_wer_base'] = nop_lc_wer_base
raw_df['nop_lc_wer_gec'] = nop_lc_wer_gec


# Queries for BLEU, GLEU, BERT PRECISION, RECALL, F1 - **with** punct and casing



In [None]:
df.columns

In [None]:
print('base values')

# mean of every baseline (plain transcription) metric, one row per model
base_cols = ['wer_base', 'bleu_base', 'gleu_base', 'bert_prec_base',
             'bert_recall_base', 'bert_f1_base']
pt_df = pd.pivot_table(
    df,
    index=['modelname'],
    values=base_cols,
    aggfunc={col: 'mean' for col in base_cols},
)
base_m = pt_df.reset_index()
# show the first model's baseline as a column
base_m[:1].T

In [None]:
print('model values')

# mean of every GEC metric per (model, thres1, thres2).
# wer_dif is aggregated too: it was listed in `values` but missing from the
# aggfunc mapping, so it never reached the table.
gec_cols = ['wer_gec', 'wer_dif', 'bleu_gec', 'gleu_gec', 'bert_prec_gec',
            'bert_recall_gec', 'bert_f1_gec']
pt_df = pd.pivot_table(df,
                       index=['modelname', 'thres1', 'thres2'],
                       values=gec_cols,
                       aggfunc={col: 'mean' for col in gec_cols})
query_w_df = pt_df.reset_index()

# Baseline row ("base" = plain transcription, no GEC). It is computed from the
# data instead of pasting numbers from an earlier run: the per-model means of
# the *_base columns, first model in sorted order (the row `base_m[:1]` shows),
# stored under the matching *_gec column names so it lines up with the models.
base_cols = ['wer_base', 'bleu_base', 'gleu_base', 'bert_prec_base',
             'bert_recall_base', 'bert_f1_base']
base_means = df.groupby('modelname')[base_cols].mean().iloc[0]
base_row = {'modelname': "base"}
base_row.update({col.replace('_base', '_gec'): float(val) for col, val in base_means.items()})

# insert the baseline at position idx; the leading (empty) slice keeps the
# column order of query_w_df
idx = 0
query_w_df = pd.concat([query_w_df.iloc[:idx, :], pd.DataFrame([base_row]), query_w_df.iloc[idx:, :]]).reset_index(drop=False)
# baseline stays on top, models are sorted by WER (best first)
query_w_df_sorted = pd.concat([query_w_df.iloc[:1], query_w_df.iloc[1:].sort_values(by='wer_gec')])
query_w_df_sorted

In [None]:
print('base values')

# mean of every baseline (base transcription) metric for RawGEC, one row per model
base_cols = ['wer_base', 'bleu_base', 'gleu_base', 'bert_prec_base',
             'bert_recall_base', 'bert_f1_base']
pt_df = pd.pivot_table(
    raw_df,
    index=['modelname'],
    values=base_cols,
    aggfunc={col: 'mean' for col in base_cols},
)
base_rm = pt_df.reset_index()
# show the first row as a column
base_rm[:1].T

In [None]:
print('raw values')

# mean of every GEC metric per (model, thres1, thres2) for RawGEC.
# wer_dif is aggregated too: it was listed in `values` but missing from the
# aggfunc mapping, so it never reached the table.
gec_cols = ['wer_gec', 'wer_dif', 'bleu_gec', 'gleu_gec', 'bert_prec_gec',
            'bert_recall_gec', 'bert_f1_gec']
pt_df = pd.pivot_table(raw_df,
                       index=['modelname', 'thres1', 'thres2'],
                       values=gec_cols,
                       aggfunc={col: 'mean' for col in gec_cols})
query_w_rawdf = pt_df.reset_index()

# Baseline row ("base" = base transcription, no GEC). It is computed from the
# data instead of pasting numbers from an earlier run: the per-model means of
# the *_base columns, first model in sorted order (the row `base_rm[:1]` shows),
# stored under the matching *_gec column names so it lines up with the models.
base_cols = ['wer_base', 'bleu_base', 'gleu_base', 'bert_prec_base',
             'bert_recall_base', 'bert_f1_base']
base_means = raw_df.groupby('modelname')[base_cols].mean().iloc[0]
base_row = {'modelname': "base"}
base_row.update({col.replace('_base', '_gec'): float(val) for col, val in base_means.items()})

# insert the baseline at position idx; the leading (empty) slice keeps the
# column order of query_w_rawdf
idx = 0
query_w_rawdf = pd.concat([query_w_rawdf.iloc[:idx, :], pd.DataFrame([base_row]), query_w_rawdf.iloc[idx:, :]]).reset_index(drop=False)
# baseline stays on top, runs are sorted by WER (best first)
query_w_rawdf_sorted = pd.concat([query_w_rawdf.iloc[:1], query_w_rawdf.iloc[1:].sort_values(by='wer_gec')])
query_w_rawdf_sorted


# Queries for BLEU, GLEU, BERT PRECISION, RECALL, F1 - **without** punct and casing

In [None]:
df.columns

In [None]:
print('base values')

# mean of every baseline metric on the normalised (no punctuation, lower case)
# texts, one row per model
base_cols = ['nop_lc_wer_base', 'nop_lc_bleu_base', 'nop_lc_gleu_base', 'nop_lc_bert_prec_base',
             'nop_lc_bert_recall_base', 'nop_lc_bert_f1_base']
pt_df = pd.pivot_table(
    df,
    index=['modelname'],
    values=base_cols,
    aggfunc={col: 'mean' for col in base_cols},
)
base_m = pt_df.reset_index()
# show the first model's baseline as a column
base_m[:1].T

In [None]:
print('model values')

# mean of every GEC metric on the normalised (no punctuation, lower case)
# texts per (model, thres1, thres2)
gec_cols = ['nop_lc_wer_gec',
            'nop_lc_bleu_gec', 'nop_lc_gleu_gec', 'nop_lc_bert_prec_gec',
            'nop_lc_bert_recall_gec', 'nop_lc_bert_f1_gec']
pt_df = pd.pivot_table(df,
                       index=['modelname', 'thres1', 'thres2'],
                       values=gec_cols,
                       aggfunc={col: 'mean' for col in gec_cols})
query_wo_df = pt_df.reset_index()

# Baseline row ("base" = plain transcription, no GEC). It is computed from the
# data instead of pasting numbers from an earlier run: the per-model means of
# the *_base columns, first model in sorted order (the row `base_m[:1]` shows),
# stored under the matching *_gec column names so it lines up with the models.
base_cols = ['nop_lc_wer_base', 'nop_lc_bleu_base', 'nop_lc_gleu_base', 'nop_lc_bert_prec_base',
             'nop_lc_bert_recall_base', 'nop_lc_bert_f1_base']
base_means = df.groupby('modelname')[base_cols].mean().iloc[0]
base_wo_row = {'modelname': "base"}
base_wo_row.update({col.replace('_base', '_gec'): float(val) for col, val in base_means.items()})

# insert the baseline at position idx; the leading (empty) slice keeps the
# column order of query_wo_df
idx = 0
query_wo_df = pd.concat([query_wo_df.iloc[:idx, :], pd.DataFrame([base_wo_row]), query_wo_df.iloc[idx:, :]]).reset_index(drop=False)
# baseline stays on top, models are sorted by WER (best first)
query_wo_df_sorted = pd.concat([query_wo_df.iloc[:1], query_wo_df.iloc[1:].sort_values(by='nop_lc_wer_gec')])
query_wo_df_sorted

In [None]:
print('base values')

# mean of every baseline metric on the normalised (no punctuation, lower case)
# RawGEC texts, one row per model
base_cols = ['nop_lc_wer_base', 'nop_lc_bleu_base', 'nop_lc_gleu_base', 'nop_lc_bert_prec_base',
             'nop_lc_bert_recall_base', 'nop_lc_bert_f1_base']
pt_df = pd.pivot_table(
    raw_df,
    index=['modelname'],
    values=base_cols,
    aggfunc={col: 'mean' for col in base_cols},
)
base_rm_wo = pt_df.reset_index()
# show the first row as a column
base_rm_wo[:1].T

In [None]:
print('model values')

# mean of every GEC metric on the normalised (no punctuation, lower case)
# RawGEC texts per (model, thres1, thres2).
# NOTE: this reuses (and overwrites) the names query_wo_df / query_wo_df_sorted
# of the Simple / FineTuned / Phonetic table.
gec_cols = ['nop_lc_wer_gec',
            'nop_lc_bleu_gec', 'nop_lc_gleu_gec', 'nop_lc_bert_prec_gec',
            'nop_lc_bert_recall_gec', 'nop_lc_bert_f1_gec']
pt_df = pd.pivot_table(raw_df,
                       index=['modelname', 'thres1', 'thres2'],
                       values=gec_cols,
                       aggfunc={col: 'mean' for col in gec_cols})
query_wo_df = pt_df.reset_index()

# Baseline row ("base" = base transcription, no GEC). It is computed from the
# data instead of pasting numbers from an earlier run: the per-model means of
# the *_base columns, first model in sorted order (the row `base_rm_wo[:1]`
# shows), stored under the matching *_gec column names.
base_cols = ['nop_lc_wer_base', 'nop_lc_bleu_base', 'nop_lc_gleu_base', 'nop_lc_bert_prec_base',
             'nop_lc_bert_recall_base', 'nop_lc_bert_f1_base']
base_means = raw_df.groupby('modelname')[base_cols].mean().iloc[0]
base_wo_row = {'modelname': "base"}
base_wo_row.update({col.replace('_base', '_gec'): float(val) for col, val in base_means.items()})

# insert the baseline at position idx; the leading (empty) slice keeps the
# column order of query_wo_df
idx = 0
query_wo_df = pd.concat([query_wo_df.iloc[:idx, :], pd.DataFrame([base_wo_row]), query_wo_df.iloc[idx:, :]]).reset_index(drop=False)
# baseline stays on top, runs are sorted by WER (best first)
query_wo_df_sorted = pd.concat([query_wo_df.iloc[:1], query_wo_df.iloc[1:].sort_values(by='nop_lc_wer_gec')])
query_wo_df_sorted

## Does this mean our model literally did not correct any words, and only corrected punctuation?

In [None]:
# keep only the RawGEC rows of the run tagged with thres2 == 0.25
top_model_df = raw_df.loc[raw_df['thres2'] == 0.25]

In [None]:
top_model_df

In [None]:
# bookkeeping / unused columns that are not wanted in the per-example export
# ('modelname' was listed twice; once is enough)
column_to_drop = ['csvfile', 'filename', 'auto_transcription', 'modelname', 'thres1', 'thres2',
                  'cleaned_auto_transcription', 'actor', 'wer_dif']
top_model_df = top_model_df.drop(columns=column_to_drop)

In [None]:
top_model_df.columns


In [None]:
# final column order: metadata, the three texts, then each metric as a
# base / gec pair -- first on the cased texts, then on the normalised ones
order_col = ['gender', 'emotion', 'base_transcription', 'GEC_transcription', 'cleaned_label',
             'base_score', 'GEC_score',  'wer_base', 'wer_gec',
             'bleu_base', 'bleu_gec', 'gleu_base', 'gleu_gec',
             'bert_prec_base', 'bert_prec_gec',
             'bert_recall_base', 'bert_recall_gec',
             'bert_f1_base', 'bert_f1_gec',
             'nop_lc_transcription', 'nop_lc_gec_trans', 'nop_lc_label',
             'nop_lc_wer_base', 'nop_lc_wer_gec',
             'nop_lc_bleu_base', 'nop_lc_bleu_gec',
             'nop_lc_gleu_base', 'nop_lc_gleu_gec',
             'nop_lc_bert_prec_base', 'nop_lc_bert_prec_gec',
             'nop_lc_bert_recall_base', 'nop_lc_bert_recall_gec',
             'nop_lc_bert_f1_base', 'nop_lc_bert_f1_gec'
       ]
# .copy() makes top_model_df an independent frame, so the column assignments
# in later cells do not raise SettingWithCopyWarning
top_model_df = top_model_df[order_col].copy()

In [None]:
top_model_df

In [None]:
# save the selected examples to Drive as CSV (row index not written)
top_model_df.to_csv('/content/drive/MyDrive/266/EDA/GEC_examples/topmodel.csv', index=False)


# Compare GEC and Transcriptions from a no casing and casing POV



### Function definitions

In [None]:
def check_pos(word, sentence, pos_tag):
    """Report whether `word` carries the expected POS tag inside `sentence`.

    The sentence is parsed with the module-level spaCy pipeline `nlp`; the
    first token whose text equals `word` exactly is the one that is checked.

    Args:
        word: token text to look for (exact, case-sensitive match).
        sentence: the sentence to parse.
        pos_tag: the POS tag (`token.pos_` value) the word is expected to have.

    Returns:
        A message string: either that the tag is as expected, or which tag
        was found instead (None when the word is not a token of the sentence).
    """
    # POS of the first matching token; None when nothing matches
    word_pos = next(
        (token.pos_ for token in nlp(sentence) if token.text == word),
        None,
    )

    if word_pos != pos_tag:
        return f"The word' {word}' plays another role: {word_pos} "

    return f"POS tag' {word_pos}' is as expected "

In [None]:
def sentence_postag(sentence):
    """Tokenise `sentence` with the module-level spaCy pipeline `nlp`.

    The sentence is parsed exactly as given; no lower-casing is applied.

    Returns:
        (tokens, pos_tags): the spaCy tokens of the sentence and, in the
        same order, the `pos_` value of each token.
    """
    parsed = nlp(sentence)
    tokens = list(parsed)
    pos_tags = [token.pos_ for token in tokens]
    return tokens, pos_tags

### Create token and postag base for counts

In [None]:
# Add the no-casing / no-punctuation ("nop_lc") text columns back by applying `nop_lc`
# (a helper defined outside this section) to the label, the base transcription and the
# GEC transcription.
top_model_df['nop_lc_label'] = top_model_df['cleaned_label'].apply(nop_lc)
top_model_df['nop_lc_transcription'] = top_model_df['base_transcription'].apply(nop_lc)
top_model_df['nop_lc_gec_trans'] = top_model_df['GEC_transcription'].apply(nop_lc)

In [None]:
def _pos_and_tokens(sentences):
    """Parse each sentence once and return (POS-tag lists, token-text lists), one inner list per sentence."""
    docs = [nlp(sentence) for sentence in sentences]
    pos_lists = [[token.pos_ for token in doc] for doc in docs]
    tok_lists = [[token.text for token in doc] for doc in docs]
    return pos_lists, tok_lists


# Each sentence is parsed a single time; tags and token texts are both read from
# that one parse instead of running the spaCy pipeline twice per sentence.

# punct and cased
transcription = top_model_df['base_transcription'].to_list()
pos_transcription, tok_transcription = _pos_and_tokens(transcription)

label = top_model_df['cleaned_label'].to_list()
pos_label, tok_label = _pos_and_tokens(label)

# no punct and lowercased
transcription = top_model_df['nop_lc_transcription'].to_list()
pos_transcription_nop_lc, tok_transcription_nop_lc = _pos_and_tokens(transcription)

label = top_model_df['nop_lc_label'].to_list()
pos_label_nop_lc, tok_label_nop_lc = _pos_and_tokens(label)


In [None]:
# Store the per-sentence POS-tag and token lists for the cased, punctuated text.
top_model_df['pos_transcription'] = pos_transcription
top_model_df['tok_transcription'] = tok_transcription
top_model_df['pos_label'] = pos_label
top_model_df['tok_label'] = tok_label

# Same lists for the `nop_lc` versions of the text.
top_model_df['pos_transcription_nop_lc'] = pos_transcription_nop_lc
top_model_df['tok_transcription_nop_lc'] = tok_transcription_nop_lc
top_model_df['pos_label_nop_lc'] = pos_label_nop_lc
top_model_df['tok_label_nop_lc'] = tok_label_nop_lc

In [None]:
# Coarse POS tag names used as keys for the counters below.
pos_tags_list = [
    "ADJ", "ADP", "ADV", "AUX", "CONJ", "CCONJ", "DET", "INTJ", "NOUN",
    "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X", "SPACE"
]

# One {'total', 'dif'} counter per tag, both starting at 0.
# NOTE(review): the meaning of the `a_` / `m_` prefixes is not visible in this section — confirm.
a_pos_counts = {tag: {'total': 0, 'dif': 0} for tag in pos_tags_list}
m_pos_counts = {tag: {'total': 0, 'dif': 0} for tag in pos_tags_list}

In [None]:
# Keep only the rows whose nop_lc transcription and nop_lc label have the same
# number of tokens, so tokens can be compared position by position.
n_tok_transcription = top_model_df['tok_transcription_nop_lc'].map(len)
n_tok_label = top_model_df['tok_label_nop_lc'].map(len)
aligned_df = top_model_df[n_tok_transcription == n_tok_label]
aligned_df

In [None]:
# Number of rows whose token counts match.
len(aligned_df)

In [None]:
# Number of rows left out of aligned_df because their token counts differ.
len(top_model_df) - len(aligned_df)

In [None]:
# Per-row POS-tag lists for the aligned rows.
# These must come from the POS columns, not the raw text columns: iterating a raw
# sentence string yields characters, not tags. The nop_lc variants are used because
# aligned_df was filtered on the nop_lc token counts, so only those lists are
# guaranteed to have equal length within a row.
# NOTE(review): switch to 'pos_transcription' / 'pos_label' only if the alignment
# filter is changed to the cased token lists as well.
aligned_pos_transcription = aligned_df['pos_transcription_nop_lc'].to_list()
aligned_pos_label = aligned_df['pos_label_nop_lc'].to_list()

In [None]:
# Sanity check: there is one label entry for every transcription entry.
assert len(aligned_pos_label) == len(aligned_pos_transcription)

In [None]:
# Distinct items seen in the label rows and then the transcription rows, kept in
# first-seen order (dict keys preserve insertion order and drop repeats).
all_pos = list(dict.fromkeys(
    pos
    for each in aligned_pos_label + aligned_pos_transcription
    for pos in each
))

In [None]:
# One empty inner dict per entry of all_pos; the inner dicts will hold the confusion counts.
confusion = {each: {} for each in all_pos}

In [None]:
# Count, for every label tag, how often each *different* transcription tag appears
# at the same position: confusion[label_tag][transcription_tag] -> count.

# Start from empty counters so that re-running this cell does not double-count.
for tag_counts in confusion.values():
  tag_counts.clear()

# The loop names are deliberately distinct from the notebook-level `pos_transcription`
# and `pos_label` lists so that those lists are not overwritten by this cell.
for row_idx, row_tags_transcription in enumerate(aligned_pos_transcription):
  row_tags_label = aligned_pos_label[row_idx]
  for tok_idx, tag_transcription in enumerate(row_tags_transcription):
    tag_label = row_tags_label[tok_idx]
    if tag_label != tag_transcription:
      tag_counts = confusion[tag_label]
      tag_counts[tag_transcription] = tag_counts.get(tag_transcription, 0) + 1

In [None]:
# Order the outer keys by their total number of confusions, largest first
# (ties keep their original relative order because the sort is stable).
ranked_tags = sorted(confusion, key=lambda tag: sum(confusion[tag].values()), reverse=True)
sorted_confusion = {tag: confusion[tag] for tag in ranked_tags}
sorted_confusion

In [None]:
# Tabulate the counts: outer keys become columns, inner keys become rows;
# pairs that never occurred are filled with 0.
confusion_df = pd.DataFrame.from_dict(sorted_confusion).fillna(0)
confusion_df

In [None]:
# Token-level examples for three specific POS confusions.
# Keys are (row, token) positions within aligned_df; values are
# (label token, transcription token).
mistake1 = {}  # label PROPN, transcription NOUN
mistake2 = {}  # label PROPN, transcription ADJ
mistake3 = {}  # label PRON,  transcription NOUN

tracked_confusions = {
    ('PROPN', 'NOUN'): mistake1,
    ('PROPN', 'ADJ'): mistake2,
    ('PRON', 'NOUN'): mistake3,
}

# Tags and tokens are read from the same aligned_df rows so that (i, j) addresses
# the same sentence and token in all four lists. The notebook-level `tok_label` /
# `tok_transcription` lists cover every row of top_model_df (and the cased text),
# so indexing them with positions from the aligned subset would pick the wrong sentences.
# NOTE(review): the nop_lc columns are used because aligned_df was filtered on the
# nop_lc token counts — confirm this is the intended view for these examples.
aligned_rows = zip(
    aligned_df['pos_label_nop_lc'],
    aligned_df['pos_transcription_nop_lc'],
    aligned_df['tok_label_nop_lc'],
    aligned_df['tok_transcription_nop_lc'],
)

# Loop names avoid `label` / `transcription`, which hold notebook-level lists.
for i, (row_tags_label, row_tags_trans, row_toks_label, row_toks_trans) in enumerate(aligned_rows):
  for j, (tag_label, tag_trans) in enumerate(zip(row_tags_label, row_tags_trans)):
    bucket = tracked_confusions.get((tag_label, tag_trans))
    if bucket is not None:
      bucket[(i, j)] = (row_toks_label[j], row_toks_trans[j])

# Export both DFs, just in case

In [None]:
# Save both frames as CSV in the mounted Drive folder; index=False leaves the row index out.
df.to_csv('/content/drive/MyDrive/266/EDA/GEC_examples/df_GEC.csv', index=False)
raw_df.to_csv('/content/drive/MyDrive/266/EDA/GEC_examples/rawdf_GEC.csv', index=False)

# Check which sentences were initially over the grammatical acceptability threshold, and which passed it after GEC

In [None]:
# Flag sentences whose base (pre-GEC) score is already at or above the threshold.
# NOTE(review): `thres1` is defined outside this section — confirm it is the intended
# cut-off for both flags.
df['acc_base'] = df['base_score'] >= thres1

# Flag sentences whose score is at or above the same threshold after GEC.
df['acc_GEC'] = df['GEC_score'] >= thres1

## Calculate difference in grammatical score before (base) and after GEC

In [None]:
# True when the GEC score is at least the base score.
# Note that `>=` also marks sentences whose score did not change as True.
df['imp_base_GEC'] = df['GEC_score'] >= df['base_score']

# Signed change in score after GEC (positive means the GEC score is higher).
df['totalimp_base_GEC'] = df['GEC_score'] - df['base_score']

In [None]:
# Constant 1 on every row — presumably so sentence counts can be obtained by summing
# this column in later aggregations (confirm against downstream use).
df['n_sentences'] = 1

In [None]:
# A cell only renders its last expression, so print the row count explicitly and
# leave the preview as the final expression; otherwise df.head() is silently discarded.
print(len(df))
df.head()

In [None]:
# Sentences that were below the threshold before GEC and meet it afterwards.
# (`acc_base` / `acc_GEC` are boolean columns, so `~` selects the False rows.)
improv_df = df[~df['acc_base'] & df['acc_GEC']]

In [None]:
# Show the newly acceptable sentences, largest score gain first (display only; improv_df is not modified).
improv_df.sort_values('totalimp_base_GEC', ascending=False)

In [None]:
# Sentences that were below the threshold before GEC and whose score did not change at all.
bad_df = df[~df['acc_base'] & df['totalimp_base_GEC'].eq(0)]

In [None]:
# Regressions: sentences that met the threshold before GEC but no longer do afterwards.
worse_now = df[df['acc_base'] & ~df['acc_GEC']]
len(worse_now)

In [None]:
# Sentences that met the threshold before GEC and still meet it afterwards.
no_change_good = df[df['acc_base'] & df['acc_GEC']]
len(no_change_good)

In [None]:
# Sentences that were below the threshold before GEC and remain below it afterwards.
no_change_bad = df[~(df['acc_base'] | df['acc_GEC'])]
len(no_change_bad)

In [None]:
# Check that all cases have been accounted for: the four subsets are the four
# combinations of acc_base x acc_GEC, so their sizes must add up to the full frame.
assert len(improv_df) + len(worse_now) + len(no_change_good) + len(no_change_bad) == len(df)

In [None]:
# Overall counts across all models:
# - improv_df: below the threshold before GEC, at or above it afterwards
# - bad_df: below the threshold before GEC and no change in score at all
print(f"{len(improv_df)} instances of improvement, in all models")
print(f"{len(bad_df)} instances of no improvement, in all models")

# Per-file copies of the three views, keyed 't_<file>', 'bad_<file>' and 'improv_<file>'.
dfs_dict = {}
for file in file_list:
    aux = df[df['csvfile'] == file].copy()
    aux2 = bad_df[bad_df['csvfile'] == file].copy()
    aux3 = improv_df[improv_df['csvfile'] == file].copy()

    dfs_dict[f't_{file}'] = aux
    dfs_dict[f'bad_{file}'] = aux2
    # Must be the improved subset (aux3); storing aux2 here would duplicate the 'bad' rows.
    dfs_dict[f'improv_{file}'] = aux3

## Look at individual sentences
- Is there a pattern in the sentences? (i.e. sentences which all models struggle with?)

In [None]:
## Sentences which show