In [None]:
import sys
# Both flags are true exactly when the `google.colab` package is already loaded
# in this interpreter (presumably meaning the notebook is running on Colab).
COLAB_LOC = RUN_CELLS = 'google.colab' in sys.modules

# Project folder name (with trailing slash) and its location under the mounted Drive.
PROJECT_NAME = 'Question_Generation/'
FOLDER_LOCATION = f'/content/drive/MyDrive/Dissertation/QG/{PROJECT_NAME}'

In [None]:
if COLAB_LOC:
    # Only when running on Colab: mount Google Drive at /content/drive,
    # requesting a remount (force_remount=True) on every run of this cell.
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
if COLAB_LOC:
    # Setting as working environment
    import os

    def create_and_set_working_directory(path: str) -> None:
        """Make ``path`` the current working directory, creating it first if needed.

        Args:
            path: Directory to switch into.

        Raises:
            OSError: If the directory cannot be created or entered.
        """
        # makedirs (unlike mkdir) also creates any missing parent folders, so
        # the call does not fail when intermediate directories are absent.
        if not os.path.isdir(path):
            os.makedirs(path, exist_ok=True)
            print(path + ' did not exist but was created.')

        # change the OS to use your project folder as the working directory
        os.chdir(path)

        print('Working directory changed to: \n' + path)

    create_and_set_working_directory(FOLDER_LOCATION)
    # Pure-Python replacement for the `!pwd` shell escape: show the new cwd.
    print(os.getcwd())

Working directory changed to: 
/content/drive/MyDrive/Dissertation/QG/Question_Generation/
/content/drive/MyDrive/Dissertation/QG/Question_Generation


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the model output table from the current working directory; later cells
# read its 'Generated Question' and 'SQuAD' columns.
df = pd.read_csv('Model_4_output.csv')

In [None]:
# Pull out the two text columns that are compared pairwise further down:
# the generated questions and the questions stored in the 'SQuAD' column.
generated_questions = df.loc[:, 'Generated Question']
original_question_list = df.loc[:, 'SQuAD']

In [None]:
# Display the generated-question column for a quick visual check.
generated_questions

0       What percentage of the incoming class at Notre...
1              What did Pauline Viardot advise Chopin on?
2       When did Destiny's Child change their name to ...
3       What is the most common classification of anti...
4       How many Grammy nomination is Beyoncé the most...
                              ...                        
1496                What has Beyoncé received praise for?
1497                         What is genocide defined as?
1498     Who came to Paris with her husband and daughter?
1499    What is on the top of the Main Building's gold...
1500    How much money did Beyoncé contribute to the S...
Name: Generated Question, Length: 1501, dtype: object

In [None]:
# Display the 'SQuAD' question column for a quick visual check.
original_question_list

0       What percentage of students at Notre Dame part...
1           What two things did Chopin advise Viardot on?
2       Beyonce's group changed their name to Destiny'...
3       Besides sprectrum of activity and chemical str...
4          How many Grammy nominations does Beyonce have?
                              ...                        
1496                For what does Beyonce receive praise?
1497                  What is the definition of genocide?
1498      Which family member came to Paris in June 1849?
1499    What sits on top of the Main Building at Notre...
1500    How much money did Beyoncé contribute at the b...
Name: SQuAD, Length: 1501, dtype: object

In [None]:
def wer_score(hyp, ref, print_matrix=False):
  """Return the token-level edit (Levenshtein) distance between ``hyp`` and ``ref``.

  The result is the minimum number of insertions, deletions and substitutions
  that turn one token sequence into the other. It is an absolute count and is
  not normalised by the length of the reference.

  Args:
    hyp: Sequence of hypothesis tokens (e.g. ``sentence.split()``).
    ref: Sequence of reference tokens.
    print_matrix: When true, print the full dynamic-programming table.

  Returns:
    The edit distance as an ``int``.
  """
  N = len(hyp)
  M = len(ref)
  # Row 0 and column 0 stand for the empty prefix, so the table needs one more
  # row and column than there are tokens. With an N x M table the first token
  # of each sequence is never compared and empty inputs index out of bounds.
  L = np.zeros((N + 1, M + 1))
  L[:, 0] = np.arange(N + 1)  # hyp prefix -> empty ref: i deletions
  L[0, :] = np.arange(M + 1)  # empty hyp -> ref prefix: j insertions
  for i in range(1, N + 1):
    for j in range(1, M + 1):
      deletion = L[i-1, j] + 1
      insertion = L[i, j-1] + 1
      # Cell (i, j) covers the first i / j tokens, hence the -1 offsets.
      sub = 1 if hyp[i-1] != ref[j-1] else 0
      substitution = L[i-1, j-1] + sub
      L[i, j] = min(deletion, insertion, substitution)
  if print_matrix:
    print("WER matrix ({}x{}): ".format(N + 1, M + 1))
    print(L)
  return int(L[N, M])

In [None]:
# Score each (generated, SQuAD) question pair on whitespace-separated tokens,
# keeping the results in the same order as the rows.
wer_score_list = [
  wer_score(hyp.split(), ref.split())
  for hyp, ref in zip(generated_questions, original_question_list)
]

In [None]:
import statistics

# Wrap the per-pair scores in a Series so summary statistics can be computed on it.
x = pd.Series(data=wer_score_list)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the scores.
x.describe()

count    1501.000000
mean        7.936709
std         3.892128
min         0.000000
25%         5.000000
50%         8.000000
75%        10.000000
max        30.000000
dtype: float64

In [None]:
# Collect every question pair whose score equals TARGET_WER and print its row index.
TARGET_WER = 30
gen = []
org = []
# The loop variable is `score` rather than `x` so that it does not overwrite
# the Series `x` that holds the score statistics.
for idx, score in enumerate(wer_score_list):
  if score == TARGET_WER:
    print(idx)
    gen.append(generated_questions[idx])
    org.append(original_question_list[idx])
# Each list is stored under the column it was read from: `org` holds the
# 'SQuAD' questions and `gen` the generated ones.
dicti = {'SQuAD': org, 'Generated Question': gen}

844


In [None]:
# Turn the collected pairs into a DataFrame; the CSV is written only when
# COLAB_LOC is true.
save_df = pd.DataFrame(dicti)
if COLAB_LOC:
  save_df.to_csv(path_or_buf='Model_4_wer_30.csv')

Read the saved per-WER CSV files and sample example rows

In [None]:
SAMPLE_SEED = 42  # fixed so the sampled example rows are identical on every run


def _sample_wer_examples(wer: int, n: int) -> pd.DataFrame:
  """Return ``n`` sampled rows of ``Model_4_wer_<wer>.csv`` tagged with their WER.

  Args:
    wer: Score whose CSV file is read; also the value stored in the ``WER`` column.
    n: Number of rows to sample.

  Returns:
    The sampled rows without the 'Unnamed: 0' column and with a constant
    ``WER`` column appended last.
  """
  return (
      pd.read_csv('Model_4_wer_{}.csv'.format(wer))
      .sample(n=n, random_state=SAMPLE_SEED)
      .drop(columns=['Unnamed: 0'])
      .assign(WER=wer)
  )


# NOTE(review): only Model_4_wer_30.csv is written by the visible cells; the
# WER 0 and WER 8 files presumably come from earlier runs with another target
# value - confirm they exist before running this cell.
df_1 = _sample_wer_examples(wer=0, n=2)
df_2 = _sample_wer_examples(wer=8, n=3)
df_3 = _sample_wer_examples(wer=30, n=1)

In [None]:
# List of the sampled frames to combine, in the order WER 0, 8, 30.
final_df = [df_1, df_2, df_3]

In [None]:
# Concatenate the sampled frames into a single table.
data_df = pd.concat(final_df)

In [None]:
# Render the combined table as LaTeX without the index column. max_colwidth is
# raised to 1000 only inside this block (presumably so long questions are not cut off).
with pd.option_context("max_colwidth", 1000):
  latex_table = data_df.to_latex(index=False)
  print(latex_table)

\begin{tabular}{llr}
\toprule
                                                                                                                                                                                                                             SQuAD &                                                              Generated Question &  WER \\
\midrule
                                                                                                                                                                                        What did Beyoncé announce in January 2010? &                                      What did Beyoncé announce in January 2010? &    0 \\
                                                                                                                                                                                                 When did Chopin return to Warsaw? &                                               When did Chopin return to Warsaw? &    0 \\
    