In [2]:
%%capture
!pip3 install datasets
!pip3 install googletrans==3.1.0a0
!pip3 install torch
!pip3 install tabulate

In [3]:
import pandas as pd
import copy
import numpy as np
import torch
import nltk
import string

nltk.download('punkt')
from datasets import load_dataset
from tabulate import tabulate
from googletrans import Translator

# stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/marcusthomsen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marcusthomsen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Week 36 (5-11 September)

## (a)

### Loading dataset

In [4]:
languages = ['indonesian', 'bengali', 'arabic']
%run dict_maker.py
train_set_dict = train_set_dict
val_set_dict = val_set_dict

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/marcusthomsen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marcusthomsen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Import Success
Adding  indonesian to dict
Adding  bengali to dict
Adding  arabic to dict

Tokenizing question, answer and document text for indonesian
Done


Tokenizing question, answer and document text for bengali
Done


Tokenizing question, answer and document text for arabic
Done

Removing stopwords


Map: 100%|██████████| 29598/29598 [01:30<00:00, 325.36 examples/s]


### Aux functions

In [7]:
#Creating BOW
def unique_words(data):
  word_to_ix = {}
  for sent in data:
      for word in sent:
          if word not in word_to_ix:
              word_to_ix[word] = len(word_to_ix)
  return word_to_ix

## Basic statistics: no. of questions avg words per question and no. of answerable questions

In [8]:
table_data = []
for language in languages:
    train_set = train_set_dict[language]
    val_set = val_set_dict[language]

    train_questions = train_set['question_words']
    val_questions = val_set['question_words']

    stop_words = set(stopwords.words(language.lower()))

    train_word_counts = len(train_questions)
    val_word_counts = len(val_questions)
    train_answerable_ratio = np.mean([len(x['annotations']['answer_text'][0]) > 0 for x in train_set])
    val_answerable_ratio = np.mean([len(nltk.word_tokenize(x['annotations']['answer_text'][0])) > 0 for x in val_set])

    flat_train_q = [item for sublist in train_questions for item in sublist]
    flat_val_q = [item for sublist in val_questions for item in sublist]

    # Calculate the total length of inner lists
    train_avg_words_per_question = sum(len(inner_list) for inner_list in train_questions) / len(train_questions)
    val_avg_words_per_question = sum(len(inner_list) for inner_list in val_questions) / len(val_questions)

    train_total_words = train_word_counts * train_avg_words_per_question
    val_total_words = val_word_counts * val_avg_words_per_question

    train_unique_words = len(unique_words(train_questions))
    val_unique_words = len(unique_words(val_questions))

    table_data.append([
        language,
        "Train Set",
        len(train_questions),
        np.round(train_avg_words_per_question, 2),
        np.round(train_answerable_ratio, 2),
        np.round(train_total_words, 2),
        np.round(train_unique_words, 2)
    ])
    table_data.append([
        "",
        "Validation Set",
        len(val_questions),
        np.round(val_avg_words_per_question, 2),
        np.round(val_answerable_ratio, 2),
        np.round(val_total_words, 2),
        np.round(val_unique_words, 2)
    ])

headers = ["Language", "Dataset", "No. of Questions", "Avg. Words per Question", "Answerable Ratio", "No. of Words in Questions", "No. of Unique Words"]

table = tabulate(table_data, headers, tablefmt="latex_raw")
print(table)

\begin{tabular}{llrrrrr}
\hline
 Language   & Dataset        &   No. of Questions &   Avg. Words per Question &   Answerable Ratio &   No. of Words in Questions &   No. of Unique Words \\
\hline
 indonesian & Train Set      &              11394 &                      3.37 &                0.5 &                       38374 &                  5558 \\
            & Validation Set &               1191 &                      3.62 &                0.5 &                        4306 &                  1285 \\
 bengali    & Train Set      &               4779 &                      7.04 &                0.5 &                       33623 &                  3744 \\
            & Validation Set &                224 &                      7.43 &                0.5 &                        1664 &                   431 \\
 arabic     & Train Set      &              29598 &                      4.12 &                0.5 &                      121969 &                 16183 \\
            & Validation 

## (b)

In [10]:
def get_top_words(countries: list):
    combined_data = {'Language': [], 'Type': [], 'Top Word (English)': [], 'Count': []}
    for country in countries:
        nltk.download('stopwords')
        stop_words = set(stopwords.words(country.lower()))

        # we decided to remove additional characters that were top words
        # for the countries because they are not particulary interesting
        additional_characters = list(string.punctuation) + ["``", "''", "؟", ","]

        doc_words = [word for sublist in train_set_dict[country]['doc_text_words'] for word in sublist if word not in stop_words and word not in additional_characters]
        question_words = [word for sublist in train_set_dict[country]['question_words'] for word in sublist if word not in stop_words and word not in additional_characters]

        top_words_doc = pd.Series(doc_words).value_counts()[:5]
        top_words_q = pd.Series(question_words).value_counts()[:5]

        translator = Translator()

        translated_words_doc = [translator.translate(word, src=country, dest='en').text for word in top_words_doc.index]
        translated_words_q = [translator.translate(word, src=country, dest='en').text for word in top_words_q.index]

        for word, count in zip(translated_words_doc, top_words_doc.values):
            combined_data['Language'].append(country.capitalize())
            combined_data['Type'].append('doc')
            combined_data['Top Word (English)'].append(word)
            combined_data['Count'].append(count)

        for word, count in zip(translated_words_q, top_words_q.values):
            combined_data['Language'].append(country.capitalize())
            combined_data['Type'].append('question')
            combined_data['Top Word (English)'].append(word)
            combined_data['Count'].append(count)

    combined_df = pd.DataFrame(combined_data)
    print(combined_df)
countries = ['arabic', 'bengali', 'indonesian']

table = get_top_words(countries)

print(tabulate(table, headers='keys', tablefmt='pretty'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marcusthomsen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marcusthomsen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marcusthomsen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


      Language      Type Top Word (English)  Count
0       Arabic       doc     classification  15655
1       Arabic       doc            general  13640
2       Arabic       doc                  1   8364
3       Arabic       doc                son   6161
4       Arabic       doc                  a   5842
5       Arabic  question              child   1564
6       Arabic  question            Located   1540
7       Arabic  question   It was completed   1334
8       Arabic  question            general   1159
9       Arabic  question               city   1002
10     Bengali       doc                 is   2916
11     Bengali       doc                 in   2365
12     Bengali       doc                 do   2355
13     Bengali       doc                 as   1334
14     Bengali       doc           by doing   1293
15     Bengali  question               name    837
16     Bengali  question                 in    522
17     Bengali  question              where    300
18     Bengali  question       

## (C)

In [11]:
first_column = [item['answer_text'][0] for item in train_set_dict['indonesian']['annotations']]


In [14]:
def oracle(data, language):
  labels = []
  for elem in data[language]:
      if elem['annotations']['answer_text'][0] == '':
          labels.append(0)
      else:
          labels.append(1)
  return labels

In [15]:
# Function that returns 1 if more than or equal to n words from questions are in plaintext
def is_answerable(question_words, doc_words, n):
    common_words = [word for word in question_words if word in doc_words]
    return len(common_words) >= n

In [16]:
new_data = {}
def rule_based_classifier(data, country, n):
    new_data = copy.deepcopy(data[country]) 

    answerable_values = []

    for i in range(0,data[country].num_rows):
        q = set(new_data['question_words'][i])
        doc = new_data['doc_text_words'][i]
        answerable = 0  

        answerable = is_answerable(q, doc, n)

        answerable_values.append(answerable)
    new_data
    #new_data['answerable'] = answerable_values

    return answerable_values

In [19]:
# Accuracy of our classifier for indonesian
indo_classifier_answers = rule_based_classifier(val_set_dict, 'indonesian', 2)
np.round((np.array(indo_classifier_answers) == np.array(oracle(val_set_dict, "indonesian"))).mean(), 2)

0.69

In [20]:
# Accuracy of our classifier for bengali
beng_classifier_answers = rule_based_classifier(val_set_dict, 'bengali', 2)
np.round((np.array(beng_classifier_answers) == np.array(oracle(val_set_dict, 'bengali'))).mean(), 2)

0.73

In [21]:
# Accuracy of our classifier for arabic
arab_classifier_answers = rule_based_classifier(val_set_dict, 'arabic', 2)
np.round((np.array(arab_classifier_answers) == np.array(oracle(val_set_dict, 'arabic'))).mean(), 1)

0.7