### Week 1 exercises

In [None]:
from .util import get_pre

(b) For each of the languages Arabic, Bengali and Indonesian, report the 5 most common words in the documents from the training set. Then report the 5 most common words in the questions from the training set. What do you observe?

In [None]:
from collections import Counter

def top_5_words(word_list):
    """Return the five most frequent items of ``word_list`` with their counts.

    The result is a list of ``(item, count)`` tuples ordered from most to
    least frequent. Fewer than five tuples come back when ``word_list``
    holds fewer than five distinct items.
    """
    return Counter(word_list).most_common(5)

In [None]:
# Print the five most common question tokens for each language in turn.
for heading, question_tokens in (
    ('Top 5 Bengali words:', question_text_tokenized_bengali),
    ('Top 5 Arabic words:', question_text_tokenized_arabic),
    ('Top 5 Indonesian words:', question_text_tokenized_indonesian),
):
    print(heading, top_5_words(question_tokens))

(c) Implement an “oracle” function that indicates whether a question is answerable or not given the document and answer. That is, the function will output 1 if the answer to the question appears in the document and 0 otherwise. Then implement a rule-based classifier that predicts whether a question is answerable only using the document and question. Use the oracle function to evaluate it. What is the performance of your classifier on the validation set for each of the languages?

In [None]:
# Oracle function which takes a dataframe and row of a dataframe to check whether the text of the question appears in the document text
def oracle(df, row):
    """
    Report whether a question shares at least one word with its document.

    Both ``df['question_text'][row]`` and ``df['document_plaintext'][row]``
    are split on whitespace; tokens are compared exactly (case-sensitive,
    punctuation kept), so only whole-token matches count.

    NOTE(review): despite its name, this function never looks at the answer.
    It is a word-overlap heuristic over question and document only.

    Parameters
    ----------
    df : object indexable as ``df[column][row]``
        Must provide 'question_text' and 'document_plaintext' string values.
    row : key
        Row key used to look up the question and document.

    Returns
    -------
    int
        1 if any question token also occurs in the document, 0 otherwise
        (including when the question has no tokens).
    """
    question_words = set(df['question_text'][row].split())
    document_words = df['document_plaintext'][row].split()

    # isdisjoint walks the document tokens once with O(1) set lookups and
    # stops at the first shared token, rather than scanning the whole
    # document list again for every question word.
    return int(not question_words.isdisjoint(document_words))

In [None]:
# Store the 0/1 output of oracle() for every row of df_train as a new column.
# Iterating over the index labels directly avoids iterrows(), which builds a
# Series for each row that this loop never used.
answer_classification = [oracle(df_train, index) for index in df_train.index]

df_train['answer_classification'] = answer_classification

In [None]:
# Creating a binary column where if the question is answered it is equal to 1, and if not answerable it is 0
def check_annotations(annotation):
    """Return whether ``annotation`` equals the "no answer" placeholder.

    The placeholder is a dict whose 'answer_start' is ``[-1]`` and whose
    'answer_text' is ``['']``; the result is that equality comparison.
    """
    no_answer = {'answer_start': [-1], 'answer_text': ['']}
    return annotation == no_answer

# check_annotations flags the "no answer" placeholder, so invert it:
# 1 means the row carries an answer, 0 means it does not.
df_train['correct_answer'] = (
    ~df_train['annotations'].apply(check_annotations)
).astype(int)

In [None]:
# Performance metrics function
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def performance_metrics(df):
    """Score the 'answer_classification' column against 'correct_answer'.

    Parameters
    ----------
    df : DataFrame
        Must contain 'correct_answer' (reference labels) and
        'answer_classification' (predicted labels).

    Returns
    -------
    dict
        Keys "Accuracy", "Precision", "Recall" and "F1-score", each mapped
        to the value of the corresponding scikit-learn scoring function.
    """
    labels = df['correct_answer']
    predictions = df['answer_classification']

    # (report name, scoring function) pairs, in the order they are reported.
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-score", f1_score),
    )

    return {name: scorer(labels, predictions) for name, scorer in scorers}

In [None]:
# Per-language subsets of df_train, selected on the 'language' column
df_train_bengali = df_train.loc[df_train['language'].eq('bengali')]
df_train_arabic = df_train.loc[df_train['language'].eq('arabic')]
df_train_indonesian = df_train.loc[df_train['language'].eq('indonesian')]

In [None]:
# Display performance metrics for the full frame and for each language subset.
# NOTE(review): the exercise text asks for validation-set performance, but
# these frames are all derived from df_train. Confirm this is intended.
for heading, frame in (
    ('Overall:', df_train),
    ('Bengali:', df_train_bengali),
    ('Arabic:', df_train_arabic),
    ('Indonesian:', df_train_indonesian),
):
    print(heading, performance_metrics(frame))