In [1]:
from datasets import load_dataset
import pandas as pd

dataset = load_dataset("copenlu/answerable_tydiqa")

train_set = dataset["train"]
validation_set = dataset["validation"]

df_train = train_set.to_pandas()
df_val = validation_set.to_pandas()

df_train.head()

Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url
0,Milloin Charles Fort syntyi?,Charles Fort,finnish,"{'answer_start': [18], 'answer_text': ['6. elo...",Charles Hoy Fort (6. elokuuta (joidenkin lähte...,https://fi.wikipedia.org/wiki/Charles%20Fort
1,“ダン” ダニエル・ジャドソン・キャラハンの出身はどこ,ダニエル・J・キャラハン,japanese,"{'answer_start': [35], 'answer_text': ['カリフォルニ...",“ダン”こと、ダニエル・ジャドソン・キャラハンは1890年7月26日、カリフォルニア州サンフ...,https://ja.wikipedia.org/wiki/%E3%83%80%E3%83%...
2,వేప చెట్టు యొక్క శాస్త్రీయ నామం ఏమిటి?,వేప,telugu,"{'answer_start': [12], 'answer_text': ['Azadir...","వేప (లాటిన్ Azadirachta indica, syn. Melia aza...",https://te.wikipedia.org/wiki/%E0%B0%B5%E0%B1%...
3,চেঙ্গিস খান কোন বংশের রাজা ছিলেন ?,চেঙ্গিজ খান,bengali,"{'answer_start': [414], 'answer_text': ['বোরজি...",চেঙ্গিজ খান (মঙ্গোলীয়: Чингис Хаан আ-ধ্ব-ব: ...,https://bn.wikipedia.org/wiki/%E0%A6%9A%E0%A7%...
4,రెయ్యలగడ్ద గ్రామ విస్తీర్ణత ఎంత?,రెయ్యలగడ్ద,telugu,"{'answer_start': [259], 'answer_text': ['27 హె...","రెయ్యలగడ్ద, విశాఖపట్నం జిల్లా, గంగరాజు మాడుగుల...",https://te.wikipedia.org/wiki/%E0%B0%B0%E0%B1%...


In [2]:

# Filter the dataframe to get only the rows where the language is 'bengali', 'arab', or 'indonesian', then concatenate these dataframes and display the last five rows
df_train_bengali = df_train[df_train['language'] == 'bengali']
df_train_arab = df_train[df_train['language'] == 'arab']
df_train_indonesian = df_train[df_train['language'] == 'indonesian']
df_train_new = pd.concat([df_train_bengali, df_train_arab, df_train_indonesian])
df_train_new.head()

# Apply the same filtering and concatenation to the validation dataframe
df_val_bengali = df_val[df_val['language'] == 'bengali']
df_val_arab = df_val[df_val['language'] == 'arab']
df_val_indonesian = df_val[df_val['language'] == 'indonesian']
df_val_new = pd.concat([df_val_bengali, df_val_arab, df_val_indonesian])
# df_val_new.tail()


df_train_new.head()




Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url
3,চেঙ্গিস খান কোন বংশের রাজা ছিলেন ?,চেঙ্গিজ খান,bengali,"{'answer_start': [414], 'answer_text': ['বোরজি...",চেঙ্গিজ খান (মঙ্গোলীয়: Чингис Хаан আ-ধ্ব-ব: ...,https://bn.wikipedia.org/wiki/%E0%A6%9A%E0%A7%...
22,অপারেশন জ্যাকপটের কর্নেল কে ছিলেন ?,অপারেশন জ্যাকপট,bengali,"{'answer_start': [153], 'answer_text': ['ওসমান...",তারপর উক্ত ৮জনের সাথে আরো কয়েকজনকে একত্র করে ...,https://bn.wikipedia.org/wiki/%E0%A6%85%E0%A6%...
31,কত সালে সর্বভারতীয় তৃণমূল কংগ্রেসের প্রতিষ্ঠা...,সর্বভারতীয় তৃণমূল কংগ্রেস,bengali,"{'answer_start': [114], 'answer_text': ['১৯৯৮']}",সর্বভারতীয় তৃণমূল কংগ্রেস (সংক্ষেপে তৃণমূল কং...,https://bn.wikipedia.org/wiki/%E0%A6%B8%E0%A6%...
115,কত সালে কম্পিউটার আবিষ্কার করা হয় ?,কম্পিউটার,bengali,"{'answer_start': [365], 'answer_text': ['খ্রিষ...",প্রাগৈতিহাসিক যুগে গণনার যন্ত্র উদ্ভাবিত বিভিন...,https://bn.wikipedia.org/wiki/%E0%A6%95%E0%A6%...
126,মোহনদাস করমচাঁদ গান্ধী বা মহাত্মা গান্ধীর জন্ম...,মোহনদাস করমচাঁদ গান্ধী,bengali,"{'answer_start': [33], 'answer_text': ['পোরবন্...",মোহনদাস করমচাঁদ গান্ধী ১৮৬৯ সালে পোরবন্দরের হি...,https://bn.wikipedia.org/wiki/%E0%A6%AE%E0%A7%...


In [3]:
# Add a new column to the dataframe that indicates whether the question is answerable or not
df_train_new['is_answerable'] = df_train_new['annotations'].apply(lambda x: 0 if x['answer_start'] == [-1] else 1)
df_val_new['is_answerable'] = df_val_new['annotations'].apply(lambda x: 0 if x['answer_start'] == [-1] else 1)

def oracle(document_plaintext, question_text):
    row = df_val_new[(df_val_new['document_plaintext'] == document_plaintext) & (df_val_new['question_text'] == question_text)]
    if row['is_answerable'].values[0] == 1:
        return 1
    else:
        return 0

# Test the oracle function with an example
# print(oracle(df_val_new['document_plaintext'].iloc[0], df_val_new['question_text'].iloc[0]))
# print(oracle(বিষয়শ্রেণী:চেঙ্গিজ খান বিষয়শ্রেণী:১১৬২-এ জন্ম বিষয়শ্রেণী:১২২৭-এ মৃত্যু বিষয়শ্রেণী:মঙ্গোলিয়া বিষয়শ্রেণী:বোরজিগিন বিষয়শ্রেণী:মঙ্গোল সাম্রাজ্যের খাগান, s))


# Find the first instance in the training set that is not answerable
first_unanswerable = df_train_new[df_train_new['is_answerable'] == 0].iloc[0]
# print(first_unanswerable['document_plaintext'])
print(first_unanswerable['question_text'])



চেঙ্গিস খান কোন বংশের রাজা ছিলেন ?


In [4]:
from sklearn.metrics import confusion_matrix, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_train_new['document_plaintext'], df_train_new['is_answerable'], test_size=0.2, random_state=42)

# Initialize the CountVectorizer
count_vect = CountVectorizer()

# Fit and transform the training data and transform the testing data
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)

# Initialize the Multinomial Naive Bayes classifier
clf_nb = MultinomialNB()

# Fit the Naive Bayes classifier with the training data
clf_nb.fit(X_train_counts, y_train)

# Predict the testing data using Naive Bayes classifier
y_pred_nb = clf_nb.predict(X_test_counts)

# Generate the confusion matrix for Naive Bayes classifier
cm_nb = confusion_matrix(y_test, y_pred_nb)

# Print the confusion matrix for Naive Bayes classifier
print("Confusion Matrix (Naive Bayes):")
print(cm_nb)

# Print the accuracy score for Naive Bayes classifier
print("Accuracy (Naive Bayes):", metrics.accuracy_score(y_test, y_pred_nb))
# Print the recall score for Naive Bayes classifier
print("Recall (Naive Bayes):", recall_score(y_test, y_pred_nb))
# Print the f1 score for Naive Bayes classifier
print("F1 Score (Naive Bayes):", f1_score(y_test, y_pred_nb))

# Initialize the Logistic Regression classifier
clf_lr = LogisticRegression()

# Fit the Logistic Regression classifier with the training data
clf_lr.fit(X_train_counts, y_train)

# Predict the testing data using Logistic Regression classifier
y_pred_lr = clf_lr.predict(X_test_counts)

# Generate the confusion matrix for Logistic Regression classifier
cm_lr = confusion_matrix(y_test, y_pred_lr)

# Print the confusion matrix for Logistic Regression classifier
print("Confusion Matrix (Logistic Regression):")
print(cm_lr)

# Print the accuracy score for Logistic Regression classifier
print("Accuracy (Logistic Regression):", metrics.accuracy_score(y_test, y_pred_lr))
# Print the recall score for Logistic Regression classifier
print("Recall (Logistic Regression):", recall_score(y_test, y_pred_lr))
# Print the f1 score for Logistic Regression classifier
print("F1 Score (Logistic Regression):", f1_score(y_test, y_pred_lr))

# Initialize the Random Forest classifier
clf_rfc = RandomForestClassifier()

# Fit the Random Forest classifier with the training data
clf_rfc.fit(X_train_counts, y_train)

# Predict the testing data using Random Forest classifier
y_pred_rfc = clf_rfc.predict(X_test_counts)

# Generate the confusion matrix for Random Forest classifier
cm_rfc = confusion_matrix(y_test, y_pred_rfc)

# Print the confusion matrix for Random Forest classifier
print("Confusion Matrix (Random Forest):")
print(cm_rfc)

# Print the accuracy score for Random Forest classifier
print("Accuracy (Random Forest):", metrics.accuracy_score(y_test, y_pred_rfc))
# Print the recall score for Random Forest classifier
print("Recall (Random Forest):", recall_score(y_test, y_pred_rfc))
# Print the f1 score for Random Forest classifier
print("F1 Score (Random Forest):", f1_score(y_test, y_pred_rfc))


Confusion Matrix (Naive Bayes):
[[ 759  842]
 [ 314 1320]]
Accuracy (Naive Bayes): 0.6426584234930448
Recall (Naive Bayes): 0.8078335373317014
F1 Score (Naive Bayes): 0.6954689146469969


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Confusion Matrix (Logistic Regression):
[[1182  419]
 [ 462 1172]]
Accuracy (Logistic Regression): 0.7276661514683153
Recall (Logistic Regression): 0.7172582619339045
F1 Score (Logistic Regression): 0.7268217054263566
Confusion Matrix (Random Forest):
[[1231  370]
 [ 424 1210]]
Accuracy (Random Forest): 0.7545595054095827
Recall (Random Forest): 0.7405140758873929
F1 Score (Random Forest): 0.7529558182949595
