<a href="https://colab.research.google.com/github/dragonsan17/faq_retrieval_deep_learning/blob/main/theme_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Theme Classification

Here, we experiment with automatic theme classification, using BERT and TF-IDF-weighted n-gram models.

## Import Libraries

In [None]:
import os
import tensorflow as tf
from getpass import getpass
import urllib
import pandas as pd
import numpy as np
pd.set_option('max_colwidth', 1000)
from IPython.display import display
from collections import Counter
import warnings
warnings.filterwarnings('ignore') 

from transformers import TFBertModel, BertTokenizer, TFAutoModel, AutoTokenizer
import random
from keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import f1_score as f1
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import classification_report as report

## Data Pre-processing

In [None]:
"""
Build the training set for theme classification.

Every distinct query appearing in the training pairs (columns q1 and q2) is
looked up in <data>_all_data.csv by its 'Caller query transcription'. The
first matching row supplies the model input ('Relevant Topic') and the label
('Broad theme'); queries without a matching row are skipped. Training on the
caller query transcription itself is a possible alternative input.
"""
data = 'paj' #mention the dataset on which theme classification is to be performed

df_all_data = pd.read_csv('.data/' + data + '_all_data.csv', encoding = 'utf-8')
df_train = pd.read_csv('.data/' + data + '_train.csv')

# Index the first row seen for every transcription once, instead of running
# two full boolean-mask scans of df_all_data for each query.
train_lookup = {}
for transcription, topic, theme in zip(df_all_data['Caller query transcription'],
                                       df_all_data['Relevant Topic'],
                                       df_all_data['Broad theme']):
    # A missing transcription never compares equal to a query, so it can
    # never be a match; keep it out of the lookup.
    if pd.isna(transcription):
        continue
    train_lookup.setdefault(transcription, (topic, theme))

# De-duplicate the queries; note that set ordering is arbitrary.
x_train_ = list(set(list(df_train['q1']) + list(df_train['q2'])))
x_train = []
y_train_labels = []
for query in x_train_:
    match = train_lookup.get(query)
    if match is not None:
        x_train.append(match[0])
        y_train_labels.append(match[1])

In [None]:
"""
Build the evaluation set from the test split of the chosen dataset (or from a
validation split when tuning hyperparameters).

Queries are matched against 'STT Transcript' because that is what is received
in a real-world scenario. The matched transcript is the model input and the
'Broad theme' of the first matching row is the label. The position (within the
de-duplicated query list) of every query that has no match is printed.
"""
df_val = pd.read_csv('.data/' + data + '_test.csv')

# Index the first row seen for every STT transcript once, instead of running
# two full boolean-mask scans of df_all_data for each query.
stt_lookup = {}
for transcript, theme in zip(df_all_data['STT Transcript'], df_all_data['Broad theme']):
    # A missing transcript never compares equal to a query; skip it.
    if pd.isna(transcript):
        continue
    stt_lookup.setdefault(transcript, (transcript, theme))

x_val_ = list(set(list(df_val['q1'])))
x_val = []
y_val_labels = []
for i, query in enumerate(x_val_):
    match = stt_lookup.get(query)
    if match is not None:
        x_val.append(match[0])
        y_val_labels.append(match[1])
    else:
        # Surface unmatched queries so missing data is noticed.
        print(i)

In [None]:
"""
Data split, if needed for tuning hyperparameters.
"""

# from sklearn.model_selection import train_test_split
# df_train, df_val, _, _ = train_test_split(df_all_data, df_all_data['Broad theme'], test_size=0.3, random_state=42, stratify=df_all_data['Broad theme'])
# df_train = df_train.reset_index()
# df_val = df_val.reset_index()

## N-Gram Models

All the models here consider tf-idf weighted inputs.

In [None]:
from IPython.display import display
from collections import Counter
def computeTF(wordDict, bagOfWords):
    """Return the term frequency of every word in ``wordDict``.

    Args:
        wordDict: mapping of word -> raw occurrence count in one document.
        bagOfWords: sized collection of the document's words; its length is
            used as the normaliser (callers in this file pass the set of
            distinct words of the document).

    Returns:
        dict mapping each word of ``wordDict`` to ``count / len(bagOfWords)``.
        When ``bagOfWords`` is empty every frequency is 0.0 instead of
        raising ZeroDivisionError.
    """
    bagOfWordsCount = len(bagOfWords)
    if bagOfWordsCount == 0:
        # An empty document contains no term at all.
        return dict.fromkeys(wordDict, 0.0)
    return {word: count / float(bagOfWordsCount) for word, count in wordDict.items()}

def computeIDF(unique_words, documents):
    """Return the inverse document frequency of every vocabulary word.

    Args:
        unique_words: iterable of vocabulary words.
        documents: sequence of documents, each an iterable of words. Every
            occurrence yielded by a document is counted, so pass sets of
            distinct words to obtain true document frequencies.

    Returns:
        dict mapping each vocabulary word to ``log(N / df)`` where ``N`` is
        the number of documents and ``df`` the number of counted occurrences.
        Words that occur in no document get 0.0 (instead of raising
        ZeroDivisionError); document words outside the vocabulary are ignored
        (instead of raising KeyError).
    """
    import math
    N = len(documents)

    idfDict = dict.fromkeys(unique_words, 0)
    for document in documents:
        for word in document:
            # Words filtered out of the vocabulary must not break counting.
            if word in idfDict:
                idfDict[word] += 1

    return {
        word: (math.log(N / float(val)) if val else 0.0)
        for word, val in idfDict.items()
    }

def computeTFIDF(tfBagOfWords, idfs):
    """Weight each term frequency by the matching inverse document frequency.

    Args:
        tfBagOfWords: mapping of word -> term frequency for one document.
        idfs: mapping of word -> inverse document frequency; it must contain
            every word of ``tfBagOfWords``.

    Returns:
        dict mapping each word of ``tfBagOfWords`` to ``tf * idf``.
    """
    return {term: weight * idfs[term] for term, weight in tfBagOfWords.items()}
    

# Tokens listed here are removed from the vocabulary before vectorising.
stopwords = []
labels = list(set(df_train['Broad theme']))

# NOTE(review): this cell reads 'Relevant Topic' / 'Broad theme' directly from
# df_train and df_val, so presumably it expects frames that carry those
# columns (e.g. the ones produced by the optional train/validation split of
# df_all_data) — confirm before running.

# ---- Training documents -------------------------------------------------
# Question marks are replaced by spaces so that "word?" and "word" share a token.
documents = [' '.join(text.split('?')) for text in df_train['Relevant Topic']]

train_text = documents
print(len(documents))

# One set of distinct tokens per document, and the vocabulary as their union.
bag_of_words = [set(document.split()) for document in documents]
unique_words = set().union(*bag_of_words)

# set.discard(set(stopwords)) would look for a single frozenset element and
# remove nothing; difference_update removes every stopword.
unique_words.difference_update(stopwords)
print(len(unique_words))

# Raw count of every vocabulary word per document (0 when absent). Tokens
# outside the vocabulary, e.g. stopwords, are ignored.
num_words_all = []
for document in documents:
    token_counts = Counter(document.split())
    num_words_all.append({word: token_counts[word] for word in unique_words})

# Only vocabulary words take part in the document-frequency count.
idf = computeIDF(unique_words, [bag & unique_words for bag in bag_of_words])

all_tfidf = []
for num_words, bag in zip(num_words_all, bag_of_words):
    # Deliberately not named `tf`: that would overwrite the tensorflow module
    # imported as `tf`, which the transformer cells rely on.
    term_freqs = computeTF(num_words, bag)
    tfidf = computeTFIDF(term_freqs, idf)
    all_tfidf.append(tfidf)

df_t = pd.DataFrame(all_tfidf)

# ---- Validation documents -----------------------------------------------
# Same '?' normalisation as for training, so tokens line up with the vocabulary.
documents = [' '.join(text.split('?')) for text in df_val['Relevant Topic']]
# documents = list(df_val['STT Transcript'])
bag_of_words = [set(document.split()) for document in documents]

# Validation tokens that were never seen in training are dropped.
num_words_all = []
for document in documents:
    token_counts = Counter(document.split())
    num_words_all.append({word: token_counts[word] for word in unique_words})

all_tfidf = []
for num_words, bag in zip(num_words_all, bag_of_words):
    term_freqs = computeTF(num_words, bag)
    tfidf = computeTFIDF(term_freqs, idf)
    all_tfidf.append(tfidf)

df_v = pd.DataFrame(all_tfidf)

y_train = df_train['Broad theme']
y_val = df_val['Broad theme']

In [None]:
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Candidate classifiers; choose one of them via `model` below.
model_rf = RandomForestClassifier(
    n_estimators = 1000,
    min_samples_split = 2,
    class_weight = 'balanced',
    n_jobs = 10,
)
model_dt = DecisionTreeClassifier(class_weight = 'balanced')
model_gb = GradientBoostingClassifier(n_estimators = 200, max_depth = 9, min_samples_split = 3)
model_lr = LogisticRegression(C = 3, penalty = 'l2', class_weight = 'balanced', max_iter = 1000)
model_svc = LinearSVC(C = 2)

model = model_rf #specify the model here

# Convert the TF-IDF frames to plain arrays once and reuse them.
train_features = np.array(df_t)
val_features = np.array(df_v)

model.fit(train_features, y_train)
y_train_p = model.predict(train_features)
y_val_p = model.predict(val_features)

# Per-class report, then overall accuracy, then macro-averaged F1.
print(report(y_val, y_val_p))
print(acc(y_val, y_val_p))
print(f1(y_val, y_val_p, average = 'macro'))

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
# Fix the theme order once so the matrix rows, columns and tick labels agree.
theme_order = list(df_train['Broad theme'].unique())

array = confusion_matrix(y_val, y_val_p, labels = theme_order)
df_cm = pd.DataFrame(array, index = theme_order, columns = theme_order)

# Annotated heatmap of the validation confusion matrix.
plt.figure(figsize = (10,7))
sn.heatmap(df_cm, annot=True)

## Transformer Model

In [None]:
# Re-import so that `tf` is guaranteed to be the TensorFlow module here, even
# if an earlier cell rebound this short name to something else.
import tensorflow as tf

# Map every broad theme to an integer class id, in order of first appearance.
all_themes_ = list(df_all_data['Broad theme'].unique())
all_themes = {theme: class_id for class_id, theme in enumerate(all_themes_)}

# One-hot targets for the transformer model (one column per theme).
y_train_ = [all_themes[label] for label in y_train_labels]
y_train = tf.one_hot(y_train_, len(all_themes))

y_val_ = [all_themes[label] for label in y_val_labels]
y_val = tf.one_hot(y_val_, len(all_themes))

def preprocess_text(tokenizer, ques, max_len=None):
    """Turn raw texts into fixed-length BERT input ids and attention masks.

    Each text is tokenized, wrapped as ``[CLS] ... [SEP]`` and right-padded
    with zeros. Texts that are too long are truncated *before* the special
    tokens are added, so the closing ``[SEP]`` is always kept (truncating the
    finished sequence would cut it off).

    Args:
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        ques: sequence of input strings.
        max_len: length of every output row; defaults to the module-level
            ``MAX_LEN`` when omitted.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of integer arrays with shape
        ``(len(ques), max_len)``; mask entries are 1 for real tokens and 0 for
        padding.
    """
    if max_len is None:
        max_len = MAX_LEN

    # Room left for the text itself once [CLS] and [SEP] are accounted for.
    body_budget = max(max_len - 2, 0)

    input_ids = np.zeros((len(ques), max_len), dtype="long")
    attention_masks = np.zeros((len(ques), max_len), dtype="long")

    for row, text in enumerate(ques):
        tokens = ['[CLS]'] + tokenizer.tokenize(text)[:body_budget] + ['[SEP]']
        # The final slice only matters for degenerate max_len values below 2.
        ids = tokenizer.convert_tokens_to_ids(tokens)[:max_len]
        input_ids[row, :len(ids)] = ids
        attention_masks[row, :len(ids)] = 1

    return input_ids, attention_masks

In [None]:
"""
  BERT
"""

def build_model():
    """Build and compile the BERT theme classifier and load its tokenizer.

    Reads ``MAX_LEN``, ``LR`` and ``all_themes`` from module scope. The
    pooled output of multilingual cased BERT goes through dropout and a dense
    softmax head with one unit per theme.

    Returns:
        Tuple ``(model, tokenizer)``: the compiled Keras model, which takes
        ``[input_ids, attention_masks]``, and the matching ``BertTokenizer``.
    """
    checkpoint = 'bert-base-multilingual-cased'

    token_ids_in = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    mask_in = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)

    encoder = TFBertModel.from_pretrained(pretrained_model_name_or_path = checkpoint, return_dict=True)
    pooled = encoder(token_ids_in, attention_mask=mask_in).pooler_output

    # Classification head: dropout -> linear projection -> softmax.
    head = (
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(len(all_themes)),
        tf.keras.layers.Activation('softmax'),
    )
    probabilities = pooled
    for layer in head:
        probabilities = layer(probabilities)

    classifier = tf.keras.models.Model(inputs=[token_ids_in, mask_in], outputs=[probabilities])
    classifier.compile(
        loss='categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate= LR),
        metrics=['acc'],
    )
    return classifier, BertTokenizer.from_pretrained(checkpoint)

In [None]:
# Hyperparameters used for BERT on the PAJ dataset.
MAX_LEN = 512
LR = 2e-5
EPOCH_NUM = 15
BATCH_SIZE = 2

# Hyperparameters used for BERT on the JEE dataset.
# MAX_LEN = 256
# LR = 6e-6
# EPOCH_NUM = 10
# BATCH_SIZE = 4

model, tokenizer = build_model()

# Tokenize and pad the training and validation texts to MAX_LEN.
train_input_ids, train_attention_masks = preprocess_text(tokenizer, x_train)
val_input_ids, val_attention_masks = preprocess_text(tokenizer, x_val)

train_inputs = [train_input_ids, train_attention_masks]
val_inputs = [val_input_ids, val_attention_masks]

# Fine-tune, reporting validation metrics after every epoch.
model.fit(
    train_inputs,
    y_train,
    epochs=EPOCH_NUM,
    batch_size=BATCH_SIZE,
    verbose=1,
    validation_data=(val_inputs, y_val),
)

# Per-theme scores for every validation query; used when saving the filtered test set.
y_pred = model.predict(val_inputs)

## Saving Test Data

We modify the test set as follows: we take the top 3 themes predicted for the test query q1 and keep only the pairs (q1, q2) in which q2's theme is one of those top 3 predicted themes. We then test on this modified test set, which is equivalent to filtering the candidate queries by theme before testing.

In [None]:
# Keep only the test pairs (q1, q2) in which the theme of q2 is among the
# top-scoring themes predicted for q1, then write them to
# <data>_test_themes.csv.
#
# NOTE(review): the original note here said to use y_val_p instead of y_pred
# for the N-gram models. y_val_p holds one predicted label per validation row
# rather than per-theme scores, so the argsort-based top-k selection below
# would need adapting first — confirm before switching.

all_themes_ = list(df_all_data['Broad theme'].unique())
all_themes = {theme: class_id for class_id, theme in enumerate(all_themes_)}

# Number of highest-scoring themes that count as a match.
TOP_K = 3

# First position of every validation query in x_val, i.e. its row in y_pred
# (same result as x_val.index(q), without a linear search per row).
val_position = {}
for position, text in enumerate(x_val):
    val_position.setdefault(text, position)

# Theme of the first row seen for every caller query transcription.
candidate_theme = {}
for transcription, theme in zip(df_all_data['Caller query transcription'],
                                df_all_data['Broad theme']):
    # A missing transcription can never equal a q2 value; skip it.
    if pd.isna(transcription):
        continue
    candidate_theme.setdefault(transcription, theme)

q1 = []
q2 = []
for _, r in df_val.iterrows():
    position = val_position.get(r['q1'])
    if position is None:
        # No prediction is available for this query.
        continue
    if r['q2'] not in candidate_theme:
        # The theme of the candidate is unknown.
        continue

    # Slicing the sorted indices also works when there are fewer than TOP_K
    # themes, where indexing preds[-3] would raise IndexError.
    top_themes = {all_themes_[k] for k in np.argsort(y_pred[position])[-TOP_K:]}
    if candidate_theme[r['q2']] in top_themes:
        q1.append(r['q1'])
        q2.append(r['q2'])

df = pd.DataFrame({'q1' : q1, 'q2' : q2})
df.to_csv(data + '_test_themes.csv', index = False)