# Question Classification

In [70]:
# imports
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import spacy
from scipy.sparse import hstack

In [69]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# English stop words as a set, so membership tests while filtering are cheap.
stop_words = set(stopwords.words('english'))
# Tokens are runs of word characters; punctuation never ends up in a token.
tokenizer = RegexpTokenizer(r'\w+')

# Small English spaCy pipeline; its entity recogniser is used by add_ner_pre.
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/markusmuller/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Main question categories that are kept.
# The position of a name in this list is its integer code.
cat_target_names = [
    'HUM',   # human beings
    'LOC',   # locations
    'NUM',   # numeric values
    'ENTY',  # entities
]

# category name -> integer code, derived from the list order
cat_map_dict = {name: code for code, name in enumerate(cat_target_names)}

# "CATEGORY:subtype" labels that are kept.
# The position of a label in this list is its integer code.
sub_target_names = [
    'HUM:ind',      # an individual
    'HUM:gr',       # a group or organization of persons
    'LOC:other',    # other location
    'LOC:country',  # countries
    'LOC:city',     # cities
    'NUM:count',    # number of something
    'NUM:date',     # dates
    'ENTY:other',   # other entities
    'ENTY:cremat',  # creative works (e.g. films, books)
    'ENTY:animal',  # animals
    'ENTY:food',    # food
]

# sub-category label -> integer code, derived from the list order
sub_cat_map_dict = {name: code for code, name in enumerate(sub_target_names)}

In [12]:
# Relative paths to the labelled question files.
# process_web_data reads each line as "<CATEGORY:subtype> <question text>".
DATA_PATH_WEB_TRAIN = "Data/question_clf_train_set.txt"
DATA_PATH_WEB_TEST = "Data/question_clf_test_set.txt"

## Preprocessing

In [13]:
def process_web_data(data_path):
    """Read a labelled question file into a DataFrame.

    Every usable line has the form ``"<CAT>:<sub> <question>"``: the label is
    everything before the first space, the question everything after it.
    Lines without a space (blank or label-only lines) are skipped, and only
    lines whose main category is listed in ``cat_target_names`` are kept.

    Parameters
    ----------
    data_path : str or path-like
        File to read; it is decoded as ISO-8859-1.

    Returns
    -------
    pandas.DataFrame
        One row per kept question with the columns

        * ``sub_cat`` - full label, e.g. ``"HUM:ind"``
        * ``question`` - question text, stripped of surrounding whitespace
        * ``cat`` - main category, the part of ``sub_cat`` before ``":"``
        * ``cat_encoded`` - ``cat`` mapped through ``cat_map_dict``
        * ``sub_cat_encoded`` - ``sub_cat`` mapped through
          ``sub_cat_map_dict``; labels missing from that dict get ``99``
    """
    # code given to sub-categories that are not listed in sub_cat_map_dict
    unknown_sub_cat_code = 99

    question_cat_list = []
    question_list = []

    with open(data_path, 'r', encoding="ISO-8859-1") as f:
        for line in f:
            q_cat, separator, q = line.partition(" ")
            if not separator:
                # no space on the line: there is no question to split off
                continue
            # keep the line only if its main category is one we care about
            if q_cat.split(":")[0] in cat_target_names:
                question_cat_list.append(q_cat)
                question_list.append(q.strip())

    df = pd.DataFrame({"sub_cat": question_cat_list, "question": question_list})

    # main category = text before the first ":" (.str[0] also copes with an
    # empty frame, where expand=True would yield no column to select)
    df["cat"] = df["sub_cat"].str.split(":", n=1).str[0]

    # dictionary lookup per value; every kept category is expected to be a
    # key of cat_map_dict
    df["cat_encoded"] = df["cat"].map(cat_map_dict)

    # labels missing from sub_cat_map_dict come back as NaN from map() and
    # are turned into the placeholder code
    df["sub_cat_encoded"] = (
        df["sub_cat"]
        .map(sub_cat_map_dict)
        .fillna(unknown_sub_cat_code)
        .astype("int64")
    )

    return df

In [14]:
# Parse the raw training and test files; each result has one row per kept question.
df_data_web_train = process_web_data(DATA_PATH_WEB_TRAIN)
df_data_web_test = process_web_data(DATA_PATH_WEB_TEST)

In [15]:
# Inspect the parsed training frame; sub_cat_encoded == 99 marks a
# sub-category that is not listed in sub_cat_map_dict.
df_data_web_train

Unnamed: 0,sub_cat,question,cat,cat_encoded,sub_cat_encoded
0,ENTY:cremat,What films featured the character Popeye Doyle ?,ENTY,3,8
1,ENTY:animal,What fowl grabs the spotlight after the Chines...,ENTY,3,9
2,HUM:ind,What contemptible scoundrel stole the cork fro...,HUM,0,0
3,HUM:gr,What team did baseball 's St. Louis Browns bec...,HUM,0,1
4,HUM:title,What is the oldest profession ?,HUM,0,99
...,...,...,...,...,...
4199,ENTY:other,What 's the shape of a camel 's spine ?,ENTY,3,7
4200,ENTY:currency,What type of currency is used in China ?,ENTY,3,99
4201,NUM:temp,What is the temperature today ?,NUM,2,99
4202,NUM:temp,What is the temperature for cooking ?,NUM,2,99


In [16]:
# Keep only questions with a known sub-category, i.e. drop every row that
# process_web_data tagged with the 99 placeholder code.
df_train = df_data_web_train.loc[df_data_web_train["sub_cat_encoded"].ne(99)]
df_test = df_data_web_test.loc[df_data_web_test["sub_cat_encoded"].ne(99)]

In [17]:
# Row count, dtypes and null counts of the filtered training frame
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3119 entries, 0 to 4199
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   sub_cat          3119 non-null   object
 1   question         3119 non-null   object
 2   cat              3119 non-null   object
 3   cat_encoded      3119 non-null   int64 
 4   sub_cat_encoded  3119 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 146.2+ KB


In [18]:
# Row count, dtypes and null counts of the filtered test frame
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 220 entries, 1 to 351
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   sub_cat          220 non-null    object
 1   question         220 non-null    object
 2   cat              220 non-null    object
 3   cat_encoded      220 non-null    int64 
 4   sub_cat_encoded  220 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 10.3+ KB


In [19]:
# Number of training questions per main category
df_train["cat"].value_counts()

HUM     1151
LOC      748
ENTY     639
NUM      581
Name: cat, dtype: int64

In [20]:
# Number of test questions per main category
df_test["cat"].value_counts()

LOC     71
HUM     61
NUM     56
ENTY    32
Name: cat, dtype: int64

In [21]:
# Pool the filtered train and test questions into one frame for cross-validation.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it both
# inputs keep their own row labels, and any label that occurs in both frames
# appears twice in df_all.
frames = [df_train, df_test]
df_all = pd.concat(frames, ignore_index=True)

In [22]:
# Share of each main category in the pooled data (fractions sum to 1)
df_all["cat"].value_counts(normalize=True)

HUM     0.362983
LOC     0.245283
ENTY    0.200958
NUM     0.190776
Name: cat, dtype: float64

## Prepare Data for Training

In [80]:
# Baseline features: bag-of-words counts of the raw question text.
def simple_pre(df, target_col="sub_cat_encoded"):
    """Build bag-of-words features and labels from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``question`` column and the column named by ``target_col``.
    target_col : str, default "sub_cat_encoded"
        Column that supplies the labels. The default keeps the original
        behaviour (sub-category codes); pass e.g. ``"cat_encoded"`` to get
        main-category labels instead.

    Returns
    -------
    tuple
        ``(bow_vec, y)``: the document-term count matrix produced by
        ``CountVectorizer.fit_transform`` and the label values as an array.

    Notes
    -----
    The vocabulary is fitted on every row of ``df``. When the result is split
    into folds afterwards, held-out rows have therefore already contributed
    to the vocabulary.
    """
    question_list = df["question"].values
    y = df[target_col].values

    vectorizer = CountVectorizer()
    bow_vec = vectorizer.fit_transform(question_list)

    return bow_vec, y

# Bag-of-words features after removing stop words from each question.
def remove_stopwords_pre(df, target_col="sub_cat_encoded"):
    """Build bag-of-words features from stop-word-filtered questions.

    Each question is tokenized with the module-level ``tokenizer``, tokens
    whose lower-cased form is in ``stop_words`` are dropped, and the remaining
    tokens are joined with single spaces before counting. Only stop words are
    filtered; nothing in this function removes numeric tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``question`` column and the column named by ``target_col``.
    target_col : str, default "sub_cat_encoded"
        Column that supplies the labels. The default keeps the original
        behaviour (sub-category codes).

    Returns
    -------
    tuple
        ``(bow_vec, y)``: the document-term count matrix produced by
        ``CountVectorizer.fit_transform`` and the label values as an array.

    Notes
    -----
    The vocabulary is fitted on every row of ``df``, including rows that a
    later cross-validation split holds out.

    NOTE(review): NLTK's English stop-word list is believed to contain
    interrogatives such as "what", "who" and "where"; if so, this filter
    discards strong cues for question classification. Confirm against
    ``stop_words``.
    """
    question_list_pre = []
    for q in df["question"].values:
        kept_tokens = [w for w in tokenizer.tokenize(q) if w.lower() not in stop_words]
        question_list_pre.append(' '.join(kept_tokens))

    y = df[target_col].values

    vectorizer = CountVectorizer()
    bow_vec = vectorizer.fit_transform(question_list_pre)

    return bow_vec, y

# Bag-of-words features extended with named-entity label indicators.
def add_ner_pre(df, target_col="sub_cat_encoded"):
    """Build bag-of-words features plus one indicator column per NER label.

    Every question is run through the module-level spaCy pipeline ``nlp``.
    For each entity label the pipeline can emit (``nlp.pipe_labels['ner']``)
    there is one extra column, set to 1 when at least one entity with that
    label was found in the question and 0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``question`` column and the column named by ``target_col``.
    target_col : str, default "sub_cat_encoded"
        Column that supplies the labels. The default keeps the original
        behaviour (sub-category codes).

    Returns
    -------
    tuple
        ``(X, y)``: a dense 2-D array holding the bag-of-words counts followed
        by the NER indicator columns, and the label values as an array.

    Notes
    -----
    The vocabulary is fitted on every row of ``df``, including rows that a
    later cross-validation split holds out.
    """
    # labels the NER component can produce; list position = indicator column
    ner_list = list(nlp.pipe_labels['ner'])
    ner_column = {label: col for col, label in enumerate(ner_list)}

    question_list = df["question"].values

    ner_encoded = np.zeros(shape=(len(question_list), len(ner_list)))
    # nlp.pipe processes the texts in batches instead of one call per question
    for row, doc in enumerate(nlp.pipe(list(question_list))):
        for ent in doc.ents:
            ner_encoded[row, ner_column[ent.label_]] = 1

    vectorizer = CountVectorizer()
    bow_vec = vectorizer.fit_transform(question_list)

    # combine bow_vec with ner_encoded; the stacked matrix is converted to a
    # dense array so it can be row-indexed by the cross-validation helpers
    X = hstack((bow_vec, ner_encoded)).toarray()
    y = df[target_col].values

    return X, y

In [76]:
# Shared cross-validation loop behind valdiate_kfold and valdiate_skf.
def _cross_validate_accuracy(df, preprocess_function, splitter):
    """Evaluate a LinearSVC with the given splitter and print the accuracies.

    Parameters
    ----------
    df : pandas.DataFrame
        Data handed to ``preprocess_function``.
    preprocess_function : callable
        Called as ``preprocess_function(df)``; must return ``(X, y)`` where
        both can be indexed with an array of row positions.
    splitter : object
        Cross-validator providing ``split(X, y)``, such as ``KFold`` or
        ``StratifiedKFold``.

    Prints the accuracy of every fold, then the mean and standard deviation
    over all folds. Returns nothing.
    """
    X, y = preprocess_function(df)

    accuracy_list = []
    # y is passed for both splitters: StratifiedKFold stratifies on it,
    # KFold accepts it and ignores it.
    for fold, (train_index, test_index) in enumerate(splitter.split(X, y), start=1):
        # fresh model per fold; the fixed seed makes repeated runs comparable
        lsvc = LinearSVC(random_state=0)
        lsvc.fit(X[train_index], y[train_index])
        y_pred = lsvc.predict(X[test_index])

        accuracy = accuracy_score(y[test_index], y_pred)
        accuracy_list.append(accuracy)
        print(f"Accuracy fold {fold}: {accuracy}")

    print(f"mean accuracy: {np.mean(accuracy_list)}, std: {np.std(accuracy_list)}")


# Function to implement KFold cross validation
def valdiate_kfold(df, preprocess_function):
    """Run 5-fold cross-validation with contiguous, unshuffled folds.

    Parameters
    ----------
    df : pandas.DataFrame
        Data handed to ``preprocess_function``.
    preprocess_function : callable
        Called as ``preprocess_function(df)``; must return ``(X, y)``.

    Prints per-fold accuracy plus mean and standard deviation; returns None.
    """
    _cross_validate_accuracy(df, preprocess_function, KFold(n_splits=5))


# Function to implement stratified KFold cross validation
# for imbalanced dataset
def valdiate_skf(df, preprocess_function):
    """Run shuffled, stratified 5-fold cross-validation (seed 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Data handed to ``preprocess_function``.
    preprocess_function : callable
        Called as ``preprocess_function(df)``; must return ``(X, y)``.

    Prints per-fold accuracy plus mean and standard deviation; returns None.
    """
    _cross_validate_accuracy(
        df,
        preprocess_function,
        StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
    )
    
    

## Train Classifier
- Since the main categories are slightly imbalanced, I will use stratified k-fold cross-validation.

In [66]:
# Baseline: plain bag-of-words features, stratified 5-fold CV over all kept questions.
# The labels come from simple_pre, which reads the sub_cat_encoded column.
valdiate_skf(df_all, simple_pre)

Accuracy fold 1: 0.8547904191616766
Accuracy fold 2: 0.8143712574850299
Accuracy fold 3: 0.8488023952095808
Accuracy fold 4: 0.842814371257485
Accuracy fold 5: 0.8440779610194903
mean accuracy: 0.8409712808266525, std: 0.01394922666482055


In [67]:
# Same evaluation with stop words removed before counting
# (remove_stopwords_pre filters stop words only).
valdiate_skf(df_all, remove_stopwords_pre)

Accuracy fold 1: 0.7544910179640718
Accuracy fold 2: 0.7200598802395209
Accuracy fold 3: 0.7170658682634731
Accuracy fold 4: 0.75
Accuracy fold 5: 0.7436281859070465
mean accuracy: 0.7370489904748224, std: 0.015512535724976228


In [81]:
# Same evaluation with spaCy named-entity label indicators appended to the bag-of-words
valdiate_skf(df_all, add_ner_pre)

Accuracy fold 1: 0.8562874251497006
Accuracy fold 2: 0.8143712574850299
Accuracy fold 3: 0.8547904191616766
Accuracy fold 4: 0.8368263473053892
Accuracy fold 5: 0.8530734632683659
mean accuracy: 0.8430697824740324, std: 0.015967083960133507


# Train Sub-Category classifier

- Question -> Category [HUM, NUM, LOC, ENTY]
-> [HUM]
    - clf_HUM -> [ind, gr]
- ...
- We have 4 classifiers, one per main category, each classifying questions into that category's sub-categories.

In [83]:
# One frame per main category, selected by the category's integer code
# (codes as assigned in cat_map_dict).
df_hum = df_all.loc[df_all["cat_encoded"].eq(0)]   # HUM
df_loc = df_all.loc[df_all["cat_encoded"].eq(1)]   # LOC
df_num = df_all.loc[df_all["cat_encoded"].eq(2)]   # NUM
df_enty = df_all.loc[df_all["cat_encoded"].eq(3)]  # ENTY

In [84]:
# Number of questions per sub-category within HUM
df_hum["sub_cat"].value_counts()

HUM:ind    1017
HUM:gr      195
Name: sub_cat, dtype: int64

In [85]:
# Number of questions per sub-category within LOC
df_loc["sub_cat"].value_counts()

LOC:other      514
LOC:country    158
LOC:city       147
Name: sub_cat, dtype: int64

In [86]:
# Number of questions per sub-category within NUM
df_num["sub_cat"].value_counts()

NUM:count    372
NUM:date     265
Name: sub_cat, dtype: int64

In [87]:
# Number of questions per sub-category within ENTY
df_enty["sub_cat"].value_counts()

ENTY:other     229
ENTY:cremat    207
ENTY:animal    128
ENTY:food      107
Name: sub_cat, dtype: int64

## Train and Validate Sub-Category

### HUM

In [88]:
# HUM sub-categories, stratified 5-fold CV
# baseline bag-of-words features
valdiate_skf(df_hum, simple_pre)

Accuracy fold 1: 0.9300411522633745
Accuracy fold 2: 0.9218106995884774
Accuracy fold 3: 0.9504132231404959
Accuracy fold 4: 0.9504132231404959
Accuracy fold 5: 0.9297520661157025
mean accuracy: 0.9364860728497092, std: 0.011748884703312121


In [89]:
# HUM sub-categories, stratified 5-fold CV, stop words removed before counting
valdiate_skf(df_hum, remove_stopwords_pre)

Accuracy fold 1: 0.8847736625514403
Accuracy fold 2: 0.9176954732510288
Accuracy fold 3: 0.9586776859504132
Accuracy fold 4: 0.9297520661157025
Accuracy fold 5: 0.9256198347107438
mean accuracy: 0.9233037445158658, std: 0.023724589693505202


In [90]:
# HUM sub-categories, stratified 5-fold CV, bag-of-words plus NER label indicators
valdiate_skf(df_hum, add_ner_pre)

Accuracy fold 1: 0.9300411522633745
Accuracy fold 2: 0.9176954732510288
Accuracy fold 3: 0.9545454545454546
Accuracy fold 4: 0.9504132231404959
Accuracy fold 5: 0.9214876033057852
mean accuracy: 0.9348365813012277, std: 0.015007200897664632


### LOC

In [91]:
# LOC sub-categories, stratified 5-fold CV, baseline bag-of-words features
valdiate_skf(df_loc, simple_pre)

Accuracy fold 1: 0.9878048780487805
Accuracy fold 2: 0.975609756097561
Accuracy fold 3: 0.9512195121951219
Accuracy fold 4: 0.975609756097561
Accuracy fold 5: 0.9754601226993865
mean accuracy: 0.9731408050276821, std: 0.01194277011628502


In [92]:
# LOC sub-categories, stratified 5-fold CV, stop words removed before counting
valdiate_skf(df_loc, remove_stopwords_pre)

Accuracy fold 1: 0.9878048780487805
Accuracy fold 2: 0.975609756097561
Accuracy fold 3: 0.9634146341463414
Accuracy fold 4: 0.9817073170731707
Accuracy fold 5: 0.9754601226993865
mean accuracy: 0.976799341613048, std: 0.008094060418876024


In [93]:
# LOC sub-categories, stratified 5-fold CV, bag-of-words plus NER label indicators
valdiate_skf(df_loc, add_ner_pre)

Accuracy fold 1: 0.9878048780487805
Accuracy fold 2: 0.9695121951219512
Accuracy fold 3: 0.9573170731707317
Accuracy fold 4: 0.975609756097561
Accuracy fold 5: 0.9754601226993865
mean accuracy: 0.9731408050276821, std: 0.00990017465648438


### NUM

In [94]:
# NUM sub-categories, plain 5-fold CV (valdiate_kfold uses KFold without shuffling),
# baseline bag-of-words features
# NOTE(review): the HUM and LOC cells use valdiate_skf - confirm the unstratified split is intended here.
valdiate_kfold(df_num, simple_pre)

Accuracy fold 1: 1.0
Accuracy fold 2: 0.9921875
Accuracy fold 3: 0.984251968503937
Accuracy fold 4: 1.0
Accuracy fold 5: 1.0
mean accuracy: 0.9952878937007874, std: 0.006293106122982452


In [95]:
# NUM sub-categories, plain 5-fold CV, stop words removed before counting
valdiate_kfold(df_num, remove_stopwords_pre)

Accuracy fold 1: 0.9921875
Accuracy fold 2: 1.0
Accuracy fold 3: 0.984251968503937
Accuracy fold 4: 0.9921259842519685
Accuracy fold 5: 1.0
mean accuracy: 0.9937130905511811, std: 0.0058891361703223965


In [96]:
# NUM sub-categories, plain 5-fold CV, bag-of-words plus NER label indicators
valdiate_kfold(df_num, add_ner_pre)

Accuracy fold 1: 1.0
Accuracy fold 2: 1.0
Accuracy fold 3: 0.984251968503937
Accuracy fold 4: 1.0
Accuracy fold 5: 1.0
mean accuracy: 0.9968503937007874, std: 0.0062992125984251855


### ENTY

In [97]:
# ENTY sub-categories, plain 5-fold CV (valdiate_kfold uses KFold without shuffling),
# baseline bag-of-words features
# NOTE(review): the HUM and LOC cells use valdiate_skf - confirm the unstratified split is intended here.
valdiate_kfold(df_enty, simple_pre)

Accuracy fold 1: 0.7851851851851852
Accuracy fold 2: 0.753731343283582
Accuracy fold 3: 0.7985074626865671
Accuracy fold 4: 0.7835820895522388
Accuracy fold 5: 0.8432835820895522
mean accuracy: 0.7928579325594252, std: 0.02915635651214357


In [98]:
# ENTY sub-categories, plain 5-fold CV, stop words removed before counting
valdiate_kfold(df_enty, remove_stopwords_pre)

Accuracy fold 1: 0.7777777777777778
Accuracy fold 2: 0.7910447761194029
Accuracy fold 3: 0.7761194029850746
Accuracy fold 4: 0.8134328358208955
Accuracy fold 5: 0.7985074626865671
mean accuracy: 0.7913764510779437, std: 0.013821337668998094


In [99]:
# ENTY sub-categories, plain 5-fold CV, bag-of-words plus NER label indicators
valdiate_kfold(df_enty, add_ner_pre)

Accuracy fold 1: 0.7703703703703704
Accuracy fold 2: 0.7686567164179104
Accuracy fold 3: 0.7611940298507462
Accuracy fold 4: 0.746268656716418
Accuracy fold 5: 0.8134328358208955
mean accuracy: 0.7719845218352681, std: 0.02240260226927894
