In [1]:
import os

import warnings 
warnings.filterwarnings('ignore')

# importing packages
import numpy as np
import pandas as pd
import spacy
import pickle
from scipy import spatial
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# nltk packages
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from string import punctuation
import unidecode
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer

from sentence_transformers import SentenceTransformer

# sklearn packages
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from xgboost import XGBClassifier

# utility functions
import helper
import metrics as m
import models

In [2]:
# Load the labelled queries; the path is relative to the notebook's working directory.
df = pd.read_csv("./output/queries_df.csv")
df.head()

Unnamed: 0,QUERY,REMEMBERING,UNDERSTANDING,APPLYING,ANALYZING,EVALUATING,CREATING
0,"""their own language"" means _ .",1.0,0.0,0.0,0.0,0.0,0.0
1,"""you may think the popular singer jay chow is ...",0.0,1.0,0.0,0.0,0.0,0.0
2,". at midnight, nasreddin saw _ in his garden.",1.0,0.0,0.0,0.0,0.0,0.0
3,. david beckham is _ years old.,1.0,0.0,0.0,0.0,0.0,0.0
4,. david beckham's family name is _ and zhou...,1.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(189799, 7)

In [4]:
def unidecode_text(text):
    """Return an ASCII transliteration of ``text`` produced by ``unidecode``.

    Parameters
    ----------
    text : Any
        A cell value from the query column. Only ``str`` values are
        transliterated.

    Returns
    -------
    Any
        The transliterated string for ``str`` input; any other value
        (for example ``None`` or a float NaN from a missing cell) is
        returned unchanged.
    """
    # Non-string cells cannot be transliterated; pass them through explicitly
    # instead of relying on a catch-all exception handler that would also hide
    # unrelated errors.
    if not isinstance(text, str):
        return text
    return unidecode.unidecode(text)

In [5]:
# Names of the six label columns present in `df`.
categories = [
    "REMEMBERING",
    "UNDERSTANDING",
    "APPLYING",
    "ANALYZING",
    "EVALUATING",
    "CREATING",
]

# Cleaned copy of every query, stored next to the raw QUERY column.
df["QUERY_UC"] = df["QUERY"].apply(unidecode_text)

In [6]:
# Print the value distribution of every label column to see how many
# queries carry each label.
for category in categories:
    print(df[category].value_counts())

0.0    143926
1.0     45873
Name: REMEMBERING, dtype: int64
0.0    181279
1.0      8520
Name: UNDERSTANDING, dtype: int64
0.0    166346
1.0     23453
Name: APPLYING, dtype: int64
0.0    149809
1.0     39990
Name: ANALYZING, dtype: int64
0.0    173122
1.0     16677
Name: EVALUATING, dtype: int64
0.0    134513
1.0     55286
Name: CREATING, dtype: int64


In [7]:
# Peek at a random handful of cleaned queries; the seed keeps this preview
# stable across Restart & Run All.
df["QUERY_UC"].sample(10, random_state=42)

137214    have n't you already given your hair a positiv...
84468                            so whats x|y 1|7 3|21 9| ?
15580     how do i find an explicit formula for a functi...
55155     could i write the expression `` xx + xy + yx +...
165744              what is `` first ionization energy '' ?
102777         does the denominator always have to be a 9 ?
93833     b then also be a 3x3 matrix or could a 3x1 ( w...
100040    how do you write equations for reflections/rot...
127273                 what is greek symbol for frequency ?
37054                  at 2:13 , why did n't sal write +3 ?
Name: QUERY_UC, dtype: object

In [8]:
# Instantiate StatModel from the local `models` module; it is used below
# through generate_bloom_result().
statistique = models.StatModel()

In [32]:
# Try the model on the first few queries before running it on a full split.
n_questions = 10
head_paraphases = statistique.generate_bloom_result(phrases=df["QUERY_UC"].head(n_questions).values)

In [33]:
# Label each result row with the query text it was computed from.
# NOTE(review): assumes generate_bloom_result returns one row per input phrase,
# in input order — confirm against models.StatModel.
head_paraphases.index = df["QUERY_UC"].head(n_questions).values

In [34]:
# Per-category values returned for the first n_questions queries.
head_paraphases

Unnamed: 0,REMEMBERING,UNDERSTANDING,APPLYING,ANALYZING,EVALUATING,CREATING
"""their own language"" means _ .",0,0,0,0,0,0
"""you may think the popular singer jay chow is cool"". here ""cool"" means",2,0,0,0,1,1
". at midnight, nasreddin saw _ in his garden.",1,2,7,2,2,2
. david beckham is _ years old.,0,0,0,0,0,0
. david beckham's family name is _ and zhou jielun's given name is _ .,0,1,2,0,0,1
. does mrs. green buy a bag of rice?,0,0,0,0,0,0
. mimi likes _ .,0,0,0,0,0,0
. nasreddin was so frightened that he _ .,0,0,0,0,0,0
. there isn't a _ in my room.,0,0,0,0,0,0
. where's the supermarket?,0,0,0,0,0,0


In [35]:
# Highest value across the category columns, one entry per query (row).
np.max(head_paraphases.values, axis=1)

array([0, 2, 7, 0, 2, 0, 0, 0, 0, 0])

In [36]:
# Boolean mask: True wherever a column reaches its row's maximum.
# Rows whose values are all equal (e.g. all zeros) therefore come out all True.
head_paraphases.ge(np.max(head_paraphases.values, axis=1), axis=0)

Unnamed: 0,REMEMBERING,UNDERSTANDING,APPLYING,ANALYZING,EVALUATING,CREATING
"""their own language"" means _ .",True,True,True,True,True,True
"""you may think the popular singer jay chow is cool"". here ""cool"" means",True,False,False,False,False,False
". at midnight, nasreddin saw _ in his garden.",False,False,True,False,False,False
. david beckham is _ years old.,True,True,True,True,True,True
. david beckham's family name is _ and zhou jielun's given name is _ .,False,False,True,False,False,False
. does mrs. green buy a bag of rice?,True,True,True,True,True,True
. mimi likes _ .,True,True,True,True,True,True
. nasreddin was so frightened that he _ .,True,True,True,True,True,True
. there isn't a _ in my room.,True,True,True,True,True,True
. where's the supermarket?,True,True,True,True,True,True


In [37]:
# Keep the row-maximum mask for the next steps.
preds= head_paraphases.ge(np.max(head_paraphases.values, axis=1), axis=0)

In [38]:
# Position of the first True in each row; when several columns tie, the
# left-most one is chosen.
np.argmax(preds.values, axis=1)

array([0, 0, 2, 0, 2, 0, 0, 0, 0, 0])

In [39]:
# Translate those column positions into category names.
head_paraphases.columns[np.argmax(preds.values, axis=1)]

Index(['REMEMBERING', 'REMEMBERING', 'APPLYING', 'REMEMBERING', 'APPLYING',
       'REMEMBERING', 'REMEMBERING', 'REMEMBERING', 'REMEMBERING',
       'REMEMBERING'],
      dtype='object')

In [40]:
def predict(df_paraphrase:pd.DataFrame):
    """Pick, for every row, the column holding the row's maximum value.

    Parameters
    ----------
    df_paraphrase : pd.DataFrame
        One row per query and one numeric column per category.

    Returns
    -------
    pd.DataFrame
        A new frame with the same index: one boolean column per input column
        (True where that column reaches the row maximum) plus a ``predict``
        column holding the name of the first such column. Ties are resolved
        in favour of the left-most column, so a row whose values are all
        equal is assigned the first column. The input frame is not modified.
    """
    scores = df_paraphrase.values
    row_best = scores.max(axis=1)

    # Compare every row against its own maximum (axis=0 aligns along the rows).
    is_best = df_paraphrase.ge(row_best, axis=0)

    # argmax on booleans yields the position of the first True in each row.
    winner_pos = is_best.values.argmax(axis=1)
    is_best["predict"] = df_paraphrase.columns[winner_pos]
    return is_best

In [41]:
# Run predict() on the small sample. This rebinds `preds` from the bare
# boolean mask to the frame that also carries the `predict` column.
preds = predict(head_paraphases)
preds.head()

Unnamed: 0,REMEMBERING,UNDERSTANDING,APPLYING,ANALYZING,EVALUATING,CREATING,predict
"""their own language"" means _ .",True,True,True,True,True,True,REMEMBERING
"""you may think the popular singer jay chow is cool"". here ""cool"" means",True,False,False,False,False,False,REMEMBERING
". at midnight, nasreddin saw _ in his garden.",False,False,True,False,False,False,APPLYING
. david beckham is _ years old.,True,True,True,True,True,True,REMEMBERING
. david beckham's family name is _ and zhou jielun's given name is _ .,False,False,True,False,False,False,APPLYING


In [42]:
# Hold out 25% of the queries for evaluation; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(df["QUERY_UC"], df[categories], test_size=0.25, random_state=41)

In [43]:
# Number of held-out queries.
X_test.shape

(47450,)

In [44]:
# Run the statistical model on every held-out query.
# The result is indexed 0..n-1 rather than by the original row ids of X_test.
test_paraphrase = statistique.generate_bloom_result(phrases=X_test.values)
test_paraphrase.head()

Unnamed: 0,REMEMBERING,UNDERSTANDING,APPLYING,ANALYZING,EVALUATING,CREATING
0,0,0,5,0,0,0
1,0,1,1,0,0,0
2,2,1,2,1,0,1
3,0,0,0,0,0,0
4,0,0,0,2,2,0


In [45]:
# Baseline prediction: the category with the highest value in each row.
preds = predict(test_paraphrase)

In [46]:
# Inspect the first baseline predictions.
preds.head()

Unnamed: 0,REMEMBERING,UNDERSTANDING,APPLYING,ANALYZING,EVALUATING,CREATING,predict
0,False,False,True,False,False,False,APPLYING
1,False,True,True,False,False,False,UNDERSTANDING
2,True,False,True,False,False,False,REMEMBERING
3,True,True,True,True,True,True,REMEMBERING
4,False,False,False,True,True,False,ANALYZING


In [47]:
# Ground-truth label columns for the held-out rows (indexed by original row id).
y_test.head()

Unnamed: 0,REMEMBERING,UNDERSTANDING,APPLYING,ANALYZING,EVALUATING,CREATING
54935,0.0,0.0,1.0,0.0,0.0,0.0
4535,0.0,0.0,0.0,1.0,0.0,0.0
82074,1.0,0.0,0.0,0.0,0.0,0.0
141706,1.0,0.0,0.0,0.0,0.0,0.0
158650,0.0,0.0,0.0,0.0,0.0,1.0


In [48]:
# The query texts behind those rows.
X_test.head()

54935         can the same thing apply for inferior goods ?
4535                             where does the energy go ?
82074     at 2:23 , sal said over the square of the dist...
141706    is the prefix `` bis '' only applicable for sy...
158650     are polyatomic ions considered to be molecules ?
Name: QUERY_UC, dtype: object

In [49]:
# Reuse predict() on the label columns to collapse them into a single
# class-name column (`predict`) that can be compared with the model output.
y_true = predict(y_test)

In [50]:
# Check the collapsed ground-truth labels.
y_true.head()

Unnamed: 0,REMEMBERING,UNDERSTANDING,APPLYING,ANALYZING,EVALUATING,CREATING,predict
54935,False,False,True,False,False,False,APPLYING
4535,False,False,False,True,False,False,ANALYZING
82074,True,False,False,False,False,False,REMEMBERING
141706,True,False,False,False,False,False,REMEMBERING
158650,False,False,False,False,False,True,CREATING


In [52]:
# Score the arg-max baseline against the true classes (`metrics` is sklearn.metrics).
# This cell needs `preds` to still be the DataFrame returned by predict();
# later cells rebind `preds` to a plain array.
print(
    metrics.classification_report(
        y_true=y_true["predict"].values,
        y_pred=preds["predict"].values,
        digits=4,
    )
)

               precision    recall  f1-score   support

    ANALYZING     0.2837    0.0610    0.1004      9836
     APPLYING     0.2684    0.4967    0.3485      5897
     CREATING     0.2567    0.0328    0.0581     13825
   EVALUATING     0.1457    0.0535    0.0783      4147
  REMEMBERING     0.2983    0.7363    0.4246     11607
UNDERSTANDING     0.1024    0.1188    0.1100      2138

     accuracy                         0.2741     47450
    macro avg     0.2258    0.2498    0.1866     47450
 weighted avg     0.2572    0.2741    0.1967     47450



In [53]:
# Run the statistical model on the training queries; its per-category output
# is used as the feature matrix for the classifiers below.
train_paraphrase= statistique.generate_bloom_result(phrases=X_train.values)
train_paraphrase.head()

Unnamed: 0,REMEMBERING,UNDERSTANDING,APPLYING,ANALYZING,EVALUATING,CREATING
0,0,0,0,0,0,0
1,0,0,0,2,2,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0


In [54]:
# Collapse the training label columns into a single class-name column,
# the same way as for the test labels.
y_train_true = predict(y_train)
y_train_true.head()

Unnamed: 0,REMEMBERING,UNDERSTANDING,APPLYING,ANALYZING,EVALUATING,CREATING,predict
182859,False,False,False,False,False,True,CREATING
28322,True,False,False,False,False,False,REMEMBERING
176082,True,False,False,False,False,False,REMEMBERING
41043,False,False,False,False,True,False,EVALUATING
13035,False,False,False,True,False,False,ANALYZING


### Try fitting models with some machine learning techniques

In [55]:
# Fit an SVM on the per-category values, using the class-name column
# produced by predict() as the target.
svm = SVC(gamma="auto").fit(train_paraphrase.values, y_train_true.predict.values)

In [56]:
# Show the fitted estimator.
svm

SVC(gamma='auto')

In [57]:
# Predict classes for the held-out features; from here on `preds` is the
# estimator's predict() output, no longer a DataFrame.
preds = svm.predict(test_paraphrase.values)

In [58]:
# Inspect the predicted class names.
preds

array(['APPLYING', 'CREATING', 'CREATING', ..., 'CREATING', 'REMEMBERING',
       'REMEMBERING'], dtype=object)

In [59]:
# Per-class precision/recall/F1 for the SVM predictions.
print(
    metrics.classification_report(
        y_true=y_true["predict"].values,
        y_pred=preds,
        digits=4,
    )
)

               precision    recall  f1-score   support

    ANALYZING     0.3227    0.1057    0.1593      9836
     APPLYING     0.4475    0.2635    0.3317      5897
     CREATING     0.3161    0.3551    0.3345     13825
   EVALUATING     0.4095    0.0458    0.0824      4147
  REMEMBERING     0.3045    0.6490    0.4145     11607
UNDERSTANDING     0.3636    0.0037    0.0074      2138

     accuracy                         0.3211     47450
    macro avg     0.3606    0.2372    0.2216     47450
 weighted avg     0.3413    0.3211    0.2806     47450



In [60]:
# Multiclass logistic regression on the same features, with balanced class
# weights and a fixed seed; max_iter is raised well above the default.
lr = LogisticRegression(
    random_state=40, class_weight="balanced", solver="lbfgs", multi_class="auto", max_iter=7500
).fit(train_paraphrase.values, y_train_true.predict.values)

In [61]:
# Logistic-regression predictions for the held-out features.
preds = lr.predict(test_paraphrase.values)

In [62]:
# Per-class precision/recall/F1 for the logistic-regression predictions.
print(
    metrics.classification_report(
        y_true=y_true["predict"].values,
        y_pred=preds,
        digits=4,
    )
)

               precision    recall  f1-score   support

    ANALYZING     0.2764    0.0543    0.0908      9836
     APPLYING     0.3059    0.4085    0.3498      5897
     CREATING     0.3567    0.0430    0.0768     13825
   EVALUATING     0.1787    0.2474    0.2075      4147
  REMEMBERING     0.3023    0.6948    0.4213     11607
UNDERSTANDING     0.0822    0.1366    0.1027      2138

     accuracy                         0.2723     47450
    macro avg     0.2504    0.2641    0.2081     47450
 weighted avg     0.2925    0.2723    0.2105     47450



In [63]:
# Gradient-boosted trees on the same features. Row and column subsampling make
# training stochastic, so the seed is pinned through `random_state` alone;
# passing a second, different `seed` value made the effective seed ambiguous.
xgb = XGBClassifier(
    random_state=42, colsample_bytree=0.6, subsample=0.8
).fit(train_paraphrase.values, y_train_true.predict.values)

In [64]:
# XGBoost predictions for the held-out features.
preds = xgb.predict(test_paraphrase.values)

In [65]:
# Per-class precision/recall/F1 for the XGBoost predictions.
print(
    metrics.classification_report(
        y_true=y_true["predict"].values,
        y_pred=preds,
        digits=4,
    )
)

               precision    recall  f1-score   support

    ANALYZING     0.3233    0.1280    0.1834      9836
     APPLYING     0.4465    0.2620    0.3302      5897
     CREATING     0.3202    0.3399    0.3298     13825
   EVALUATING     0.3732    0.0564    0.0980      4147
  REMEMBERING     0.3044    0.6496    0.4146     11607
UNDERSTANDING     0.3571    0.0047    0.0092      2138

     accuracy                         0.3222     47450
    macro avg     0.3541    0.2401    0.2275     47450
 weighted avg     0.3390    0.3222    0.2855     47450



In [66]:
# Decision tree on the same features with balanced class weights.
# random_state is fixed so that repeated runs build the same tree, in line
# with the seeded split and the other seeded estimators in this notebook.
dtc = DecisionTreeClassifier(
    class_weight="balanced", random_state=42
).fit(train_paraphrase.values, y_train_true.predict.values)

In [67]:
# Decision-tree predictions for the held-out features.
preds = dtc.predict(test_paraphrase.values)

In [68]:
# Per-class precision/recall/F1 for the decision-tree predictions.
print(
    metrics.classification_report(
        y_true=y_true["predict"].values,
        y_pred=preds,
        digits=4,
    )
)

               precision    recall  f1-score   support

    ANALYZING     0.3398    0.0716    0.1182      9836
     APPLYING     0.3204    0.3866    0.3504      5897
     CREATING     0.3843    0.0711    0.1200     13825
   EVALUATING     0.1855    0.3554    0.2438      4147
  REMEMBERING     0.3035    0.6559    0.4149     11607
UNDERSTANDING     0.1111    0.1389    0.1235      2138

     accuracy                         0.2814     47450
    macro avg     0.2741    0.2799    0.2285     47450
 weighted avg     0.3177    0.2814    0.2314     47450



In [69]:
# Random forest of 100 trees on the same features. The forest is built from
# random bootstrap samples and feature subsets, so random_state is fixed to
# make the reported scores reproducible.
rfc = RandomForestClassifier(
    n_estimators=100, random_state=42
).fit(train_paraphrase.values, y_train_true.predict.values)

In [70]:
# Random-forest predictions for the held-out features.
preds = rfc.predict(test_paraphrase.values)

In [71]:
# Per-class precision/recall/F1 for the random-forest predictions.
print(
    metrics.classification_report(
        y_true=y_true["predict"].values,
        y_pred=preds,
        digits=4,
    )
)

               precision    recall  f1-score   support

    ANALYZING     0.3147    0.1305    0.1845      9836
     APPLYING     0.4330    0.2564    0.3221      5897
     CREATING     0.3214    0.3030    0.3119     13825
   EVALUATING     0.3229    0.0670    0.1110      4147
  REMEMBERING     0.3007    0.6716    0.4154     11607
UNDERSTANDING     0.2500    0.0075    0.0145      2138

     accuracy                         0.3177     47450
    macro avg     0.3238    0.2393    0.2266     47450
 weighted avg     0.3257    0.3177    0.2811     47450

