# This notebook's aim is to transform the pre-processed documents into vectors and do a supervised analysis

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Jeremynadal33/categorize_question_API/blob/master/supervised_approach.ipynb)


## Here, are compared supervised methods : 
## We will compare the results for two different vectorization methods : Bag of Word and Tfidf
First, import the relevant libraries : 

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
import re
import time
import datetime as datetime

## My script : 
from function import *

In [175]:
# Specific libraries : 
import nltk
from bs4 import BeautifulSoup


from sklearn.feature_extraction.text import CountVectorizer # BoW
from sklearn.feature_extraction.text import TfidfVectorizer # Tfidf


from sklearn.metrics import f1_score,precision_score,recall_score

from sklearn.multiclass import OneVsRestClassifier

from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [3]:
root_dir = '/Users/jeremynadal/Documents/Formation OC IML/P5-API/'
input_dir = root_dir + 'inputs/'
png_dir = root_dir + 'pngs/'

In [4]:
data = pd.read_csv(input_dir+'processed_dataset.csv')

print(data.shape)
print(data.dtypes)
data.head()

(39409, 3)
Tags              object
processed_text    object
nb_tags            int64
dtype: object


Unnamed: 0,Tags,processed_text,nb_tags
0,['c#'],"['convert', 'double', 'c#', 'convert', 'double...",1
1,"['c#', '.net']","['c#', 'calculate', 'someone', 'age', 'base', ...",2
2,['c#'],"['calculate', 'time', 'c#', 'calculate', 'time...",1
3,['html'],"['determine', 'user', 'timezone', 'determine',...",1
4,['.net'],"['difference', 'mathfloor', 'mathtruncate', 'd...",1


In [5]:
def reforme_tags_processed_text(data):
    tags = []
    processed_text = []
    for indx in range(data.shape[0]):
        new_tags = []
        new_processed = []
        
        split = data['Tags'][indx].split(',')
        for nb_tags in range(data['nb_tags'][indx]):    
            to_append = re.sub('[\[\]\'\"!*+-]','',split[nb_tags]).replace('\\','').replace(' ','')
            if not to_append in ['',' '] : new_tags.append(to_append)
        tags.append(new_tags)
        
        text = data['processed_text'][indx].split(',')
        for i in range(len(text)):
            to_append = re.sub('[\[\]\'\"!*+-:.]','',text[i]).replace('\\','').replace(' ','')
            if not to_append in ['',' '] : new_processed.append(to_append)
        processed_text.append(new_processed)
        
    data['Tags'] = tags
    data['processed_text'] = processed_text
    return data

In [6]:
data = reforme_tags_processed_text(data)
data.head()

Unnamed: 0,Tags,processed_text,nb_tags
0,[c#],"[convert, double, c#, convert, double, c#, wan...",1
1,"[c#, .net]","[c#, calculate, someone, age, base, datetime, ...",2
2,[c#],"[calculate, time, c#, calculate, time, c#, giv...",1
3,[html],"[determine, user, timezone, determine, user, t...",1
4,[.net],"[difference, mathfloor, mathtruncate, differen...",1


In [7]:
def get_all_tags(tags):
    res = []
    for i in range(len(tags)):
        for j in range(len(tags[i])):
            res.append(tags[i][j])
            
    return pd.Series(res)

tags = get_all_tags(data['Tags'])

In [8]:
unique_tags = np.unique(tags)

In [9]:
for tag in unique_tags : 
    data[tag] = 0

In [10]:
for idx in data.index:
    for id_tag in range(data['nb_tags'][idx]):
        data[data['Tags'][idx][id_tag]][idx] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
data.head()

Unnamed: 0,Tags,processed_text,nb_tags,.net,android,arrays,asp.net,bash,c,c#,...,linux,mysql,objectivec,php,python,regex,sql,sqlserver,string,windows
0,[c#],"[convert, double, c#, convert, double, c#, wan...",1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,"[c#, .net]","[c#, calculate, someone, age, base, datetime, ...",2,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,[c#],"[calculate, time, c#, calculate, time, c#, giv...",1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,[html],"[determine, user, timezone, determine, user, t...",1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,[.net],"[difference, mathfloor, mathtruncate, differen...",1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Lets separate the data into train and test for both bow and tfidf

In [16]:
bow_vectorizer = CountVectorizer(tokenizer = lambda x: x,
                                 preprocessor = lambda x: x,
                                 lowercase = False,
                                 max_features = 1000,
                                 binary = True,
                                 max_df = 0.9
                                 )  
bow_X = bow_vectorizer.fit_transform(data['processed_text'])
print(len(bow_vectorizer.get_feature_names()))
print(bow_X.toarray().shape)

1000
(39409, 1000)


In [22]:
tfidf_vectorizer = TfidfVectorizer(tokenizer = lambda x: x,
                                   preprocessor = lambda x: x,
                                   lowercase = False,
                                   max_features = 1000,
                                   max_df = 0.9
                                   )  
tfidf_X = tfidf_vectorizer.fit_transform(data['processed_text'])


In [25]:
X_train_bow, X_test_bow, y_train_multi, y_test_multi = train_test_split(bow_X.toarray(), 
                                                                     data[unique_tags], 
                                                                     test_size=0.2, 
                                                                     random_state=42)
X_train_tfidf, X_test_tfidf, y_train_tags, y_test_tags = train_test_split(tfidf_X.toarray(), 
                                                                            data['Tags'], 
                                                                            test_size=0.2, 
                                                                            random_state=42)
print('Train dataset : X.shape = ' , X_train_bow.shape, ' Y.shape = ', y_train_bow.shape)
print('Test dataset : X.shape = ' , X_test_bow.shape, ' Y.shape = ', y_test_bow.shape)

Train dataset : X.shape =  (31527, 1000)  Y.shape =  (31527, 24)
Test dataset : X.shape =  (7882, 1000)  Y.shape =  (7882, 24)


## We define some metrics to compare the different models 

In [121]:
def cosine_similarity(x, y):
    x = np.array(x)
    y = np.array(y)
    assert x.shape == y.shape , 'x and y doesnt have same shape'
    assert len(x.shape)==2 , 'x and y must be matrixes [nb_samp,x], if only 1 sample use : np.reshape(1,-1)'
    cosin = []
    for idx in range(x.shape[0]):
        if (np.dot(x[idx], x[idx]) !=0 and np.dot(y[idx], y[idx]) != 0 ) :
            cosin.append(np.dot(x[idx], y[idx]) / (np.sqrt(np.dot(x[idx], x[idx])) * np.sqrt(np.dot(y[idx], y[idx]))))
        elif (np.dot(x[idx], x[idx]) ==0 and np.dot(y[idx], y[idx]) == 0 ) : 
            cosin.append(1)
        else:
            cosin.append(-1)
    return np.mean(cosin)

def print_metrics(y_true, pred):
    '''Prints and return a summary of results'''
    y_true = np.array(y_true)
    pred = np.array(pred)
    assert y_true.shape == pred.shape, 'arrays doesnt have same shape'
    
    res = [metrics.accuracy_score(y_true, pred),
           metrics.hamming_loss(y_true,pred),
           precision_score(y_true, pred, average='micro'),
           recall_score(y_true, pred, average='micro'),
           f1_score(y_true, pred, average='micro'),
           cosine_similarity(y_true,pred)]
    
    print("Accuracy :{:.3}\nHamming loss :{:.4}\n\nMicro-averaged quality metrics :\nPrecision :{:.3}\nRecall :{:.3}\nF1-score :{:.3}\nCosine similarity : {:.3}".format(*res))
    return res

## Lets compare different models on both vectorizers (bow and tfidf)
* SGD 
* Logistic regression 
* Random Forest
* XGBoost 
* LightGBM

In [170]:
start = time.process_time()
classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l1'))
classifier.fit(X_train_bow, y_train_multi)
predictions = classifier.predict(X_test_bow)


res = print_metrics(y_test_multi, predictions)
print("Time taken to run this cell :", time.process_time() - start)

Accuracy :0.454
Hamming loss :0.0293

Micro-averaged quality metrics :
Precision :0.857
Recall :0.55
F1-score :0.67
Cosine similarity : 0.281
Time taken to run this cell : 38.405165999999994


In [174]:
start = time.process_time()
classifier = OneVsRestClassifier(LogisticRegression(max_iter = 1000))
classifier.fit(X_train_bow, y_train_multi)
predictions = classifier.predict(X_test_bow)


res = print_metrics(y_test_multi, predictions)
print("Time taken to run this cell :", time.process_time() - start)

Accuracy :0.513
Hamming loss :0.02672

Micro-averaged quality metrics :
Precision :0.821
Recall :0.646
F1-score :0.723
Cosine similarity : 0.489
Time taken to run this cell : 163.64250900000002


In [None]:
start = time.process_time()
classifier = OneVsRestClassifier(LogisticRegression(max_iter = 1000))
classifier.fit(X_train_bow, y_train_multi)
predictions = classifier.predict(X_test_bow)


res = print_metrics(y_test_multi, predictions)
print("Time taken to run this cell :", time.process_time() - start)

In [168]:
pos_from_tag = lambda tag: ('a' if tag[0].lower() == 'j' else tag[0].lower()) if tag[0].lower() in ['n', 'r', 'v'] else 'n'

def handle_body(text, 
                #tokenizer = nltk.RegexpTokenizer(r'\w+'), 
                stop_words = nltk.corpus.stopwords.words("english"), 
                lemmatizer = nltk.stem.WordNetLemmatizer()  ):
    
    POS_to_rm = ['RB','RBR','RBS','JJ','JJR','JJS','CD'] #Removing adverbs and adjectives and digits
    stop_words += ['.','€','$','?','\'s',',',':',';','=','+','-']  
    
    soup = BeautifulSoup(text, 'html.parser')
    tokens = nltk.word_tokenize( soup.get_text().lower() )
    
    tokens = [re.sub('[.,?!)()<>:;\ \"\'\]\[+-=\{\}\|^*@&`’]','',token).replace('\\','').replace('?','') for token in tokens]
    tokens = [token for token in tokens if token != '']
    tags = nltk.pos_tag(tokens)

    tokens = [ tokens[i] for i in range(len(tokens)) if ( (not tokens[i] in stop_words) and (not tags[i][1] in POS_to_rm ) )]
    
    tags = nltk.pos_tag(tokens)
    
    result = [lemmatizer.lemmatize(tokens[i], pos=pos_from_tag(tags[i][1])) for i in range(len(tokens))]
    
    return result 

def handle_title(text,
                 stop_words = nltk.corpus.stopwords.words("english"), 
                 lemmatizer = nltk.stem.WordNetLemmatizer()  ): 
    
    POS_to_rm = ['RB','RBR','RBS','JJ','JJR','JJS'] #Removing adverbs and adjectives
    stop_words += ['.','€','$','?','\'s',',',':',';','=','+','-']  
    
    tokens = re.split(' ', text.lower())
    tokens = [re.sub('[.,?!)()<>:;\ \"\'\]\[+-=\{\}\|^*@&`’]','',token).replace('\\','').replace('?','') for token in tokens]

    tokens = [token for token in tokens if token != '']
    tags = nltk.pos_tag(tokens)
    
    tokens = [ tokens[i] for i in range(len(tokens)) if ( (not tokens[i] in stop_words) and (not tags[i][1] in POS_to_rm ) )]
    
    tags = nltk.pos_tag(tokens)
    
    result = [lemmatizer.lemmatize(tokens[i], pos=pos_from_tag(tags[i][1])) for i in range(len(tokens))]
    
    return result 

def preprocess_text(body, title = None):
    body = handle_body(body)
    if title :
        title = handle_title(title)
        body = title + title + body
    return body


def predict_new_sentence(text, vectorizer, model, all_tags, title = None):
    all_tags = np.array(all_tags)
    text = np.array(preprocess_text(text, title=title)).reshape(1, -1)
    text = vectorizer.transform(text)
    pred = model.predict(text)
    
    assert pred[0].shape == all_tags.shape, 'the passed tags doesnot have same shape as model output'
    idx = [idx for idx in range(len(all_tags)) if pred[0][idx]==1]
    return all_tags[idx]


In [172]:
text = 'Hi, my name is Jack and I love potatoes'
title = ''
predict_new_sentence(text, bow_vectorizer, classifier, unique_tags, title=title )

array([], dtype=object)

In [67]:
unique_tags

array(['.net', 'android', 'arrays', 'asp.net', 'bash', 'c', 'c#', 'css',
       'git', 'html', 'iphone', 'java', 'javascript', 'jquery', 'linux',
       'mysql', 'objectivec', 'php', 'python', 'regex', 'sql',
       'sqlserver', 'string', 'windows'], dtype=object)

In [82]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
test = mlb.fit_transform(y_train_tags)
test[0]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0])

In [83]:
y_train_tags

5770          [c#, sqlserver]
36859            [javascript]
13756             [html, css]
10742    [javascript, arrays]
22065                  [java]
                 ...         
6265                 [iphone]
11284             [sqlserver]
38158        [sql, sqlserver]
860             [c#, asp.net]
15795                [python]
Name: Tags, Length: 31527, dtype: object