# This notebook's aim is to transform the pre-processed documents into vectors and do a supervised analysis

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Jeremynadal33/categorize_question_API/blob/master/supervised_approach.ipynb)


## Here, are compared supervised methods : 
## We will compare the results for two different vectorization methods : Bag of Word and Tfidf
First, import the relevant libraries : 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
import re
import time
import datetime as datetime

In [2]:
# Specific libraries : 
import nltk
from bs4 import BeautifulSoup


from sklearn.feature_extraction.text import CountVectorizer # BoW
from sklearn.feature_extraction.text import TfidfVectorizer # Tfidf


from sklearn.metrics import f1_score,precision_score,recall_score

from sklearn.multiclass import OneVsRestClassifier

from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [3]:
if 'google.colab' in str(get_ipython()):
  print('Running on CoLab')
  from google.colab import drive
  drive.mount('/content/gdrive',force_remount=True)
  !ls /content/gdrive/My\ Drive/Formation-OC/P5-Stackoverflow
  root_dir = '/content/gdrive/My Drive/Formation-OC/P5-Stackoverflow/'
  input_dir = root_dir + 'inputs/'
  png_dir = root_dir + 'pngs/'

  #my script
  %run /content/gdrive/My\ Drive/Formation-OC/P5-Stackoverflow/function.py
  !ls gdrive/MyDrive/Formation-OC/P5-Stackoverflow/
else:
  print('Not running on CoLab')
  #my script
  root_dir = '/Users/jeremynadal/Documents/Formation OC IML/P5-API/'
  input_dir = root_dir + 'inputs/'
  png_dir = root_dir + 'pngs/'
  model_dir = root_dir +'models/'
  from function import *

Not running on CoLab


In [4]:
data_example = pd.read_csv(input_dir+'posts.csv',nrows=100)

In [5]:
data = pd.read_csv(input_dir+'processed_dataset.csv')

print(data.shape)
print(data.dtypes)
data.head()

(39409, 3)
Tags              object
processed_text    object
nb_tags            int64
dtype: object


Unnamed: 0,Tags,processed_text,nb_tags
0,['c#'],"['convert', 'double', 'c#', 'convert', 'double...",1
1,"['c#', '.net']","['c#', 'calculate', 'someone', 'age', 'base', ...",2
2,['c#'],"['calculate', 'time', 'c#', 'calculate', 'time...",1
3,['html'],"['determine', 'user', 'timezone', 'determine',...",1
4,['.net'],"['difference', 'mathfloor', 'mathtruncate', 'd...",1


In [6]:
def reforme_tags_processed_text(data):
    tags = []
    processed_text = []
    for indx in range(data.shape[0]):
        new_tags = []
        new_processed = []
        
        split = data['Tags'][indx].split(',')
        for nb_tags in range(data['nb_tags'][indx]):    
            to_append = re.sub('[\[\]\'\"!*+-]','',split[nb_tags]).replace('\\','').replace(' ','')
            if not to_append in ['',' '] : new_tags.append(to_append)
        tags.append(new_tags)
        
        text = data['processed_text'][indx].split(',')
        for i in range(len(text)):
            to_append = re.sub('[\[\]\'\"!*+-:.]','',text[i]).replace('\\','').replace(' ','')
            if not to_append in ['',' '] : new_processed.append(to_append)
        processed_text.append(new_processed)
        
    data['Tags'] = tags
    data['processed_text'] = processed_text
    return data

In [7]:
data = reforme_tags_processed_text(data)
data.head()

Unnamed: 0,Tags,processed_text,nb_tags
0,[c#],"[convert, double, c#, convert, double, c#, wan...",1
1,"[c#, .net]","[c#, calculate, someone, age, base, datetime, ...",2
2,[c#],"[calculate, time, c#, calculate, time, c#, giv...",1
3,[html],"[determine, user, timezone, determine, user, t...",1
4,[.net],"[difference, mathfloor, mathtruncate, differen...",1


In [8]:
def get_all_tags(tags):
    res = []
    for i in range(len(tags)):
        for j in range(len(tags[i])):
            res.append(tags[i][j])
            
    return pd.Series(res)

tags = get_all_tags(data['Tags'])

In [9]:
unique_tags = np.unique(tags)

In [10]:
unique_tags

array(['.net', 'android', 'arrays', 'asp.net', 'bash', 'c', 'c#', 'css',
       'git', 'html', 'iphone', 'java', 'javascript', 'jquery', 'linux',
       'mysql', 'objectivec', 'php', 'python', 'regex', 'sql',
       'sqlserver', 'string', 'windows'], dtype=object)

In [11]:
for tag in unique_tags : 
    data[tag] = 0

In [12]:
for idx in data.index:
    for id_tag in range(data['nb_tags'][idx]):
        data[data['Tags'][idx][id_tag]][idx] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
data.head()

Unnamed: 0,Tags,processed_text,nb_tags,.net,android,arrays,asp.net,bash,c,c#,...,linux,mysql,objectivec,php,python,regex,sql,sqlserver,string,windows
0,[c#],"[convert, double, c#, convert, double, c#, wan...",1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,"[c#, .net]","[c#, calculate, someone, age, base, datetime, ...",2,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,[c#],"[calculate, time, c#, calculate, time, c#, giv...",1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,[html],"[determine, user, timezone, determine, user, t...",1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,[.net],"[difference, mathfloor, mathtruncate, differen...",1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Lets separate the data into train and test for both bow and tfidf

In [15]:
bow_vectorizer = CountVectorizer(tokenizer = lambda x: x,
                                 preprocessor = lambda x: x,
                                 lowercase = False,
                                 max_features = 1000,
                                 binary = True,
                                 max_df = 0.9
                                 )  
bow_X = bow_vectorizer.fit_transform(data['processed_text'])
print(len(bow_vectorizer.get_feature_names()))
print(bow_X.toarray().shape)

1000
(39409, 1000)


In [14]:
def _nothing(x):
    return x


tfidf_vectorizer = TfidfVectorizer(tokenizer = lambda x: x,
                                   preprocessor = lambda x: x,
                                   lowercase = False,
                                   max_features = 1000,
                                   max_df = 0.9
                                   )  
tfidf_X = tfidf_vectorizer.fit_transform(data['processed_text'])
data.info()
tfidf_X.toarray().shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39409 entries, 0 to 39408
Data columns (total 27 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Tags            39409 non-null  object
 1   processed_text  39409 non-null  object
 2   nb_tags         39409 non-null  int64 
 3   .net            39409 non-null  int64 
 4   android         39409 non-null  int64 
 5   arrays          39409 non-null  int64 
 6   asp.net         39409 non-null  int64 
 7   bash            39409 non-null  int64 
 8   c               39409 non-null  int64 
 9   c#              39409 non-null  int64 
 10  css             39409 non-null  int64 
 11  git             39409 non-null  int64 
 12  html            39409 non-null  int64 
 13  iphone          39409 non-null  int64 
 14  java            39409 non-null  int64 
 15  javascript      39409 non-null  int64 
 16  jquery          39409 non-null  int64 
 17  linux           39409 non-null  int64 
 18  mysql 

(39409, 1000)

In [19]:
test = data[['processed_text']]
test.info()
test.to_csv('inputs/'+'processed_text.csv',index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39409 entries, 0 to 39408
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   processed_text  39409 non-null  object
dtypes: object(1)
memory usage: 308.0+ KB


In [23]:
X_train_bow, X_test_bow, y_train_multi, y_test_multi = train_test_split(bow_X.toarray(), 
                                                                     data[unique_tags], 
                                                                     test_size=0.2, 
                                                                     random_state=42)
X_train_tfidf, X_test_tfidf, y_train_tags, y_test_tags = train_test_split(tfidf_X.toarray(), 
                                                                            data['Tags'], 
                                                                            test_size=0.2, 
                                                                            random_state=42)
print('Train dataset : X.shape = ' , X_train_bow.shape, ' Y_multi.shape = ', y_train_multi.shape)
print('Test dataset : X.shape = ' , X_test_bow.shape, ' Y_tags.shape = ', y_test_tags.shape)

Train dataset : X.shape =  (31527, 1000)  Y_multi.shape =  (31527, 24)
Test dataset : X.shape =  (7882, 1000)  Y_tags.shape =  (7882,)


## We define some metrics to compare the different models 

In [20]:
def cosine_similarity(x, y):
    x = np.array(x)
    y = np.array(y)
    assert x.shape == y.shape , 'x and y doesnt have same shape'
    assert len(x.shape)==2 , 'x and y must be matrixes [nb_samp,x], if only 1 sample use : np.reshape(1,-1)'
    cosin = []
    for idx in range(x.shape[0]):
        if (np.dot(x[idx], x[idx]) !=0 and np.dot(y[idx], y[idx]) != 0 ) :
            cosin.append(np.dot(x[idx], y[idx]) / (np.sqrt(np.dot(x[idx], x[idx])) * np.sqrt(np.dot(y[idx], y[idx]))))
        elif (np.dot(x[idx], x[idx]) ==0 and np.dot(y[idx], y[idx]) == 0 ) : 
            cosin.append(1)
        else:
            cosin.append(-1)
    return np.mean(cosin)

def print_metrics(y_true, pred):
    '''Prints and return a summary of results'''
    y_true = np.array(y_true)
    pred = np.array(pred)
    assert y_true.shape == pred.shape, 'arrays doesnt have same shape'
    
    res = [metrics.accuracy_score(y_true, pred),
           metrics.hamming_loss(y_true,pred),
           precision_score(y_true, pred, average='micro'),
           recall_score(y_true, pred, average='micro'),
           f1_score(y_true, pred, average='micro'),
           cosine_similarity(y_true,pred)]
    
    print("Accuracy :{:.3}\nHamming loss :{:.4}\n\nMicro-averaged quality metrics :\nPrecision :{:.3}\nRecall :{:.3}\nF1-score :{:.3}\nCosine similarity : {:.3}".format(*res))
    return res

## Lets compare different models on both vectorizers (bow and tfidf)
* SGD 
* Logistic regression 
* Random Forest
* XGBoost 
* LightGBM

Lets put results into a csv file

In [36]:
name = 'results.csv'
if os.path.exists(input_dir+name):
  classifier_comparison = pd.read_csv(input_dir+name)
else : 
    classifier_comparison = pd.DataFrame(columns = ['model','vectorizer','acc','hamm','precision','recall','F1','cosine','tps'])
classifier_comparison

Unnamed: 0,model,vectorizer,acc,hamm,precision,recall,F1,cosine,tps
0,SGD,bow,0.451789,0.029397,0.85899,0.545721,0.667424,0.272044,38.429488
1,SGD,tfidf,0.318066,0.036354,0.904348,0.366161,0.521267,-0.136463,32.303356
2,RegLog,bow,0.512687,0.026722,0.821278,0.646259,0.723332,0.488656,177.660644
3,RegLog,tfidf,0.493783,0.02696,0.882634,0.578093,0.698617,0.338966,116.288684
4,RF,bow,0.486679,0.027441,0.900796,0.553252,0.685489,0.325105,493.970196
5,RF,tfidf,0.515478,0.025818,0.903094,0.585134,0.710148,0.380185,1099.684074
6,lgbm,bow,0.541233,0.02484,0.845721,0.661027,0.742054,0.52304,270.265418
7,lgbm,tfidf,0.565592,0.023408,0.864179,0.672665,0.756489,0.543978,730.452619
8,xgboost,bow,0.533113,0.025158,0.84931,0.649878,0.736329,0.49584,7738.712713
9,xgboost,tfidf,0.549734,0.024127,0.860895,0.660342,0.747399,0.51389,8050.586103


In [18]:
# Train SGD with bow
start = time.process_time()
classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l1'))
classifier.fit(X_train_bow, y_train_multi)
predictions = classifier.predict(X_test_bow)


res = print_metrics(y_test_multi, predictions)
tps = time.process_time() - start
print("Time taken to run this cell :", tps)
dico = {'model':'SGD',
      'vectorizer':'bow',
      'acc':res[0],
      'hamm':res[1],
      'precision':res[2],
      'recall':res[3],
      'F1':res[4],
      'cosine':res[5],
      'tps':tps}
classifier_comparison = classifier_comparison.append(dico,ignore_index=True)
classifier_comparison


Accuracy :0.452
Hamming loss :0.0294

Micro-averaged quality metrics :
Precision :0.859
Recall :0.546
F1-score :0.667
Cosine similarity : 0.272
Time taken to run this cell : 38.429488000999996


Unnamed: 0,model,vectorizer,acc,hamm,precision,recall,F1,cosine,tps
0,SGD,bow,0.451789,0.029397,0.85899,0.545721,0.667424,0.272044,38.429488


In [20]:
# Train SGD with tfidf
start = time.process_time()
classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.001, penalty='l1'))
classifier.fit(X_train_tfidf, y_train_multi)
predictions = classifier.predict(X_test_tfidf)


res = print_metrics(y_test_multi, predictions)
tps = time.process_time() - start
print("Time taken to run this cell :", tps)
dico = {'model':'SGD',
      'vectorizer':'tfidf',
      'acc':res[0],
      'hamm':res[1],
      'precision':res[2],
      'recall':res[3],
      'F1':res[4],
      'cosine':res[5],
      'tps':tps}
classifier_comparison = classifier_comparison.append(dico,ignore_index=True)
classifier_comparison


Accuracy :0.318
Hamming loss :0.03635

Micro-averaged quality metrics :
Precision :0.904
Recall :0.366
F1-score :0.521
Cosine similarity : -0.136
Time taken to run this cell : 32.303356313999984


Unnamed: 0,model,vectorizer,acc,hamm,precision,recall,F1,cosine,tps
0,SGD,bow,0.451789,0.029397,0.85899,0.545721,0.667424,0.272044,38.429488
1,RegLog,bow,0.512687,0.026722,0.821278,0.646259,0.723332,0.488656,177.660644
2,SGD,tfidf,0.318066,0.036354,0.904348,0.366161,0.521267,-0.136463,32.303356


In [19]:
# Train RegLog with bow
start = time.process_time()
classifier = OneVsRestClassifier(LogisticRegression(max_iter = 1000))
classifier.fit(X_train_bow, y_train_multi)
predictions = classifier.predict(X_test_bow)


res = print_metrics(y_test_multi, predictions)
tps = time.process_time() - start
print("Time taken to run this cell :", tps)
dico = {'model':'RegLog',
      'vectorizer':'bow',
      'acc':res[0],
      'hamm':res[1],
      'precision':res[2],
      'recall':res[3],
      'F1':res[4],
      'cosine':res[5],
      'tps':tps}
classifier_comparison = classifier_comparison.append(dico,ignore_index=True)
classifier_comparison

Accuracy :0.513
Hamming loss :0.02672

Micro-averaged quality metrics :
Precision :0.821
Recall :0.646
F1-score :0.723
Cosine similarity : 0.489
Time taken to run this cell : 177.660643808


Unnamed: 0,model,vectorizer,acc,hamm,precision,recall,F1,cosine,tps
0,SGD,bow,0.451789,0.029397,0.85899,0.545721,0.667424,0.272044,38.429488
1,RegLog,bow,0.512687,0.026722,0.821278,0.646259,0.723332,0.488656,177.660644


In [22]:
# Train RegLog with tfidf
start = time.process_time()
classifier = OneVsRestClassifier(LogisticRegression(max_iter = 1000))
classifier.fit(X_train_tfidf, y_train_multi)
predictions = classifier.predict(X_test_tfidf)


res = print_metrics(y_test_multi, predictions)
tps = time.process_time() - start
print("Time taken to run this cell :", tps)
dico = {'model':'RegLog',
      'vectorizer':'tfidf',
      'acc':res[0],
      'hamm':res[1],
      'precision':res[2],
      'recall':res[3],
      'F1':res[4],
      'cosine':res[5],
      'tps':tps}
classifier_comparison = classifier_comparison.append(dico,ignore_index=True)
classifier_comparison

Accuracy :0.494
Hamming loss :0.02696

Micro-averaged quality metrics :
Precision :0.883
Recall :0.578
F1-score :0.699
Cosine similarity : 0.339
Time taken to run this cell : 116.28868398600002


Unnamed: 0,model,vectorizer,acc,hamm,precision,recall,F1,cosine,tps
0,SGD,bow,0.451789,0.029397,0.85899,0.545721,0.667424,0.272044,38.429488
1,RegLog,bow,0.512687,0.026722,0.821278,0.646259,0.723332,0.488656,177.660644
2,SGD,tfidf,0.318066,0.036354,0.904348,0.366161,0.521267,-0.136463,32.303356
3,RegLog,tfidf,0.493783,0.02696,0.882634,0.578093,0.698617,0.338966,116.288684


In [23]:
classifier_comparison.to_csv(input_dir+name, index = False)

In [24]:
# Train RandomForest with bow
start = time.process_time()
classifier = OneVsRestClassifier(RandomForestClassifier(random_state=42))
classifier.fit(X_train_bow, y_train_multi)
predictions = classifier.predict(X_test_bow)


res = print_metrics(y_test_multi, predictions)
tps = time.process_time() - start
print("Time taken to run this cell :", tps)
dico = {'model':'RF',
      'vectorizer':'bow',
      'acc':res[0],
      'hamm':res[1],
      'precision':res[2],
      'recall':res[3],
      'F1':res[4],
      'cosine':res[5],
      'tps':tps}
classifier_comparison = classifier_comparison.append(dico,ignore_index=True)
classifier_comparison

Accuracy :0.487
Hamming loss :0.02744

Micro-averaged quality metrics :
Precision :0.901
Recall :0.553
F1-score :0.685
Cosine similarity : 0.325
Time taken to run this cell : 493.97019612


Unnamed: 0,model,vectorizer,acc,hamm,precision,recall,F1,cosine,tps
0,SGD,bow,0.451789,0.029397,0.85899,0.545721,0.667424,0.272044,38.429488
1,RegLog,bow,0.512687,0.026722,0.821278,0.646259,0.723332,0.488656,177.660644
2,SGD,tfidf,0.318066,0.036354,0.904348,0.366161,0.521267,-0.136463,32.303356
3,RegLog,tfidf,0.493783,0.02696,0.882634,0.578093,0.698617,0.338966,116.288684
4,RF,bow,0.486679,0.027441,0.900796,0.553252,0.685489,0.325105,493.970196


In [25]:
# Train RandomForest with tfidf
start = time.process_time()
classifier = OneVsRestClassifier(RandomForestClassifier(random_state=42))
classifier.fit(X_train_tfidf, y_train_multi)
predictions = classifier.predict(X_test_tfidf)


res = print_metrics(y_test_multi, predictions)
tps = time.process_time() - start
print("Time taken to run this cell :", tps)
dico = {'model':'RF',
      'vectorizer':'tfidf',
      'acc':res[0],
      'hamm':res[1],
      'precision':res[2],
      'recall':res[3],
      'F1':res[4],
      'cosine':res[5],
      'tps':tps}
classifier_comparison = classifier_comparison.append(dico,ignore_index=True)
classifier_comparison

Accuracy :0.515
Hamming loss :0.02582

Micro-averaged quality metrics :
Precision :0.903
Recall :0.585
F1-score :0.71
Cosine similarity : 0.38
Time taken to run this cell : 1099.6840743859998


Unnamed: 0,model,vectorizer,acc,hamm,precision,recall,F1,cosine,tps
0,SGD,bow,0.451789,0.029397,0.85899,0.545721,0.667424,0.272044,38.429488
1,RegLog,bow,0.512687,0.026722,0.821278,0.646259,0.723332,0.488656,177.660644
2,SGD,tfidf,0.318066,0.036354,0.904348,0.366161,0.521267,-0.136463,32.303356
3,RegLog,tfidf,0.493783,0.02696,0.882634,0.578093,0.698617,0.338966,116.288684
4,RF,bow,0.486679,0.027441,0.900796,0.553252,0.685489,0.325105,493.970196
5,RF,tfidf,0.515478,0.025818,0.903094,0.585134,0.710148,0.380185,1099.684074


In [26]:
classifier_comparison.to_csv(input_dir+name, index = False)

In [18]:
start = time.process_time()
classifier = OneVsRestClassifier(lightgbm.LGBMClassifier(random_state=42))
classifier.fit(X_train_bow, y_train_multi)
predictions = classifier.predict(X_test_bow)


res = print_metrics(y_test_multi, predictions)
tps = time.process_time() - start
print("Time taken to run this cell :", tps)
dico = {'model':'lgbm',
      'vectorizer':'bow',
      'acc':res[0],
      'hamm':res[1],
      'precision':res[2],
      'recall':res[3],
      'F1':res[4],
      'cosine':res[5],
      'tps':tps}
classifier_comparison = classifier_comparison.append(dico,ignore_index=True)
classifier_comparison

Accuracy :0.541
Hamming loss :0.02484

Micro-averaged quality metrics :
Precision :0.846
Recall :0.661
F1-score :0.742
Cosine similarity : 0.523
Time taken to run this cell : 270.26541799999995


Unnamed: 0.1,Unnamed: 0,model,vectorizer,acc,hamm,precision,recall,F1,cosine,tps
0,0.0,SGD,bow,0.451789,0.029397,0.85899,0.545721,0.667424,0.272044,38.429488
1,1.0,RegLog,bow,0.512687,0.026722,0.821278,0.646259,0.723332,0.488656,177.660644
2,2.0,SGD,tfidf,0.318066,0.036354,0.904348,0.366161,0.521267,-0.136463,32.303356
3,3.0,RegLog,tfidf,0.493783,0.02696,0.882634,0.578093,0.698617,0.338966,116.288684
4,4.0,RF,bow,0.486679,0.027441,0.900796,0.553252,0.685489,0.325105,493.970196
5,5.0,RF,tfidf,0.515478,0.025818,0.903094,0.585134,0.710148,0.380185,1099.684074
6,,lgbm,bow,0.541233,0.02484,0.845721,0.661027,0.742054,0.52304,270.265418


In [23]:
classifier_comparison.to_csv(input_dir+name, index = False)

In [57]:
start = time.process_time()
classifier = OneVsRestClassifier(lightgbm.LGBMClassifier(random_state=42))
classifier.fit(X_train_tfidf, y_train_multi)
predictions = classifier.predict(X_test_tfidf)


res = print_metrics(y_test_multi, predictions)
tps = time.process_time() - start
print("Time taken to run this cell :", tps)
dico = {'model':'lgbm',
      'vectorizer':'tfidf',
      'acc':res[0],
      'hamm':res[1],
      'precision':res[2],
      'recall':res[3],
      'F1':res[4],
      'cosine':res[5],
      'tps':tps}
#classifier_comparison = classifier_comparison.append(dico,ignore_index=True)
classifier_comparison

Accuracy :0.566
Hamming loss :0.02341

Micro-averaged quality metrics :
Precision :0.864
Recall :0.673
F1-score :0.756
Cosine similarity : 0.544
Time taken to run this cell : 738.3291649999992


Unnamed: 0,model,vectorizer,acc,hamm,precision,recall,F1,cosine,tps
0,SGD,bow,0.451789,0.029397,0.85899,0.545721,0.667424,0.272044,38.429488
1,SGD,tfidf,0.318066,0.036354,0.904348,0.366161,0.521267,-0.136463,32.303356
2,RegLog,bow,0.512687,0.026722,0.821278,0.646259,0.723332,0.488656,177.660644
3,RegLog,tfidf,0.493783,0.02696,0.882634,0.578093,0.698617,0.338966,116.288684
4,RF,bow,0.486679,0.027441,0.900796,0.553252,0.685489,0.325105,493.970196
5,RF,tfidf,0.515478,0.025818,0.903094,0.585134,0.710148,0.380185,1099.684074
6,lgbm,bow,0.541233,0.02484,0.845721,0.661027,0.742054,0.52304,270.265418
7,lgbm,tfidf,0.565592,0.023408,0.864179,0.672665,0.756489,0.543978,730.452619
8,xgboost,bow,0.533113,0.025158,0.84931,0.649878,0.736329,0.49584,7738.712713
9,xgboost,tfidf,0.549734,0.024127,0.860895,0.660342,0.747399,0.51389,8050.586103


In [27]:
start = time.process_time()
classifier = OneVsRestClassifier(xgboost.XGBClassifier(random_state=42))
classifier.fit(X_train_bow, y_train_multi)
predictions = classifier.predict(X_test_bow)


res = print_metrics(y_test_multi, predictions)
tps = time.process_time() - start
print("Time taken to run this cell :", tps)
dico = {'model':'xgboost',
      'vectorizer':'bow',
      'acc':res[0],
      'hamm':res[1],
      'precision':res[2],
      'recall':res[3],
      'F1':res[4],
      'cosine':res[5],
      'tps':tps}
classifier_comparison = classifier_comparison.append(dico,ignore_index=True)
classifier_comparison

Accuracy :0.533
Hamming loss :0.02516

Micro-averaged quality metrics :
Precision :0.849
Recall :0.65
F1-score :0.736
Cosine similarity : 0.496
Time taken to run this cell : 7738.712713000001


Unnamed: 0,model,vectorizer,acc,hamm,precision,recall,F1,cosine,tps
0,SGD,bow,0.451789,0.029397,0.85899,0.545721,0.667424,0.272044,38.429488
1,RegLog,bow,0.512687,0.026722,0.821278,0.646259,0.723332,0.488656,177.660644
2,SGD,tfidf,0.318066,0.036354,0.904348,0.366161,0.521267,-0.136463,32.303356
3,RegLog,tfidf,0.493783,0.02696,0.882634,0.578093,0.698617,0.338966,116.288684
4,RF,bow,0.486679,0.027441,0.900796,0.553252,0.685489,0.325105,493.970196
5,RF,tfidf,0.515478,0.025818,0.903094,0.585134,0.710148,0.380185,1099.684074
6,lgbm,bow,0.541233,0.02484,0.845721,0.661027,0.742054,0.52304,270.265418
7,lgbm,tfidf,0.565592,0.023408,0.864179,0.672665,0.756489,0.543978,730.452619
8,xgboost,bow,0.533113,0.025158,0.84931,0.649878,0.736329,0.49584,7738.712713


In [28]:
start = time.process_time()
classifier = OneVsRestClassifier(xgboost.XGBClassifier(random_state=42))
classifier.fit(X_train_tfidf, y_train_multi)
predictions = classifier.predict(X_test_tfidf)


res = print_metrics(y_test_multi, predictions)
tps = time.process_time() - start
print("Time taken to run this cell :", tps)
dico = {'model':'xgboost',
      'vectorizer':'tfidf',
      'acc':res[0],
      'hamm':res[1],
      'precision':res[2],
      'recall':res[3],
      'F1':res[4],
      'cosine':res[5],
      'tps':tps}
classifier_comparison = classifier_comparison.append(dico,ignore_index=True)
classifier_comparison

Accuracy :0.55
Hamming loss :0.02413

Micro-averaged quality metrics :
Precision :0.861
Recall :0.66
F1-score :0.747
Cosine similarity : 0.514
Time taken to run this cell : 8050.586103


Unnamed: 0,model,vectorizer,acc,hamm,precision,recall,F1,cosine,tps
0,SGD,bow,0.451789,0.029397,0.85899,0.545721,0.667424,0.272044,38.429488
1,RegLog,bow,0.512687,0.026722,0.821278,0.646259,0.723332,0.488656,177.660644
2,SGD,tfidf,0.318066,0.036354,0.904348,0.366161,0.521267,-0.136463,32.303356
3,RegLog,tfidf,0.493783,0.02696,0.882634,0.578093,0.698617,0.338966,116.288684
4,RF,bow,0.486679,0.027441,0.900796,0.553252,0.685489,0.325105,493.970196
5,RF,tfidf,0.515478,0.025818,0.903094,0.585134,0.710148,0.380185,1099.684074
6,lgbm,bow,0.541233,0.02484,0.845721,0.661027,0.742054,0.52304,270.265418
7,lgbm,tfidf,0.565592,0.023408,0.864179,0.672665,0.756489,0.543978,730.452619
8,xgboost,bow,0.533113,0.025158,0.84931,0.649878,0.736329,0.49584,7738.712713
9,xgboost,tfidf,0.549734,0.024127,0.860895,0.660342,0.747399,0.51389,8050.586103


In [29]:
classifier_comparison.to_csv(input_dir+name, index = False)

In [61]:
import joblib

joblib.dump(classifier, model_dir + 'lightgbm_tfidf.bin')

['/Users/jeremynadal/Documents/Formation OC IML/P5-API/models/lightgbm_tfidf.bin']

In [33]:
test = joblib.load(model_dir + 'xgboost_tfidf.bin')
predictions = test.predict(X_test_tfidf)

res = print_metrics(y_test_multi, predictions)

Accuracy :0.55
Hamming loss :0.02413

Micro-averaged quality metrics :
Precision :0.861
Recall :0.66
F1-score :0.747
Cosine similarity : 0.514


In [34]:
import pickle 

pickle.dump(classifier, open(model_dir+'xgboost_tfidf_pickle.pkl','wb'))

In [35]:
test = pickle.load(open(model_dir+'xgboost_tfidf_pickle.pkl','rb'))
predictions = test.predict(X_test_tfidf)

res = print_metrics(y_test_multi, predictions)

Accuracy :0.55
Hamming loss :0.02413

Micro-averaged quality metrics :
Precision :0.861
Recall :0.66
F1-score :0.747
Cosine similarity : 0.514


## Lets create a function that is able to predict tags directly from a new sentence or a title/body pairs

In [106]:
pos_from_tag = lambda tag: ('a' if tag[0].lower() == 'j' else tag[0].lower()) if tag[0].lower() in ['n', 'r', 'v'] else 'n'

def handle_body(text, 
                #tokenizer = nltk.RegexpTokenizer(r'\w+'), 
                stop_words = nltk.corpus.stopwords.words("english"), 
                lemmatizer = nltk.stem.WordNetLemmatizer()  ):
    
    POS_to_rm = ['RB','RBR','RBS','JJ','JJR','JJS','CD'] #Removing adverbs and adjectives and digits
    stop_words += ['.','€','$','?','\'s',',',':',';','=','+','-']  
    
    soup = BeautifulSoup(text, 'html.parser')
    tokens = nltk.word_tokenize( soup.get_text().lower() )
    
    tokens = [re.sub('[.,?!)()<>:;\ \"\'\]\[+-=\{\}\|^*@&`’]','',token).replace('\\','').replace('?','') for token in tokens]
    tokens = [token for token in tokens if token != '']
    tags = nltk.pos_tag(tokens)

    tokens = [ tokens[i] for i in range(len(tokens)) if ( (not tokens[i] in stop_words) and (not tags[i][1] in POS_to_rm ) )]
    
    tags = nltk.pos_tag(tokens)
    
    result = [lemmatizer.lemmatize(tokens[i], pos=pos_from_tag(tags[i][1])) for i in range(len(tokens))]
    
    return result 

def handle_title(text,
                 stop_words = nltk.corpus.stopwords.words("english"), 
                 lemmatizer = nltk.stem.WordNetLemmatizer()  ): 
    
    POS_to_rm = ['RB','RBR','RBS','JJ','JJR','JJS'] #Removing adverbs and adjectives
    stop_words += ['.','€','$','?','\'s',',',':',';','=','+','-']  
    
    tokens = re.split(' ', text.lower())
    tokens = [re.sub('[.,?!)()<>:;\ \"\'\]\[=\{\}\|^*@&`’]','',token).replace('\\','').replace('?','') for token in tokens]

    tokens = [token for token in tokens if token != '']
    tags = nltk.pos_tag(tokens)
    
    tokens = [ tokens[i] for i in range(len(tokens)) if ( (not tokens[i] in stop_words) and (not tags[i][1] in POS_to_rm ) )]
    
    tags = nltk.pos_tag(tokens)
    
    result = [lemmatizer.lemmatize(tokens[i], pos=pos_from_tag(tags[i][1])) for i in range(len(tokens))]
    
    return result 

def preprocess_text(body, title = None):
    body = handle_body(body)
    if title :
        title = handle_title(title)
        body = title + title + body
    return body


def predict_new_sentence(text, vectorizer, model, all_tags, title = None):
    all_tags = np.array(all_tags)
    text = np.array(preprocess_text(text, title=title)).reshape(1, -1)
    print(text)
    text = vectorizer.transform(text)
    print(text)
    pred = model.predict(text)
    
    assert pred[0].shape == all_tags.shape, 'the passed tags doesnot have same shape as model output'
    idx = [idx for idx in range(len(all_tags)) if pred[0][idx]==1]
    return all_tags[idx]


In [54]:
joblib.dump(tfidf_vectorizer, model_dir + 'vectorizer_tfidf.bin')
#with open('vectorizer.pk', 'wb') as fin:
#    pickle.dump(tfidf_vectorizer, fin)

['/Users/jeremynadal/Documents/Formation OC IML/P5-API/models/vectorizer_tfidf.bin']

In [55]:
test = joblib.load(model_dir + 'vectorizer_tfidf.bin')

In [107]:
text = 'Hi, my name is Jack and I love potatoes c++'
title = 'c++'
predict_new_sentence(text, tfidf_vectorizer, classifier, unique_tags, title=title )

[['c++' 'c++' 'hi' 'name' 'jack' 'love' 'potato' 'c']]


TypeError: cannot use a string pattern on a bytes-like object

In [58]:
joblib.dump(unique_tags, model_dir + 'unique_tags.bin')

['/Users/jeremynadal/Documents/Formation OC IML/P5-API/models/unique_tags.bin']

In [60]:
test = joblib.load(model_dir + 'unique_tags.bin')
test[0]

'.net'

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
test = mlb.fit_transform(y_train_tags)
test[0]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0])

In [63]:
predict_new_sentence(X_train_tfidf[0],tfidf_vectorizer,classifier,unique_tags)

  if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
  elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
  elif data[:3] == b'\xef\xbb\xbf':
  elif data[:4] == b'\x00\x00\xfe\xff':
  elif data[:4] == b'\xff\xfe\x00\x00':
  if isinstance(markup, str) or markup == '':


TypeError: cannot use a string pattern on a bytes-like object

In [85]:
id = 5
print(data_example['Body'][id])
print(data_example['Tags'][id])

<p>How do you expose a LINQ query as an ASMX web service? Usually, from the business tier, I can return a typed <code>DataSet</code> or <code>DataTable</code> which can be serialized for transport over ASMX.</p>

<p>How can I do the same for a LINQ query? Is there a way to populate a typed <code>DataSet</code> or <code>DataTable</code> via a LINQ query?</p>

<pre><code>public static MyDataTable CallMySproc()
{
    string conn = "...";

    MyDatabaseDataContext db = new MyDatabaseDataContext(conn);
    MyDataTable dt = new MyDataTable();

    // execute a sproc via LINQ
    var query = from dr
                in db.MySproc().AsEnumerable
                select dr;

    // copy LINQ query resultset into a DataTable -this does not work !    
    dt = query.CopyToDataTable();

    return dt;
}
</code></pre>

<p>How can I get the result set of a LINQ query into a <code>DataSet</code> or <code>DataTable</code>? Alternatively, is the LINQ query serializable so that I can expose it as an ASMX

In [82]:
unique_tags

array(['.net', 'android', 'arrays', 'asp.net', 'bash', 'c', 'c#', 'css',
       'git', 'html', 'iphone', 'java', 'javascript', 'jquery', 'linux',
       'mysql', 'objectivec', 'php', 'python', 'regex', 'sql',
       'sqlserver', 'string', 'windows'], dtype=object)

In [89]:
data_example[data_example['python' in data[:100]['Tags']]]

KeyError: False

## Lets do a grid search over lightgbm hyperparameters


In [59]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform


param_test ={'estimator__num_leaves': sp_randint(6, 50), 
             'estimator__min_child_samples': sp_randint(100, 500), 
             'estimator__min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'estimator__subsample': sp_uniform(loc=0.2, scale=0.8), 
             'estimator__reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'estimator__reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}


In [68]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV

#n_estimators is set to a "large value". The actual number of trees build will depend on early stopping and 400 define only the absolute maximum
classifier = OneVsRestClassifier(lightgbm.LGBMClassifier(random_state=42,n_estimators = 100))

randomized_search = RandomizedSearchCV(estimator=classifier, 
                                       param_distributions=param_test, 
                                       n_iter=20,
                                       scoring='f1_micro',
                                       cv=2,
                                       refit=True,
                                       random_state=42,
                                       verbose=True)

In [57]:
classifier.get_params().keys()

dict_keys(['estimator__boosting_type', 'estimator__class_weight', 'estimator__colsample_bytree', 'estimator__importance_type', 'estimator__learning_rate', 'estimator__max_depth', 'estimator__min_child_samples', 'estimator__min_child_weight', 'estimator__min_split_gain', 'estimator__n_estimators', 'estimator__n_jobs', 'estimator__num_leaves', 'estimator__objective', 'estimator__random_state', 'estimator__reg_alpha', 'estimator__reg_lambda', 'estimator__silent', 'estimator__subsample', 'estimator__subsample_for_bin', 'estimator__subsample_freq', 'estimator', 'n_jobs'])

In [None]:
start = time.process_time()


randomized_search.fit(X_train_tfidf, y_train_multi)
print('Best score reached: {} with params: {} '.format(randomized_search.best_score_, randomized_search.best_params_))


#res = print_metrics(y_test_multi, predictions)
tps = time.process_time() - start
print("Time taken to run this cell :", tps)

Fitting 2 folds for each of 20 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
