In [3]:
from config import RAW_DATA_PATH
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

import os

Load the dataframe and tokenized documents

In [4]:
df = pd.read_csv('data/preprocessing/document_tokens_labelled.csv', sep=',')

In [5]:
df

Unnamed: 0,document_tokens,label
0,commission|accès|document|administratif|examin...,défavorable
1,commission|accès|document|administratif|examin...,défavorable
2,commission|accès|document|administratif|examin...,défavorable
3,commission|accès|document|administratif|examin...,
4,commission|accès|document|administratif|examin...,
...,...,...
48741,monsieur|x|saisir|commission|accès|document|ad...,favorable
48742,monsieur|x|saisir|commission|accès|document|ad...,sans objet
48743,maître|x|x|saisir|commission|accès|document|ad...,sans objet
48744,Monsieur|x|x|saisir|commission|accès|document|...,sans objet


Remove unlabelled documents.

In [6]:
df.dropna(inplace=True)

In [7]:
len(df)

40454

In [8]:
df.label.value_counts()

favorable      26940
sans objet      9849
défavorable     3665
Name: label, dtype: int64

# Pipeline

Create a pipeline that will be applied to Train and Test separately

## Split into Train and Test
Do it upfront in order to avoid data leakage. Also we need to stratify the test sample by label class.

In [9]:
df_train, df_test, y_train, y_test = train_test_split(
    df.document_tokens, df.label, test_size=0.30, #stratify=[0.5],
)

 ## Encode the labels
 Important to encode the labels on training only (again, prevent leakage).


In [10]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

label_encoder.fit(y_train)

y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

In [11]:
label_encoder.classes_

array(['défavorable', 'favorable', 'sans objet'], dtype=object)

In [12]:
unique, counts = np.unique(y_train, return_counts=True)

In [13]:
dict(zip(unique, counts))

{0: 2524, 1: 18882, 2: 6911}

In [15]:
MOST_POPULAR_CLASS = 1

## Build the components of the training pipeline

Start with a document vectorizer.

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    analyzer='word',
    max_df=10000,  # ignore tokens that appear more than X times in the document collection
    max_features=10000,   # capping on vocabulary size
)

## Define the models

### Choose a baseline naive model

What is the simplest model you could think of? Think of a model that always predicts the most popular class
from the train data.

In [17]:
class NaiveClassifier():
    "Always predict the most popular class"
    
    # we know from label_encoder that most popular class Favorable takes index 1.
    most_popular_class = MOST_POPULAR_CLASS
    
    def fit(self, X, y=None, **fit_params):
        return
    
    def transform(self):
        return
    
    def predict(self, X):
        return np.array([self.most_popular_class] * X.shape[0])
    

In [18]:
naive_classifier = NaiveClassifier()

### Choose a baseline ML model

Now choose a ML classifier, the simplest one.

In [19]:
from sklearn.naive_bayes import MultinomialNB

naive_bayes = MultinomialNB()

### Non parametric ML

In [48]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=150)

## Assemble pipeline components

In [21]:
pipeline_naive_classifier = Pipeline(steps=[
    ('tfidf', tfidf),
    ('naive_classifier', naive_classifier),
])

In [22]:
pipeline_naive_classifier

In [23]:
pipeline_baseline_ml = Pipeline(steps=[
    ('tfidf', tfidf),
    ('naive_bayes', naive_bayes),
])

In [24]:
pipeline_baseline_ml

In [49]:
pipeline_random_forest = Pipeline(steps=[
    ('tfidf', tfidf),
    ('random_forest', random_forest),
])

## Train the whole pipeline
Remark: no need to fit the naive model.

In [25]:
pipeline_baseline_ml.fit(df_train, y_train)

In [50]:
pipeline_random_forest.fit(df_train, y_train)

## Serve the model

In [51]:
prediction_models = dict(naive=None, baseline_ml=None, random_forest=None)

In [52]:
prediction_models['naive'] = pipeline_naive_classifier.predict(df_test)

prediction_models['baseline_ml'] = pipeline_baseline_ml.predict(df_test)

prediction_models['random_forest'] = pipeline_random_forest.predict(df_test)


# Results

In [53]:
from sklearn.metrics import classification_report

In [54]:
for k,v in prediction_models.items():
    print('------------------------------------------------------------')
    print(k)
    print(classification_report(y_test, v))

------------------------------------------------------------
naive
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1141
           1       0.66      1.00      0.80      8058
           2       0.00      0.00      0.00      2938

    accuracy                           0.66     12137
   macro avg       0.22      0.33      0.27     12137
weighted avg       0.44      0.66      0.53     12137

------------------------------------------------------------
baseline_ml
              precision    recall  f1-score   support

           0       0.95      0.55      0.70      1141
           1       0.91      0.99      0.95      8058
           2       0.98      0.89      0.93      2938

    accuracy                           0.92     12137
   macro avg       0.95      0.81      0.86     12137
weighted avg       0.93      0.92      0.92     12137

------------------------------------------------------------
random_forest
              precision

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [55]:
xx = prediction_models['random_forest'] - prediction_models['baseline_ml']

np.where(xx != 0)

(array([   89,   106,   177,   213,   231,   333,   388,   468,   527,
          656,   704,   775,   788,   891,  1079,  1093,  1100,  1355,
         1357,  1417,  1577,  1638,  1649,  1706,  1781,  1915,  1955,
         2013,  2063,  2065,  2069,  2103,  2114,  2148,  2158,  2219,
         2244,  2250,  2383,  2387,  2428,  2450,  2669,  2703,  2797,
         2821,  2921,  3057,  3091,  3173,  3196,  3261,  3353,  3441,
         3515,  3570,  3584,  3858,  3893,  3996,  4060,  4141,  4177,
         4233,  4251,  4298,  4439,  4489,  4525,  4567,  4589,  4698,
         4775,  4782,  4808,  4814,  4848,  4938,  4957,  5123,  5140,
         5141,  5192,  5206,  5228,  5236,  5352,  5512,  5527,  5612,
         5783,  5795,  5890,  5987,  6001,  6065,  6074,  6112,  6117,
         6138,  6241,  6245,  6324,  6370,  6441,  6486,  6534,  6537,
         6551,  6595,  6609,  6610,  6763,  6815,  6832,  6937,  7142,
         7245,  7399,  7503,  7504,  7560,  7607,  7631,  7632,  7644,
      

Baseline ML and Random Forest have pretty much the same accuracy !

We are probably reaching a complexity ceiling. We won't improve much with more fancy models (think of deep learning).

Before that, we should fine tune the feature space. Maybe use a more advanced word representation ?

To go further:
* fine tune the TfIdf embeddings
* use to pre-trained word vectors e.g. GloVe