## PART 1 - Data cleaning and description

In [1]:
import itertools

import joblib
import numpy as np
import pandas as pd
import scipy.sparse
import scipy.stats
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, f1_score, hamming_loss, precision_score, recall_score
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.model_selection import iterative_train_test_split

from mlutils import *

In [2]:
# Set to true to save intermediate files
SAVE_INTERMEDIATE_FILES = False
# Random seed
RANDOM_SEED = 56

In [3]:
df = pd.read_csv(r"dataset.csv")

In [4]:
df

Unnamed: 0.1,Unnamed: 0,Abstract,Title,StoreId,disc2_x,text
0,0,"The initiation, development, and disseminati...",Development and Dissemination of Collaborative...,1001404248,['050408'],"The initiation, development, and disseminati..."
1,1,As the population of the United States has cha...,Conducting Culturally Competent Evaluations of...,1010629836,['050409'],As the population of the United States has cha...
2,2,"In the social sciences, evaluation sometimes a...","Evaluation, research and demonstration in the ...",1010629842,['050411'],"In the social sciences, evaluation sometimes a..."
3,3,The Second Earth Summit to be held in Rio de J...,Entering the anthropocene: 'Geonauts' or sorce...,1010629881,['050404'],The Second Earth Summit to be held in Rio de J...
4,4,This article investigates the matrimonial web ...,The matrimonial web of migrants: The economics...,1010629926,['050401'],This article investigates the matrimonial web ...
...,...,...,...,...,...,...
113904,58024,As punitive damages have gained greater visibi...,Reconciling Punitive Damages with Tort Law's N...,963637137,['050201'],As punitive damages have gained greater visibi...
113905,58025,The worldwide financial crisis and the need to...,Theory for an Age of Crisis: Seeing Money as a...,963637138,['050205'],The worldwide financial crisis and the need to...
113906,58026,"In this paper, I review the literature on the ...",The Econometrics of DSGE Models,963637157,['050205'],"In this paper, I review the literature on the ..."
113907,58027,Bayesian dynamic stochastic general equilibriu...,DSGE Models and Their Use at the ECB,963637159,['050205'],Bayesian dynamic stochastic general equilibriu...


### 1.1. Data merging

In [5]:
dtypes = {
    'Abstract': str,
    'Title': str,
    'year': int,
    'documentType': str,
    'StoreId': str,
    'disc1': str,
    'disc2': str,
}

socab_df = pd.read_csv('Datasets/SocAbstracts.csv', dtype=dtypes)
eric_df = pd.read_csv('Datasets/ERIC.csv', dtype=dtypes)
econlit_df = pd.read_csv('Datasets/EconLit.csv', dtype=dtypes)

### 1.2. Data cleaning and relabeling

Get clean and relabeled dataframes for each set:

In [6]:
socab_clean = clean_df(socab_df)
eric_clean = clean_df(eric_df)
econlit_clean = clean_df(econlit_df)

if SAVE_INTERMEDIATE_FILES:
    socab_clean.to_csv("SocAbstracts_master.csv", index=False)
    eric_clean.to_csv("ERIC_master.csv", index=False)
    econlit_clean.to_csv("EconLit_master.csv", index=False)

In [7]:
# which columns are stored?
socab_clean.columns

Index(['Abstract', 'Title', 'year', 'StoreId', 'disc1_x', 'disc2_x',
       'disc1_counts', 'disc2_counts'],
      dtype='object')

...Here, we skip some unneeded parts from the full notebook...

## PART 3 - Pre-processing

In [8]:
df = pd.concat([socab_clean,eric_clean,econlit_clean])
df = df.drop(columns=['year', 'disc1_x', 'disc1_counts', 'disc2_counts'])

if SAVE_INTERMEDIATE_FILES:
    # Transform list to semicolon-separated string prior to saving
    df['disc2_x'] = df.disc2_x.apply(lambda x: ';'.join(x))
    df.to_csv("dataset.csv", index=False)
    # Read file and transform back to list format
    df = pd.read_csv("dataset.csv")
    df['disc2_x'] = df.disc2_x.str.split(';')

df.to_csv("dataset.csv")

In [9]:
df.to_csv("dataset.csv")

In [10]:
df['text'] = df.Abstract.str.cat(df.Title, sep=' ')

Create dataframe with all correct labels:

In [11]:
mlb = MultiLabelBinarizer()
y_true = mlb.fit_transform(df['disc2_x'])
labels = list(mlb.classes_)
df_true = pd.DataFrame(y_true, columns=labels)

The data is split into a training and a holdout test set (75% vs. 25%).

In [12]:
# iterative_train_test_split is only deterministic if we call this first
np.random.seed(RANDOM_SEED)
# iterative_train_test_split expects a matrix, whereas CountVectorizer
# needs an iterable over strings
X_train, y_train, X_test, y_test = iterative_train_test_split(
    df.text.to_frame().values, df_true.values, test_size=0.25
)
X_train, X_test = X_train[:, 0], X_test[:, 0]

We test 3 * 2 * 2 = 12 different configurations for preprocessing: tokenization, use of IDF, and use of n-grams. Intermediate results are cached (in a directory `.cache`) to avoid recalculating the time-consuming tokenization part. The code below just sets up the pipeline for each each configuration; the actual processing happens in part 4.

In [13]:
tokenizers = [tokenize_lemmas, tokenize_nouns, tokenize_nounphrases]
use_idfs = [True]
ngrams = [(1, 1)]

memory = joblib.Memory('.cache', verbose=0)
preprocessed_models = {}

for tokenizer, ngram, use_idf in itertools.product(tokenizers, ngrams, use_idfs):
    pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenizer, ngram_range=ngram)),
        ('tfidf', TfidfTransformer(use_idf=use_idf)),
    ], memory=memory)
    preprocessed_models[(tokenizer, ngram, use_idf)] = pipe

## PART 4 - Supervised learning

### 4.1. Multinomial Naïve Bayes

We employ 3-fold cross-validation and test different parameters. Classifier chaining is used for multi-label prediction.

In [None]:
cv = KFold(n_splits=3, random_state=RANDOM_SEED, shuffle=False)
parameters = {
    'classifier': [MultinomialNB()],
    'classifier__alpha': [1, .1, .01, .001, .0001],
}

classifiers = {}

for (tokenizer, ngram, use_idf), pipe in preprocessed_models.items():
    print("*** Pre-processing using tokenizer {}, IDF {}, n-gram range {}".format(
        tokenizer.__name__, use_idf, ngram   
    ))
    X_train_preproc = pipe.fit_transform(X_train)
    
    print("*** Predicting for tokenizer {}, IDF {}, n-gram range {}".format(
        tokenizer.__name__, use_idf, ngram
    ))
    clf = RandomizedSearchCV(
        ClassifierChain(require_dense=[False, True]),
        parameters,
        scoring='accuracy',
        n_iter=5,
        n_jobs=-1,
        verbose=2,
        return_train_score=True,
        pre_dispatch=2,
        cv=cv,
        random_state=RANDOM_SEED,
    )
    clf.fit(X_train_preproc, y_train)
    
    # Persist classifier to disk
    location = f".cache/MNB-{tokenizer.__name__}-{use_idf}-{ngram}.joblib"
    joblib.dump(clf, location)
    classifiers[(tokenizer, ngram, use_idf)] = (location, clf.best_score_)

Determine the best preprocessing configuration, hyperparameters and score:

In [None]:
best_input, best_clf_loc, best_score = max(((preproc, loc, score) for preproc, (loc, score) in classifiers.items()), key=lambda x: x[2])
best_clf = joblib.load(best_clf_loc)
best_input, best_clf.best_params_, best_clf.best_score_

Make prediction for test set:

In [None]:
pipe = preprocessed_models[best_input]
pipe.set_params(vect__vocabulary=pipe.named_steps['vect'].vocabulary_)
X_test_preproc = pipe.transform(X_test)
y_pred = best_clf.predict(X_test_preproc)

And calculate various evaluation metrics:

In [None]:
print("Accuracy =", accuracy_score(y_test, y_pred))
print("Hamming loss =", hamming_loss(y_test, y_pred))
print("Precision =", precision_score(y_test, y_pred, average='weighted'))
print("Recall =", recall_score(y_test, y_pred, average='weighted'))
print("F1 =", f1_score(y_test, y_pred, average='weighted'))

### 4.2. Gradient Boosting

This follows the same steps as Multinomial NB. Again, we employ 3-fold cross-validation and test different parameters. Classifier chaining is used for multi-label prediction.

In [15]:
cv = KFold(n_splits=3, random_state=RANDOM_SEED, shuffle=False)

parameters = {
        'classifier': [
            LGBMClassifier(objective='binary', 
                           boosting_type='gbdt', 
                           #early_stopping_rounds=1000,
                           verbose=4,
                           boost_from_average=False,
                           n_jobs=5
                        )
        ],
        'classifier__num_trees': [100, 200, 400, 600],
        'classifier__num_leaves': [10, 50, 80, 110, 140, 170, 200],
        'classifier__max_depth' : [5, 10, 20, 25],
        'classifier__learning_rate': [0.3, 0.2, 0.1, 0.01, 0.001, 0.0001],
        'classifier__max_bin': [50, 75, 100, 150, 200, 255],
        'classifier__min_data_in_leaf': [5, 10, 20, 40, 60],
        'classifier__bagging_fraction': [0.1, 0.2, 0.4, 0.6],
        'classifier__bagging_freq': [10, 25, 50],
        'classifier__min_child_weight':[1e-3, 1e-2, 1e-1],
        'classifier__reg_alpha': [0, 1e-3, 1e-2, 1e-1, 1],
        'classifier__reg_lambda': [1, 2, 5, 10],
}

classifiers = {}

In [None]:
for (tokenizer, ngram, use_idf), pipe in preprocessed_models.items():
    print("*** Preprocessing for tokenizer {}, IDF {}, n-gram range {}".format(
        tokenizer.__name__, use_idf, ngram
    ))
    X_train_preproc = pipe.fit_transform(X_train)
    
    print("*** Predicting for tokenizer {}, IDF {}, n-gram range {}".format(
        tokenizer.__name__, use_idf, ngram
    ))
    clf = RandomizedSearchCV(
        ClassifierChain(require_dense=[False, True]),
        parameters,
        scoring='accuracy',
        n_iter=100,
        n_jobs=6,
        verbose=2,
        return_train_score=True,
        cv=cv,
        random_state=RANDOM_SEED,
    )
    clf.fit(X_train_preproc.astype("float32"), y_train.astype("float32"))

    # Persist classifier to disk
    location = f".cache/LightGBM-{tokenizer.__name__}-{use_idf}-{ngram}.joblib"
    joblib.dump(clf, location)
    classifiers[(tokenizer, ngram, use_idf)] = (location, clf.best_score_)

*** Preprocessing for tokenizer tokenize_lemmas, IDF True, n-gram range (1, 1)


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


*** Predicting for tokenizer tokenize_lemmas, IDF True, n-gram range (1, 1)
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


Determine the best preprocessing configuration, hyperparameters and score:

In [None]:
best_input, best_clf_loc, best_score = max(((preproc, loc, score) for preproc, (loc, score) in classifiers.items()), key=lambda x: x[2])
best_clf = joblib.load(best_clf_loc)
best_input, best_clf.best_params_, best_clf.best_score_

Make prediction for test set:

In [None]:
pipe = preprocessed_models[best_input]
pipe.set_params(vect__vocabulary=pipe.named_steps['vect'].vocabulary_)
X_test_preproc = pipe.transform(X_test)
y_pred = best_clf.predict(X_test_preproc.astype("float32"))

And calculate various evaluation metrics:

In [None]:
y_test = y_test.astype("float32")

print("Accuracy =", accuracy_score(y_test, y_pred))
print("Hamming loss =", hamming_loss(y_test, y_pred))
print("Precision =", precision_score(y_test, y_pred, average='weighted'))
print("Recall =", recall_score(y_test, y_pred, average='weighted'))
print("F1 =", f1_score(y_test, y_pred, average='weighted'))