# C 2 - Learning-Pipeline

In [1]:
import pandas as pd
import numpy as np
# Preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Learning
# Classifiers
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
# Processes and metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import classification_report

## Import datasets

In [2]:
dataset = pd.read_csv('../data/B_engineering/sc_2000_2017.csv', index_col='record_id')

In [12]:
dataset.head(2)

Unnamed: 0_level_0,Unnamed: 0,year,geo,topics,symbol,title,url_English,text,main_body,topics_geo,geo_count,topics_count,topics_geo_count
record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
455823,0,2001,,NON-ALIGNED COUNTRIES||INTERNATIONAL SECURITY|...,A/56/682||S/2001/1159,Letter dated 2001/12/06 from the Permanent Rep...,http://digitallibrary.un.org/record/455823/fil...,A/56/682–S/2001/1159 United Nations General As...,"A,S",NON-ALIGNED COUNTRIES||INTERNATIONAL SECURITY|...,0,6,6
420454,5,2000,,DEVELOPMENT||INTERNATIONAL TRADE||GENETIC ENGI...,A/55/257||S/2000/766,Letter dated 2000/08/01 from the Chargé d'affa...,http://digitallibrary.un.org/record/420454/fil...,A/55/257–S/2000/766 United Nations General Ass...,"A,S",DEVELOPMENT||INTERNATIONAL TRADE||GENETIC ENGI...,0,11,11


In [4]:
labels = pd.read_csv('../data/C_learning/sc_2000_2017_binary_labels.csv', index_col='record_id')
labels = labels[labels.sum().sort_values(ascending=False).head(10).index.tolist()]
labels = labels[labels.sum(axis=1) > 0]
reduced_id = labels.index.tolist()
labels_name = labels.columns.tolist()

In [5]:
features = dataset['text']
features = features[features.index.isin(reduced_id)]

In [6]:
len(features) == len(labels)

True

## Training and testing sets

In [7]:
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, shuffle=True)

In [8]:
x_train = x_train.tolist()
x_test = x_test.tolist()

In [9]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight='balanced')))
])
parameters = {
    'tfidf__max_df': (0.25,0.50,0.75),
    "clf__estimator__C": [0.01, 0.1, 1]
}

In [10]:
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=2, n_jobs=2, verbose=3)
grid_search_tune.fit(x_train, y_train)

Fitting 2 folds for each of 9 candidates, totalling 18 fits
[CV] clf__estimator__C=0.01, tfidf__max_df=0.25 ......................
[CV] clf__estimator__C=0.01, tfidf__max_df=0.25 ......................
[CV]  clf__estimator__C=0.01, tfidf__max_df=0.25, score=0.02324431256181998, total= 1.4min
[CV] clf__estimator__C=0.01, tfidf__max_df=0.5 .......................
[CV]  clf__estimator__C=0.01, tfidf__max_df=0.25, score=0.02225886232481451, total= 1.3min
[CV] clf__estimator__C=0.01, tfidf__max_df=0.5 .......................
[CV]  clf__estimator__C=0.01, tfidf__max_df=0.5, score=0.023409165842400263, total=  58.9s
[CV] clf__estimator__C=0.01, tfidf__max_df=0.75 ......................
[CV]  clf__estimator__C=0.01, tfidf__max_df=0.5, score=0.01978565539983512, total=  59.2s
[CV] clf__estimator__C=0.01, tfidf__max_df=0.75 ......................
[CV]  clf__estimator__C=0.01, tfidf__max_df=0.75, score=0.025222551928783383, total=  55.9s
[CV]  clf__estimator__C=0.01, tfidf__max_df=0.75, score=0.0

[Parallel(n_jobs=2)]: Done  18 out of  18 | elapsed: 15.8min finished


GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))]),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'tfidf__max_df': (0.25, 0.5, 0.75), 'clf__estimator__C': [0.01, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [11]:
print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)

# measuring performance on test set
print("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(x_test)
print(classification_report(y_test, predictions, target_names=labels_name))

Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.25, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LinearSVC(C=1, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))]
Applying best classifier on test data:
                         precision    recall  f1-score   support

PEACEKEEPING OPERATIONS       0.32      0.33      0.33      1580
      INTERNAL SECURITY       0.36      0.36      0.36      1540
  MID