In [1]:
import sys
import re
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm

from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb

In [2]:
import nltk
nltk.download('stopwords');
nltk.download('wordnet');

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/panisson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/panisson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Evaluate classification models

This notebook is used to load data for the classification task and evaluate different classification models.

Before loading the content, you must run the notebook Data-Preparation.ipynb to scrape articles not shared with the data.

In [3]:
content = pd.read_csv('data/assets_scraped.csv')

### Load and prepare data for the Type classification task

In [4]:
type_df_nocontent = pd.read_csv('data/type_classification.csv')

In [5]:
type_df = pd.merge(type_df_nocontent, content)

In [6]:
lemmatizer = WordNetLemmatizer() 

def text_basic_clean(text):
    text = text.replace('\n\n•', '').replace('\n\n', '')
    text = re.sub(r'[^\w\s]', '', text) 
    text = text.replace('  ', ' ')
    text = ' '.join([lemmatizer.lemmatize(word.lower())
                     for word in text.split()
                     if word not in stopwords.words('english') and word.isalpha()])
    return text

type_df['content_clean'] = type_df['content'].apply(lambda x: text_basic_clean(x))    

In [7]:
class Tester():
    
    def __init__(self, model, param_grid={}):
        self.model = model
        self.param_grid = param_grid
        self.tfidf = TfidfVectorizer(sublinear_tf=True,
                            analyzer='word', ngram_range=(1, 4), 
                            min_df = 5, stop_words='english',norm='l2')
    
    def run_val_test(self, seed):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=seed, stratify = y)

        tfidf_matrix = self.tfidf.fit_transform(X_train).toarray()
        x_test = self.tfidf.transform(X_test).toarray()
        cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=seed)
        gs = GridSearchCV(self.model, param_grid=self.param_grid,
                          cv=cv, scoring='roc_auc', n_jobs=-1)
        gs.fit(tfidf_matrix, y_train)
        if hasattr(gs, "predict_proba"):
            predicted_prob = gs.predict_proba(x_test)[:,1]
        else:
            predicted_prob = gs.decision_function(x_test)
        
        
        return gs.best_score_, metrics.roc_auc_score(y_test, predicted_prob)
    
    def run_many(self, n_runs=50):
        val_scores = []
        test_scores = []
        for seed in tqdm(range(n_runs), file=sys.stdout):
            val_score, test_score = self.run_val_test(seed)
            val_scores.append(val_score)
            test_scores.append(test_score)
        return val_scores, test_scores

In [8]:
def xgb_model():
    xgb_clf = xgb.XGBClassifier(use_label_encoder=False)
    args = {'learning_rate': 0.3, 'colsample_bytree': 0.8, 'scale_pos_weight': 3, 
            'n_jobs': 10, 'n_estimators': 300, 'max_depth': 8, 'subsample': 0.8, 'verbosity': 0}
    return xgb_clf.set_params(**args)

def get_rf():
    rf = RandomForestClassifier(min_samples_leaf = 5, 
                                n_estimators = 200,
                                class_weight = 'balanced_subsample',
                                criterion = 'gini',
                                random_state = 42)
    return rf

lr_model = LogisticRegression(penalty='l2', C=0.1, class_weight='balanced', solver='liblinear')
svm_model = SVC(kernel='rbf', class_weight='balanced', gamma='auto')

models = [
    ('Multinomial Naive Bayes', MultinomialNB(), {'alpha': [0.5, 1, 1.5]}),
    ('XGBoost', xgb_model(), {'max_depth': [3,4,5], 'n_estimators': [10, 20]}),
    ('Random Forest', get_rf(), {'min_samples_leaf': [3,4,5,6]}),
    ('Logistic Regression', lr_model, {'C': [0.1, 1, 2]}),
    ('Support Vector Machine', svm_model, {'C': [0.1, 1, 2]})
]

### Scenario 1: [Both & News] v.s. Summary 


In [9]:
X = list(type_df.content_clean)
y = np.array(type_df.value.map(lambda x: 0 if x=='SUMMARY' else 1))

In [10]:
results_scenario1 = {}
for name, model, param_grid in models:
    print('Testing model', name)
    tester = Tester(model, param_grid)
    results_scenario1[name] = tester.run_many()
    print([np.mean(r) for r in results_scenario1[name]])

Testing model Multinomial Naive Bayes
100%|██████████| 50/50 [00:16<00:00,  3.09it/s]
[0.6515000000000001, 0.6214285714285714]
Testing model XGBoost
100%|██████████| 50/50 [01:07<00:00,  1.35s/it]
[0.6309791666666665, 0.6054285714285714]
Testing model Random Forest
100%|██████████| 50/50 [00:52<00:00,  1.06s/it]
[0.6478333333333333, 0.6165714285714285]
Testing model Logistic Regression
100%|██████████| 50/50 [00:13<00:00,  3.78it/s]
[0.6411666666666666, 0.6182857142857143]
Testing model Support Vector Machine
100%|██████████| 50/50 [00:13<00:00,  3.61it/s]
[0.6434583333333334, 0.6231428571428572]


In [11]:
pd.DataFrame({k: [np.mean(v[0]), np.mean(v[1])]
              for (k,v) in results_scenario1.items()}, index=['Validation', 'Test']).T

Unnamed: 0,Validation,Test
Multinomial Naive Bayes,0.6515,0.621429
XGBoost,0.630979,0.605429
Random Forest,0.647833,0.616571
Logistic Regression,0.641167,0.618286
Support Vector Machine,0.643458,0.623143


### Scenario 2: [Both & Summary] v.s. News

In [12]:
X = list(type_df.content_clean)
y = np.array(type_df.value.map(lambda x: 0 if x=='NEWS' else 1))

In [13]:
results_scenario2 = {}
for name, model, param_grid in models:
    print('Testing model', name)
    tester = Tester(model, param_grid)
    results_scenario2[name] = tester.run_many()
    print([np.mean(r) for r in results_scenario2[name]])

Testing model Multinomial Naive Bayes
100%|██████████| 50/50 [00:12<00:00,  3.93it/s]
[0.6658909090909091, 0.6414285714285713]
Testing model XGBoost
100%|██████████| 50/50 [01:05<00:00,  1.31s/it]
[0.6629272727272728, 0.6610714285714286]
Testing model Random Forest
100%|██████████| 50/50 [00:53<00:00,  1.06s/it]
[0.7040363636363637, 0.6919047619047618]
Testing model Logistic Regression
100%|██████████| 50/50 [00:12<00:00,  3.86it/s]
[0.6609090909090909, 0.6340476190476191]
Testing model Support Vector Machine
100%|██████████| 50/50 [00:13<00:00,  3.62it/s]
[0.6628000000000002, 0.6369047619047619]


In [14]:
pd.DataFrame({k: [np.mean(v[0]), np.mean(v[1])]
              for (k,v) in results_scenario2.items()}, index=['Validation', 'Test']).T

Unnamed: 0,Validation,Test
Multinomial Naive Bayes,0.665891,0.641429
XGBoost,0.662927,0.661071
Random Forest,0.704036,0.691905
Logistic Regression,0.660909,0.634048
Support Vector Machine,0.6628,0.636905


### Senario 3: News v.s. Summary 

In [15]:
type_df2 = type_df.copy()
type_df2 = type_df2[type_df2.value != 'BOTH']

In [16]:
X = list(type_df2.content_clean)
y = np.array(type_df2.value.map(lambda x: 0 if x=='SUMMARY' else 1))

In [17]:
results_scenario3 = {}
for name, model, param_grid in models:
    print('Testing model', name)
    tester = Tester(model)
    results_scenario3[name] = tester.run_many()
    print([np.mean(r) for r in results_scenario3[name]])

Testing model Multinomial Naive Bayes
100%|██████████| 50/50 [00:09<00:00,  5.24it/s]
[0.65455, 0.6384615384615384]
Testing model XGBoost
100%|██████████| 50/50 [00:47<00:00,  1.05it/s]
[0.6019, 0.612]
Testing model Random Forest
100%|██████████| 50/50 [00:43<00:00,  1.14it/s]
[0.6459, 0.6873846153846155]
Testing model Logistic Regression
100%|██████████| 50/50 [00:09<00:00,  5.14it/s]
[0.63885, 0.6418461538461538]
Testing model Support Vector Machine
100%|██████████| 50/50 [00:10<00:00,  4.75it/s]
[0.6416499999999999, 0.6467692307692309]


In [18]:
pd.DataFrame({k: [np.mean(v[0]), np.mean(v[1])]
              for (k,v) in results_scenario3.items()}, index=['validation', 'test']).T

Unnamed: 0,validation,test
Multinomial Naive Bayes,0.65455,0.638462
XGBoost,0.6019,0.612
Random Forest,0.6459,0.687385
Logistic Regression,0.63885,0.641846
Support Vector Machine,0.64165,0.646769


# Relevance Classification

In [19]:
relevance_df_nocontent = pd.read_csv('data/relevance_classification.csv')

In [20]:
relevance_df = pd.merge(relevance_df_nocontent, content)

In [21]:
from collections import Counter
Counter(relevance_df.relevance)

Counter({'RELEVANT': 91, 'NOT_RELEVANT': 102})

In [22]:
relevance_df.head(5)

Unnamed: 0,id,relevance,document_identifier,content,created
0,EN2652359,RELEVANT,https://reliefweb.int/report/democratic-republ...,SITUATION\n\n• More than 13 million people in ...,2019-06-24
1,EN2422390,NOT_RELEVANT,https://www.irishexaminer.com/breakingnews/ire...,Latest: Detectives investigating a car bomb ex...,2019-01-20
2,EN2718256,RELEVANT,https://reliefweb.int/report/iraq/iraq-iom-eng...,"Baghdad – Across Iraq, the instability and ins...",2019-08-02
3,EN2460270,RELEVANT,https://themedialine.org/student-journalists/l...,Experts concerned that some 1.5 million refuge...,2019-02-23
4,EN2845164,RELEVANT,https://www.theledger.com/news/20191013/syrias...,"AKCAKALE, Turkey — Syria's Kurds said Syrian g...",2019-10-14


In [23]:
from nltk.stem import WordNetLemmatizer 
import re 
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer() 

def text_basic_clean(text):
    text = text.replace('\n\n•', '').replace('\n\n', '')
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'[^\w\s]', '', text) 
    text = text.replace('  ', ' ')
    text = ' '.join([lemmatizer.lemmatize(word.lower()) 
                     for word in text.split() 
                     if word not in stopwords.words('english') and word.isalpha()])
    return text

relevance_df['content_clean'] = relevance_df['content'].apply(lambda x: text_basic_clean(x))  

In [24]:
X = list(relevance_df.content_clean)
y = np.array(relevance_df.relevance.map(lambda x: 0 if x=='NOT_RELEVANT' else 1))

In [25]:
results_relevance = {}
for name, model, param_grid in models:
    print('Testing model', name)
    tester = Tester(model)
    results_relevance[name] = tester.run_many()
    print([np.mean(r) for r in results_relevance[name]])

Testing model Multinomial Naive Bayes
100%|██████████| 50/50 [00:22<00:00,  2.25it/s]
[0.8272583333333334, 0.8225925925925925]
Testing model XGBoost
100%|██████████| 50/50 [01:48<00:00,  2.17s/it]
[0.7562416666666667, 0.7741798941798943]
Testing model Random Forest
100%|██████████| 50/50 [01:04<00:00,  1.29s/it]
[0.8129333333333335, 0.8252380952380952]
Testing model Logistic Regression
100%|██████████| 50/50 [00:26<00:00,  1.86it/s]
[0.8297374999999999, 0.8291005291005292]
Testing model Support Vector Machine
100%|██████████| 50/50 [00:36<00:00,  1.38it/s]
[0.828475, 0.8267724867724869]


In [26]:
pd.DataFrame({k: [np.mean(v[0]), np.mean(v[1])]
              for (k,v) in results_relevance.items()}, index=['validation', 'test']).T

Unnamed: 0,validation,test
Multinomial Naive Bayes,0.827258,0.822593
XGBoost,0.756242,0.77418
Random Forest,0.812933,0.825238
Logistic Regression,0.829737,0.829101
Support Vector Machine,0.828475,0.826772
