In [None]:
import sys
import re
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm

from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb

# Evaluate classification models

This notebook is used to load data for the classification task and evaluate different classification models.

Before loading the content, run the notebook `Data-Preparation.ipynb` to scrape the articles whose text is not shared with the data.

In [None]:
# Load the scraped article content (see the note above about running
# Data-Preparation.ipynb before this cell).
content = pd.read_csv('data/assets_scraped.csv')

### Load and prepare data for the Type classification task

In [None]:
# Rows for the Type classification task; the article content is joined
# onto them in the next cell.
type_df_nocontent = pd.read_csv('data/type_classification.csv')

In [None]:
# Attach the scraped content to the labelled rows: an inner join on the
# columns the two frames have in common.
type_df = type_df_nocontent.merge(content, how='inner')

In [None]:
lemmatizer = WordNetLemmatizer()

# Build the stop-word lookup once: stopwords.words() returns a new list on
# every call, so calling it per token makes cleaning needlessly slow.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def text_basic_clean(text):
    """Normalise raw article text for TF-IDF vectorisation.

    Steps: turn paragraph breaks (and bullet markers) into spaces, strip
    punctuation, drop English stop words and tokens that are not purely
    alphabetic, then lower-case and lemmatise what is left.

    Parameters
    ----------
    text : str
        Raw article content.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces.
    """
    # Replace paragraph breaks with a space (not an empty string) so the
    # last word of one paragraph is not fused with the first of the next.
    text = text.replace('\n\n•', ' ').replace('\n\n', ' ')
    text = re.sub(r'[^\w\s]', '', text)

    cleaned_tokens = []
    for word in text.split():
        lowered = word.lower()
        # The stop-word list is lower-case, so compare the lower-cased token;
        # otherwise capitalised stop words ("The", "And") are kept.
        if lowered in _ENGLISH_STOPWORDS or not word.isalpha():
            continue
        cleaned_tokens.append(lemmatizer.lemmatize(lowered))
    return ' '.join(cleaned_tokens)


type_df['content_clean'] = type_df['content'].apply(text_basic_clean)

In [None]:
class Tester():
    """Repeated hold-out evaluation of one classifier on TF-IDF text features.

    NOTE: ``run_val_test`` reads the notebook-level variables ``X`` (the
    documents) and ``y`` (the labels). They must be assigned before any
    run, and re-assigning them changes what later runs evaluate.
    """

    def __init__(self, model, param_grid=None):
        """Store the estimator, its search grid and the TF-IDF vectoriser.

        Parameters
        ----------
        model : estimator
            Classifier handed to ``GridSearchCV``; it must support
            ``predict_proba``.
        param_grid : dict or list of dict, optional
            Grid passed to ``GridSearchCV``. ``None`` (the default) means an
            empty grid, i.e. the model is evaluated with its own settings.
        """
        self.model = model
        # Use None as the default instead of a shared mutable ``{}`` so each
        # instance gets its own dict.
        self.param_grid = {} if param_grid is None else param_grid
        self.tfidf = TfidfVectorizer(sublinear_tf=True,
                                     analyzer='word', ngram_range=(1, 4),
                                     min_df=5, stop_words='english', norm='l2')

    def run_val_test(self, seed):
        """Run one split / tune / test cycle.

        Parameters
        ----------
        seed : int
            Random state for both the outer train/test split and the inner
            cross-validation splitter.

        Returns
        -------
        tuple
            ``(best cross-validated ROC AUC, ROC AUC on the held-out 20%)``.
        """
        # X and y are notebook globals (see the class docstring).
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=seed, stratify=y)

        # The vectoriser is fit on the whole training split, so the inner
        # validation folds share vocabulary / IDF statistics with their
        # training folds; only the outer test split is fully unseen.
        tfidf_matrix = self.tfidf.fit_transform(X_train).toarray()
        x_test = self.tfidf.transform(X_test).toarray()

        cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=seed)
        gs = GridSearchCV(self.model, param_grid=self.param_grid,
                          cv=cv, scoring='roc_auc')
        gs.fit(tfidf_matrix, y_train)
        predicted_prob = gs.predict_proba(x_test)

        # Column 1 holds the probability of the positive class (label 1).
        return gs.best_score_, metrics.roc_auc_score(y_test, predicted_prob[:, 1])

    def run_many(self, n_runs=50):
        """Repeat ``run_val_test`` for seeds ``0 .. n_runs - 1``.

        Returns
        -------
        tuple of list
            ``(validation scores, test scores)``, one entry per seed.
        """
        val_scores = []
        test_scores = []
        for seed in tqdm(range(n_runs), file=sys.stdout):
            val_score, test_score = self.run_val_test(seed)
            val_scores.append(val_score)
            test_scores.append(test_score)
        return val_scores, test_scores

In [None]:
def xgb_model():
    """Return an unfitted XGBoost classifier with the notebook's base settings."""
    base_params = dict(
        learning_rate=0.3,
        colsample_bytree=0.8,
        scale_pos_weight=3,
        n_jobs=-1,
        n_estimators=300,
        max_depth=8,
        subsample=0.8,
        verbosity=0,
    )
    classifier = xgb.XGBClassifier(use_label_encoder=False)
    classifier.set_params(**base_params)
    return classifier

def get_rf():
    """Return an unfitted random forest with the notebook's base settings."""
    return RandomForestClassifier(
        n_estimators=200,
        criterion='gini',
        min_samples_leaf=5,
        class_weight='balanced_subsample',
        random_state=42,
    )

# Class-weighted linear and kernel baselines.
lr_model = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    C=0.1,
    class_weight='balanced',
)
svm_model = SVC(
    kernel='rbf',
    class_weight='balanced',
    probability=True,  # Tester scores models through predict_proba
)

# One (display name, estimator, hyper-parameter grid) triple per model.
models = [
    (
        'Multinomial Naive Bayes',
        MultinomialNB(),
        {'alpha': [0.5, 1, 1.5]},
    ),
    (
        'XGBoost',
        xgb_model(),
        {'max_depth': [3,4,5], 'n_estimators': [10, 20]},
    ),
    (
        'Random Forest',
        get_rf(),
        {'min_samples_leaf': [3,4,5,6]},
    ),
    (
        'Logistic Regression',
        lr_model,
        {'C': [0.1, 1, 2]},
    ),
    (
        'Support Vector Machine',
        svm_model,
        {'C': [0.1, 1, 2]},
    ),
]

### Scenario 1: [Both & News] v.s. Summary 


In [None]:
# Documents and binary target: SUMMARY -> 0, every other label -> 1.
X = type_df['content_clean'].tolist()
y = type_df['value'].ne('SUMMARY').astype('int64').to_numpy()

In [None]:
# Evaluate every candidate model; each entry is (validation scores, test scores).
results_scenario1 = {}
for name, model, param_grid in models:
    print(f'Testing model {name}')
    tester = Tester(model, param_grid)
    results_scenario1[name] = tester.run_many()
    # Mean validation and mean test ROC AUC for this model.
    print(list(map(np.mean, results_scenario1[name])))

In [None]:
# One row per model with its mean validation and mean test score.
pd.DataFrame.from_dict(
    {
        model_name: [np.mean(scores[0]), np.mean(scores[1])]
        for model_name, scores in results_scenario1.items()
    },
    orient='index',
    columns=['Validation', 'Test'],
)

### Scenario 2: [Both & Summary] v.s. News

In [None]:
# Documents and binary target: NEWS -> 0, every other label -> 1.
X = type_df['content_clean'].tolist()
y = type_df['value'].ne('NEWS').astype('int64').to_numpy()

In [None]:
# Evaluate every candidate model; each entry is (validation scores, test scores).
results_scenario2 = {}
for name, model, param_grid in models:
    print(f'Testing model {name}')
    tester = Tester(model, param_grid)
    results_scenario2[name] = tester.run_many()
    # Mean validation and mean test ROC AUC for this model.
    print(list(map(np.mean, results_scenario2[name])))

In [None]:
# One row per model with its mean validation and mean test score.
pd.DataFrame.from_dict(
    {
        model_name: [np.mean(scores[0]), np.mean(scores[1])]
        for model_name, scores in results_scenario2.items()
    },
    orient='index',
    columns=['Validation', 'Test'],
)

### Scenario 3: News v.s. Summary 

In [None]:
# Keep only the rows labelled NEWS or SUMMARY (drop BOTH), as an independent copy.
type_df2 = type_df.loc[type_df['value'].ne('BOTH')].copy()

In [None]:
# Documents and binary target: SUMMARY -> 0, every other label -> 1.
X = type_df2['content_clean'].tolist()
y = type_df2['value'].ne('SUMMARY').astype('int64').to_numpy()

In [None]:
# Evaluate every candidate model; each entry is (validation scores, test scores).
results_scenario3 = {}
for name, model, param_grid in models:
    print('Testing model', name)
    # Pass the grid so hyper-parameters are tuned as in scenarios 1 and 2;
    # previously param_grid was unpacked but never used, so each model ran
    # with its base settings only.
    tester = Tester(model, param_grid)
    results_scenario3[name] = tester.run_many()
    print([np.mean(r) for r in results_scenario3[name]])

In [None]:
# One row per model with its mean validation and mean test score.
pd.DataFrame.from_dict(
    {
        model_name: [np.mean(scores[0]), np.mean(scores[1])]
        for model_name, scores in results_scenario3.items()
    },
    orient='index',
    columns=['validation', 'test'],
)

# Relevance Classification

In [None]:
# Rows for the relevance classification task; the article content is joined
# onto them in the next cell.
relevance_df_nocontent = pd.read_csv('data/relevance_classification.csv')

In [None]:
# Attach the scraped content to the labelled rows: an inner join on the
# columns the two frames have in common.
relevance_df = relevance_df_nocontent.merge(content, how='inner')

In [None]:
from collections import Counter

# Number of rows per relevance label.
Counter(relevance_df['relevance'])

In [None]:
# Preview the first five merged rows.
relevance_df.head()

In [None]:
# re, WordNetLemmatizer and stopwords are already imported in the first cell.
lemmatizer = WordNetLemmatizer()

# Build the stop-word lookup once: stopwords.words() returns a new list on
# every call, so calling it per token makes cleaning needlessly slow.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def text_basic_clean(text):
    """Normalise raw article text for TF-IDF vectorisation.

    Unlike the earlier definition used for the Type task, this one also
    removes parenthesised passages. Steps: turn paragraph breaks (and bullet
    markers) into spaces, drop ``(...)`` spans, strip punctuation, drop
    English stop words and tokens that are not purely alphabetic, then
    lower-case and lemmatise what is left.

    Parameters
    ----------
    text : str
        Raw article content.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces.
    """
    # Replace paragraph breaks with a space (not an empty string) so the
    # last word of one paragraph is not fused with the first of the next.
    text = text.replace('\n\n•', ' ').replace('\n\n', ' ')
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    cleaned_tokens = []
    for word in text.split():
        lowered = word.lower()
        # The stop-word list is lower-case, so compare the lower-cased token;
        # otherwise capitalised stop words ("The", "And") are kept.
        if lowered in _ENGLISH_STOPWORDS or not word.isalpha():
            continue
        cleaned_tokens.append(lemmatizer.lemmatize(lowered))
    return ' '.join(cleaned_tokens)


relevance_df['content_clean'] = relevance_df['content'].apply(text_basic_clean)

In [None]:
# Documents and binary target: NOT_RELEVANT -> 0, every other label -> 1.
X = relevance_df['content_clean'].tolist()
y = relevance_df['relevance'].ne('NOT_RELEVANT').astype('int64').to_numpy()

In [None]:
# Evaluate every candidate model; each entry is (validation scores, test scores).
results_relevance = {}
for name, model, param_grid in models:
    print('Testing model', name)
    # Pass the grid so hyper-parameters are tuned as in the Type scenarios 1
    # and 2; previously param_grid was unpacked but never used, so each model
    # ran with its base settings only.
    tester = Tester(model, param_grid)
    results_relevance[name] = tester.run_many()
    print([np.mean(r) for r in results_relevance[name]])

In [None]:
# One row per model with its mean validation and mean test score.
pd.DataFrame.from_dict(
    {
        model_name: [np.mean(scores[0]), np.mean(scores[1])]
        for model_name, scores in results_relevance.items()
    },
    orient='index',
    columns=['validation', 'test'],
)