In [1]:
## Author: Mirjam Nanko
## Date Created: 2022-10-13
## Email: m.nanko@exeter.ac.uk

# climatenewsbabyCARDS classifier hyperparameter tuning<br>
#### This script loads the climate related **training** data sourced from various **blogs and newspapers** and uses the sklearn GridSearchCV function with 5-fold cross validation to tune the regularization strength hyperparameter C

# Packages

In [2]:
from preprocess_climatenews import denoise_text
from preprocess_climatenews import simple_tokenise
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

# Load and preprocess the training, validation and testing data

In [3]:
%%time
# Load the data
train = pd.read_csv('../data/climatenewsbabyCARDStrain.csv', lineterminator='\n')
# Preprocess the text
train['tokens'] = train['text'].astype(str).apply(denoise_text).apply(simple_tokenise)
data = train[['tokens','label']].values.tolist()

CPU times: user 43.7 s, sys: 1.04 s, total: 44.7 s
Wall time: 44.8 s


# Train the logistic classifier

In [4]:
%time
# Find ideal C hyperparameter value
# Encode labels
claims = [row[1] for row in data]
le = preprocessing.LabelEncoder()
y = le.fit_transform(claims)

# Vectorize
text = [row[0] for row in data]

# Setup the Tf-Idf vectorizer
vectorizer = TfidfVectorizer(stop_words=None,
                             min_df=2,
                             strip_accents='unicode',
                             ngram_range=(1, 2), 
                             lowercase=True, 
                             sublinear_tf=1)
X = vectorizer.fit_transform(text)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 6.68 µs


In [5]:
# Set up parameters and logistic regression
parameters = {'C':[1, 5, 10, 20, 50, 100, 500, 1000]}
# L1 Lasso regularization
clf_logit_L1 = LogisticRegression(max_iter=500, penalty='l1', class_weight='balanced', solver='liblinear')
cv_L1 = GridSearchCV(clf_logit_L1, parameters)

# L2 Ridge regularization
clf_logit_L2 = LogisticRegression(max_iter=500, penalty='l2', class_weight='balanced', solver='liblinear')
cv_L2 = GridSearchCV(clf_logit_L2, parameters)

In [6]:
%time
# Perform L1 grid search
cv_L1.fit(X, y)
cv_L1.best_params_

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 14.3 µs


{'C': 10}

In [7]:
%time
# Perform L2 grid search
cv_L2.fit(X, y)
cv_L2.best_params_

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 11.4 µs


{'C': 1000}

In [8]:
cv_L1.cv_results_

{'mean_fit_time': array([32.07599435, 42.26853051, 51.55308914, 53.43324013, 48.7587513 ,
        46.13012691, 31.67454405, 28.68828239]),
 'std_fit_time': array([1.15483735, 0.11186014, 1.46844   , 1.63208138, 3.04612992,
        2.10732535, 1.68323238, 1.12412047]),
 'mean_score_time': array([0.09328675, 0.09094739, 0.09194455, 0.0895792 , 0.08852959,
        0.08616414, 0.08944435, 0.08819113]),
 'std_score_time': array([0.01033679, 0.01055909, 0.01055056, 0.00586643, 0.00567823,
        0.00568095, 0.0053267 , 0.00795436]),
 'param_C': masked_array(data=[1, 5, 10, 20, 50, 100, 500, 1000],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1},
  {'C': 5},
  {'C': 10},
  {'C': 20},
  {'C': 50},
  {'C': 100},
  {'C': 500},
  {'C': 1000}],
 'split0_test_score': array([0.91088253, 0.92972639, 0.93137215, 0.92993211, 0.93009669,
        0.92931496, 0.9300144 , 0.9323596 ]),
 'split1_test_score

In [9]:
cv_L2.cv_results_

{'mean_fit_time': array([ 48.26572132,  74.29419689,  93.77355814,  96.87537289,
        110.69769611, 121.36011276, 151.42735777, 168.83207674]),
 'std_fit_time': array([ 1.25063904,  2.77722699,  2.59367675,  2.42946292,  7.05691614,
         2.45482127,  7.88217407, 12.5771309 ]),
 'mean_score_time': array([0.08580785, 0.08629742, 0.08611054, 0.08632259, 0.08660536,
        0.08778434, 0.08808236, 0.08799281]),
 'std_score_time': array([0.00482981, 0.00613225, 0.00412168, 0.00482022, 0.0039253 ,
        0.0033321 , 0.00516718, 0.00536627]),
 'param_C': masked_array(data=[1, 5, 10, 20, 50, 100, 500, 1000],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1},
  {'C': 5},
  {'C': 10},
  {'C': 20},
  {'C': 50},
  {'C': 100},
  {'C': 500},
  {'C': 1000}],
 'split0_test_score': array([0.91623123, 0.93297675, 0.93602139, 0.937626  , 0.93832545,
        0.93886032, 0.93898375, 0.93923061]),
 's