In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle 
from tqdm.auto import tqdm  
import warnings

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, make_scorer
from skmultilearn.model_selection import iterative_train_test_split
from skmultilearn.model_selection import IterativeStratification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from imblearn.pipeline import Pipeline 
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

import torch
import transformers 

# Suppress every warning for the rest of the session
warnings.filterwarnings("ignore")
# Seed NumPy's global random number generator
# NOTE(review): only NumPy is seeded here; torch has no explicit seed in this notebook
np.random.seed(0)

## 1. Prepare data

In [3]:
import pickle
# NOTE(security): read_pickle unpickles the file, which can execute arbitrary code.
# Only load this file from a source you trust.
df = pd.read_pickle('../input/reports-tokenized/reports_tokenized.p')
# Preview the first rows instead of rendering the whole frame
df.head()

In [4]:
# Store labels
# SDG numbers that have a label column (10 and 14 are not in the list)
sdg_numbers = (1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17)
labels = ['SDG_' + str(number) for number in sdg_numbers]

# Integer label matrix: one row per row of `df`, one column per entry of `labels`
label_frame = df[labels]
Y = label_frame.to_numpy().astype(int)

## 2. Compute embeddings with pre-trained BERT model

In [5]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

In [6]:
# Model class, tokenizer class and checkpoint name used to compute the embeddings
model_class = transformers.BertModel
tokenizer_class = transformers.BertTokenizer
pretrained_weights = 'bert-base-cased'

In [7]:
# Load the pretrained tokenizer for the chosen checkpoint
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

# Load the pretrained model, then move it to the selected device
model = model_class.from_pretrained(pretrained_weights)
model = model.to(device)

In [8]:
# Inference (use GPU)

# Inputs: one cleaned report text per row of `df`
input_data = list(df['CSR_Text_clean'])
batch_size = 32

# One [CLS] embedding per input text. The width is read from the model config
# (768 for bert-base) so a different checkpoint does not break the assignment below.
outputs = np.zeros((len(input_data), model.config.hidden_size))


def chunker(seq, batch_size=batch_size):
    """Return a generator over consecutive slices of `seq`.

    Each slice holds at most `batch_size` items; the last one may be shorter.
    The default `batch_size` is the module-level value at definition time.
    """
    return (seq[pos:pos + batch_size] for pos in range(0, len(seq), batch_size))


# The generator has no length, so tell tqdm how many batches to expect
n_batches = (len(input_data) + batch_size - 1) // batch_size

# Make inference mode explicit (disables dropout) regardless of earlier kernel state
model.eval()

i = 0
for sentence_batch in tqdm(chunker(input_data), total=n_batches):
    # Tokenize batch (truncate to BERT's 512-token limit, pad to the longest text in the batch)
    tokenized_sentences = tokenizer(sentence_batch, max_length=512,
                                truncation=True, padding=True,
                                return_tensors="pt", return_attention_mask=True)
    with torch.no_grad():
        # Compute embeddings for batch
        model_output = model(**tokenized_sentences.to(device))
    # Store [CLS] tokens: first element of the output is the last hidden state,
    # position 0 along the sequence axis is the [CLS] token
    cls_embeddings = model_output[0][:, 0, :].cpu().numpy()
    # Advance by the real batch length so the final, shorter batch is handled explicitly
    outputs[i:i + len(sentence_batch)] = cls_embeddings
    i += len(sentence_batch)

## 3. Classification (no prior balancing of data)

In [9]:
# Split into training and test data (stratified for multi-label)
# 20% of the rows go to the test set; the result is unpacked in the order
# X_train, Y_train, X_test, Y_test
X_train, Y_train, X_test, Y_test = iterative_train_test_split(
    outputs,
    Y,
    test_size=0.2,
)

In [16]:
# Train Logistic Regression Classifier
# One liblinear logistic regression is wrapped in a one-vs-rest classifier
# and fitted on the training embeddings and label matrix
base_classifier = LogisticRegression(solver='liblinear')
ovr = OneVsRestClassifier(base_classifier)
ovr.fit(X_train, Y_train)

In [17]:
# Predict the label matrix for the held-out test embeddings with the fitted one-vs-rest model
predictions = ovr.predict(X_test)

In [18]:
# Pass the SDG column names so each report row reads 'SDG_n' instead of a positional
# index 0-14 (the index does not match the SDG number because SDG 10 and 14 are absent)
print(classification_report(Y_test, predictions, target_names=labels))

In [19]:
# Confusion matrices: one heatmap per SDG label column
sdgs = [1,2,3,4,5,6,7,8,9,11,12,13,15,16,17]
fig, axes = plt.subplots(nrows=5, ncols=3, figsize=(15,22))
plt.subplots_adjust(hspace = 0.5)
# Pair each subplot with its SDG number; `counter` is the matching column index
for counter, (ax, sdg) in enumerate(zip(axes.flatten(), sdgs)):
    # labels=[0, 1] pins the matrix to 2x2 even when a column contains a single class
    # in both the true and the predicted labels (otherwise it collapses to 1x1)
    cm = confusion_matrix(Y_test[:,counter], predictions[:,counter], labels=[0, 1])
    sns.heatmap(cm, annot=True, fmt='g', ax=ax, cmap='Greens')
    ax.set_title('SDG ' + str(sdg))
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')

## 4. Classification (with prior balancing of data)

In [23]:
# Perform Grid Search to find optimal parameters

# Fixed seed for SMOTE so the synthetic minority samples are the same on every run
SMOTE_SEED = 0
# Number of cross-validation folds (10, matching the name of the CV object below)
N_CV_SPLITS = 10

# Pipeline: Resample data with smote and pass output to classifier
pipeline = Pipeline([('smote', SMOTE(random_state=SMOTE_SEED)),
                     ('clf', LogisticRegression(solver='liblinear'))])

# Transform Y into single labels and pass single label data to different copies of pipeline
ovr = OneVsRestClassifier(pipeline)

# Parameter to test in Grid Search (regularisation strength C of the logistic regression)
parameters = {'estimator__clf__C': [1,2]}

# Use stratified sampling
stratified_10_fold_cv = IterativeStratification(n_splits=N_CV_SPLITS, order=1)
# Optimize for weighted F1-score
scorer = make_scorer(f1_score, average = 'weighted')
# Grid Search (n_jobs=-1 uses all available cores)
grid_search = GridSearchCV(ovr, parameters, cv=stratified_10_fold_cv, scoring=scorer, n_jobs=-1, verbose=5)

grid_search.fit(X_train, Y_train)

In [24]:
# Report the winning parameter setting, the refitted estimator and its mean CV score
for result_attribute in ('best_params_', 'best_estimator_', 'best_score_'):
    print(getattr(grid_search, result_attribute))

In [25]:
# Evaluation: predict the test set with the grid search's refitted best estimator
predictions = grid_search.predict(X_test)
# Pass the SDG column names so each report row reads 'SDG_n' instead of a positional
# index 0-14 (the index does not match the SDG number because SDG 10 and 14 are absent)
print(classification_report(Y_test, predictions, target_names=labels))

# Confusion matrices: one heatmap per SDG label column
sdgs = [1,2,3,4,5,6,7,8,9,11,12,13,15,16,17]
fig, axes = plt.subplots(nrows=5, ncols=3, figsize=(15,22))
plt.subplots_adjust(hspace = 0.5)
# Pair each subplot with its SDG number; `counter` is the matching column index
for counter, (ax, sdg) in enumerate(zip(axes.flatten(), sdgs)):
    # labels=[0, 1] pins the matrix to 2x2 even when a column contains a single class
    # in both the true and the predicted labels (otherwise it collapses to 1x1)
    cm = confusion_matrix(Y_test[:,counter], predictions[:,counter], labels=[0, 1])
    sns.heatmap(cm, annot=True, fmt='g', ax=ax, cmap='Greens')
    ax.set_title('SDG ' + str(sdg))
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')