## 1. Library Initialization

In [7]:
from copy import deepcopy
import pandas as pd
import numpy as np
import re

# Preprocessing
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords

# Testing and Evaluation
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report

# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Imbalanced
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE

## 2. Data Loading

In [8]:
# Dataset locations, relative to the notebook's working directory.
# NOTE(review): "train.json1" ends in the digit 1 rather than the letter l
# ("jsonl"); the name is kept unchanged here — confirm it matches the file on disk.
json_file = "train.json1"
csv_file = "dev.csv"
test_file = "test.csv"

# The training split is read with lines=True (one JSON record per line);
# the dev ("unseen") and test splits are read as CSV.
train_df = pd.read_json(json_file, lines=True)
unseen_df = pd.read_csv(csv_file)
test_df = pd.read_csv(test_file)

# Report each split under its own label so the sizes cannot be confused.
print("Train size:", len(train_df))
print("Dev size:", len(unseen_df))
print("Test size:", len(test_df))

Train size: 1300
Dev size: 200
Dev size: 200


## 3. Data Preprocessing

In [9]:
# Shared NLTK resources used by clean_text for every split (train, dev and test).
STOPWORDS = set(stopwords.words('english'))  # set, for cheap membership tests
lemmatizer = WordNetLemmatizer()
tokenizer = TreebankWordTokenizer()

def clean_text(text):
    """Normalise one document into a space-separated string of lemmas.

    Steps, in order:
      1. lowercase, trim, and collapse every run of whitespace to one space;
      2. delete every character that is not a word character, whitespace,
         or one of ``% : - .``;
      3. tokenise with the module-level ``tokenizer``;
      4. drop tokens found in ``STOPWORDS``;
      5. lemmatise the remaining tokens with the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    words = text.lower().strip().split()
    collapsed = " ".join(words)
    stripped = re.sub(r'[^\w\s%\:\-\.]', '', collapsed)
    # Stop words are filtered on the raw token, before lemmatisation.
    kept_tokens = filter(lambda tok: tok not in STOPWORDS, tokenizer.tokenize(stripped))
    return " ".join(map(lemmatizer.lemmatize, kept_tokens))

# Work on independent deep copies so the frames loaded from disk stay untouched.
processed_train_df, processed_unseen_df, processed_test_df = (
    frame.copy(deep=True) for frame in (train_df, unseen_df, test_df)
)

# Apply the same text normalisation to the "text" column of every split.
for split_df in (processed_train_df, processed_unseen_df, processed_test_df):
    split_df["text"] = split_df["text"].apply(clean_text)

# Display names passed as target_names to classification_report.
# NOTE(review): presumably position i names integer label i — confirm against the data.
label_names = ["irrelevant", "metrics", "strategy", "risk", "governance"]

## 4. Model Training
#### Raw Input Initialization

In [10]:
# Raw inputs: for each split, an independent copy of the cleaned text (X)
# and of the label column (y).
X_train_raw_TV, y_train_TV = (
    processed_train_df[column].copy(deep=True) for column in ("text", "label")
)

X_unseen_raw_TV, y_unseen_TV = (
    processed_unseen_df[column].copy(deep=True) for column in ("text", "label")
)

X_test_raw_TV, y_test_TV = (
    processed_test_df[column].copy(deep=True) for column in ("text", "label")
)

#### Final GridSearch Model

In [11]:
# Pipeline: TF-IDF features -> SMOTE oversampling -> logistic regression.
grid_pipeline = ImbPipeline(steps=[
    ('tfidf', TfidfVectorizer(sublinear_tf=True, lowercase=True, stop_words='english')),
    ('smote', SMOTE(random_state=42)),
    ('clf', LogisticRegression(random_state=42)),
])

# Every list holds a single value, so the search evaluates exactly one
# candidate; GridSearchCV is effectively used for 5-fold cross-validation
# of that configuration followed by a refit.
grid_parameters = dict(
    # TF-IDF vectoriser
    tfidf__ngram_range=[(1, 2)],
    tfidf__max_df=[0.8],
    tfidf__min_df=[3],
    tfidf__max_features=[1000],
    # SMOTE: target sample counts for labels 1, 3 and 4
    smote__sampling_strategy=[{1: 350, 3: 450, 4: 300}],
    # Logistic regression
    clf__C=[1.5],
    clf__penalty=['l1'],
    clf__solver=['liblinear'],
    clf__class_weight=[None],
    clf__max_iter=[1000],
)

# Shuffled, seeded stratified folds; candidates are ranked by macro-averaged F1.
grid_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=grid_pipeline,
    param_grid=grid_parameters,
    cv=grid_kf,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_raw_TV, y_train_TV)

grid_best_params = grid_search.best_params_
print(f"\nBest Parameters from GridSearchCV: {grid_best_params}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Best Parameters from GridSearchCV: {'clf__C': 1.5, 'clf__class_weight': None, 'clf__max_iter': 1000, 'clf__penalty': 'l1', 'clf__solver': 'liblinear', 'smote__sampling_strategy': {1: 350, 3: 450, 4: 300}, 'tfidf__max_df': 0.8, 'tfidf__max_features': 1000, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 2)}


#### GridSearch Pipeline - Evaluation
Evaluate the best estimator found by GridSearchCV on the train, dev, and test sets.

In [12]:
# Best pipeline found by the grid search.
grid_best_model = grid_search.best_estimator_

# Train split: these are the samples the search was fitted on, so the scores
# are an optimistic reference point rather than a generalisation estimate.
grid_y_pred_train = grid_best_model.predict(X_train_raw_TV)
print("\nModel Evaluation on Train Set (from GridSearchCV):")
print(classification_report(y_train_TV, grid_y_pred_train, target_names=label_names, zero_division=0, digits=4))

# Dev ("unseen") split: headed "Dev Set" so it is distinguishable from the
# test-set report printed below.
grid_y_pred_unseen = grid_best_model.predict(X_unseen_raw_TV)
print("\nModel Evaluation on Dev Set (from GridSearchCV):")
print(classification_report(y_unseen_TV, grid_y_pred_unseen, target_names=label_names, zero_division=0, digits=4))

# Test split.
grid_y_pred_test = grid_best_model.predict(X_test_raw_TV)
print("\nModel Evaluation on Test Set (from GridSearchCV):")
print(classification_report(y_test_TV, grid_y_pred_test, target_names=label_names, zero_division=0, digits=4))


Model Evaluation on Train Set (from GridSearchCV):
              precision    recall  f1-score   support

  irrelevant     0.8750    0.9333    0.9032       300
     metrics     0.8577    0.8510    0.8543       255
    strategy     0.8857    0.8362    0.8603       519
        risk     0.7636    0.7683    0.7660       164
  governance     0.7222    0.8387    0.7761        62

    accuracy                         0.8531      1300
   macro avg     0.8209    0.8455    0.8320      1300
weighted avg     0.8546    0.8531    0.8531      1300


Model Evaluation on Test Set (from GridSearchCV):
              precision    recall  f1-score   support

  irrelevant     0.6471    0.7586    0.6984        29
     metrics     0.6250    0.6000    0.6122        25
    strategy     0.8300    0.8058    0.8177       103
        risk     0.4348    0.3704    0.4000        27
  governance     0.6842    0.8125    0.7429        16

    accuracy                         0.7150       200
   macro avg     0.6442    0

#### GridSearch Model Export

In [13]:
import joblib

# Single definition of the export location, used for both the write and the read-back.
export_path = 'logistic_best_model.joblib'

# Persist the best pipeline from the grid search.
joblib.dump(grid_best_model, export_path)
print("Model Saved.")

# Read the file back and display it as a round-trip check.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
loaded_model = joblib.load(export_path)
loaded_model

Model Saved.
