# Modeling

## Baselines

1. Random 
2. Rule-based
3. Simple ML

In [19]:
import random
import json
import pandas as pd
import numpy as np
import nltk
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
import sys
sys.path.append("..")
from utils import LabelEncoder, clean_text

nltk.download("stopwords")
STOPWORDS = stopwords.words("english")
stemmer = PorterStemmer()

ACCEPTED_TAGS = [
    "natural-language-processing",
    "computer-vision",
    "mlops",
    "graph-learning",
]

[nltk_data] Downloading package stopwords to /home/merlin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
df = pd.read_csv("../Data/datasets/labeled_projects.csv")

In [22]:
def set_seeds(seed=13):
    """Set seeds for reproducibility."""
    np.random.seed(seed)
    random.seed(seed)

# Replace dashes from tags & aliases
def replace_dash(x):
    return x.replace("-", " ")


def preprocess(df, lower, stem, min_freq):
    """Preprocess the data."""
    df["text"] = df.title + " " + df.description  # feature engineering
    df.text = df.text.apply(clean_text, lower=lower, stem=stem)  # clean text

    # Replace OOS tags with `other`
    oos_tags = [item for item in df.tag.unique() if item not in ACCEPTED_TAGS]
    df.tag = df.tag.apply(lambda x: "other" if x in oos_tags else x)

    # Replace tags below min_freq with `other`
    tags_above_freq = Counter(tag for tag in tags.elements()
                            if (tags[tag] >= min_freq))
    df.tag = df.tag.apply(lambda tag: tag if tag in tags_above_freq else None)
    df.tag = df.tag.fillna("other")

    return df

def get_data_splits(X, y, train_size=0.7):
    """Generate balanced data splits."""
    X_train, X_, y_train, y_ = train_test_split(
        X, y, train_size=train_size, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(
        X_, y_, train_size=0.5, stratify=y_)
    return X_train, X_val, X_test, y_train, y_val, y_test


In [23]:
# Set up
set_seeds()
min_freq = 75
tags = Counter(df.tag.values)
# Shuffle
df = df.sample(frac=1).reset_index(drop=True)
df = preprocess(df, lower=True, stem=False, min_freq=min_freq)
label_encoder = LabelEncoder().fit(df.tag)
X_train, X_val, X_test, y_train, y_val, y_test = \
    get_data_splits(X=df.text.to_numpy(), y=label_encoder.encode(df.tag))



In [24]:
# Label encoder
print (label_encoder)
print (label_encoder.classes)


<LabelEncoder(num_classes=4)>
['computer-vision', 'mlops', 'natural-language-processing', 'other']


### Random Predictions

In [25]:
# Generate random predictions
y_pred = np.random.randint(low=0, high=len(label_encoder), size=len(y_test))
print (y_pred.shape)
print (y_pred[0:5])


(144,)
[3 1 0 3 1]


In [26]:
# Evaluate
metrics = precision_recall_fscore_support(y_test, y_pred, average="weighted")
performance = {"precision": metrics[0], "recall": metrics[1], "f1": metrics[2]}
print (json.dumps(performance, indent=2))


{
  "precision": 0.2651501403406891,
  "recall": 0.19444444444444445,
  "f1": 0.21180219588238455
}


With weighted probabilities

In [27]:
# Class frequencies
p = [Counter(y_test)[index]/len(y_test) for index in range(len(label_encoder))]
# Generate weighted random predictions
y_pred = np.random.choice(a=range(len(label_encoder)), size=len(y_test), p=p)

# Evaluate
metrics = precision_recall_fscore_support(y_test, y_pred, average="weighted")
performance = {"precision": metrics[0], "recall": metrics[1], "f1": metrics[2]}
print (json.dumps(performance, indent=2))


{
  "precision": 0.33971088435374147,
  "recall": 0.3402777777777778,
  "f1": 0.3380212077641984
}


Rule-based predictions

In [29]:
def get_tag(text, aliases_by_tag):
    """If a token matches an alias,
    then add the corresponding tag class."""
    for tag, aliases in aliases_by_tag.items():
        if replace_dash(tag) in text:
            return tag
        for alias in aliases:
            if alias in text:
                return tag
    return None

# Aliases
aliases_by_tag = {
    "computer-vision": ["cv", "vision"],
    "mlops": ["production"],
    "natural-language-processing": ["nlp", "nlproc"]
}

In [30]:
# Sample
text = "A pretrained model hub for popular nlp models."
get_tag(text=clean_text(text), aliases_by_tag=aliases_by_tag)


'natural-language-processing'

In [31]:
# Prediction
tags = []
for text in X_test:
    tag = get_tag(text, aliases_by_tag=aliases_by_tag)
    tags.append(tag)


In [32]:
# Encode labels
y_pred = [label_encoder.class_to_index[tag] if tag is not None else -1 for tag in tags]


In [35]:
y_pred[:10]

[0, -1, -1, -1, -1, -1, -1, -1, -1, -1]

In [36]:
# Evaluate
metrics = precision_recall_fscore_support(y_test, y_pred, average="weighted")
performance = {"precision": metrics[0], "recall": metrics[1], "f1": metrics[2]}
print (json.dumps(performance, indent=2))


{
  "precision": 0.8611111111111112,
  "recall": 0.14583333333333334,
  "f1": 0.24508856682769725
}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Simple ML

### Vectorization with TF-IDF

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
# Saving raw X_test to compare with later
X_test_raw = X_test


In [40]:
# Tf-idf
vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(2,7))  # char n-grams
print (X_train[0])
X_train = vectorizer.fit_transform(X_train)
X_val = vectorizer.transform(X_val)
X_test = vectorizer.transform(X_test)
print (X_train.shape)  # scipy.sparse.csr_matrix


help read text summarization using flask huggingface text summarization translation questions answers generation using huggingface deployed using flask streamlit detailed guide github
(668, 99644)


In [41]:
# Class weights
counts = np.bincount(y_train)
class_weights = {i: 1.0/count for i, count in enumerate(counts)}
print (f"class counts: {counts},\nclass weights: {class_weights}")


class counts: [249  55 272  92],
class weights: {0: 0.004016064257028112, 1: 0.01818181818181818, 2: 0.003676470588235294, 3: 0.010869565217391304}


### Data imbalances

In [42]:
from imblearn.over_sampling import RandomOverSampler

In [43]:
# Oversample (training set)
oversample = RandomOverSampler(sampling_strategy="all")
X_over, y_over = oversample.fit_resample(X_train, y_train)


In [44]:
# Class weights
counts = np.bincount(y_over)
class_weights = {i: 1.0/count for i, count in enumerate(counts)}
print (f"class counts: {counts},\nclass weights: {class_weights}")


class counts: [272 272 272 272],
class weights: {0: 0.003676470588235294, 1: 0.003676470588235294, 2: 0.003676470588235294, 3: 0.003676470588235294}


In [46]:
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss, precision_recall_fscore_support


In [47]:
# Initialize model
model = SGDClassifier(
    loss="log", penalty="l2", alpha=1e-4, max_iter=1,
    learning_rate="constant", eta0=1e-1, power_t=0.1,
    warm_start=True)

# Train model
num_epochs = 100
for epoch in range(num_epochs):
    # Training
    model.fit(X_over, y_over)

    # Evaluation
    train_loss = log_loss(y_train, model.predict_proba(X_train))
    val_loss = log_loss(y_val, model.predict_proba(X_val))

    if not epoch%10:
        print(
            f"Epoch: {epoch:02d} | "
            f"train_loss: {train_loss:.5f}, "
            f"val_loss: {val_loss:.5f}"
        )





Epoch: 00 | train_loss: 1.16845, val_loss: 1.22934




Epoch: 10 | train_loss: 0.45566, val_loss: 0.67510




Epoch: 20 | train_loss: 0.31080, val_loss: 0.57088




Epoch: 30 | train_loss: 0.24762, val_loss: 0.52701




Epoch: 40 | train_loss: 0.21331, val_loss: 0.50354




Epoch: 50 | train_loss: 0.19246, val_loss: 0.48933




Epoch: 60 | train_loss: 0.17897, val_loss: 0.48018




Epoch: 70 | train_loss: 0.16985, val_loss: 0.47400




Epoch: 80 | train_loss: 0.16347, val_loss: 0.46966




Epoch: 90 | train_loss: 0.15879, val_loss: 0.46660




In [48]:
# Evaluate
y_pred = model.predict(X_test)
metrics = precision_recall_fscore_support(y_test, y_pred, average="weighted")
performance = {"precision": metrics[0], "recall": metrics[1], "f1": metrics[2]}
print (json.dumps(performance, indent=2))


{
  "precision": 0.9123204067589662,
  "recall": 0.9097222222222222,
  "f1": 0.9092087542087541
}


In [49]:
# Inference (with tokens similar to training data)
text = "Transfer learning with transformers for text classification."
y_pred = model.predict(vectorizer.transform([text]))
label_encoder.decode(y_pred)


['natural-language-processing']

In [50]:
# Probabilities
y_prob = model.predict_proba(vectorizer.transform([text]))
{tag:y_prob[0][i] for i, tag in enumerate(label_encoder.classes)}


{'computer-vision': 0.03596419903428381,
 'mlops': 0.0032200285854097335,
 'natural-language-processing': 0.950501863152314,
 'other': 0.010313909227992472}

We're going to create a custom predict function where if the majority class is not above a certain softmax score, then we predict the other class. In our objectives, we decided that precision is really important for us and that we can leverage the labeling and QA workflows to improve the recall during subsequent manual inspection.

In [51]:
# Determine first quantile softmax score for the correct class (on validation split)
y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)
threshold = np.quantile([y_prob[i][j] for i, j in enumerate(y_pred)], q=0.25)  # Q1
threshold


0.5961114032423355

In [52]:
# Custom predict function
def custom_predict(y_prob, threshold, index):
    """Custom predict function that defaults
    to an index if conditions are not met."""
    y_pred = [np.argmax(p) if max(p) > threshold else index for p in y_prob]
    return np.array(y_pred)

def predict_tag(texts):
    y_prob = model.predict_proba(vectorizer.transform(texts))
    other_index = label_encoder.class_to_index["other"]
    y_pred = custom_predict(y_prob=y_prob, threshold=threshold, index=other_index)
    return label_encoder.decode(y_pred)



In [55]:
# Inference (with tokens not similar to training data)
text = "Interpretability methods for explaining deep learning model behavior."
predict_tag(texts=[text])


['other']

In [57]:
# Evaluate
y_prob = model.predict_proba(X_test)
y_pred = custom_predict(y_prob=y_prob, threshold=threshold, index=label_encoder.class_to_index["other"])
metrics = precision_recall_fscore_support(y_test, y_pred, average="weighted")
performance = {"precision": metrics[0], "recall": metrics[1], "f1": metrics[2]}
print (json.dumps(performance, indent=2))


{
  "precision": 0.8847737716507599,
  "recall": 0.8125,
  "f1": 0.8296493465550844
}
