# Logistic Regression with Transfer Learning

In [2]:
from sklearn.datasets import fetch_20newsgroups
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
texts = data.data
labels = data.target

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_texts(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
    return inputs.to(device)

model = BertModel.from_pretrained('bert-base-uncased').to(device)

model.eval()

batch_size = 16 
features = []
for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i+batch_size]
    encoded_inputs = encode_texts(batch_texts)
    with torch.no_grad():
        outputs = model(**encoded_inputs)
        batch_features = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        features.append(batch_features)

features = np.vstack(features)

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

predictions = classifier.predict(X_test)
print(classification_report(y_test, predictions))


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


              precision    recall  f1-score   support

           0       0.52      0.54      0.53       151
           1       0.53      0.54      0.53       202
           2       0.50      0.49      0.50       195
           3       0.44      0.46      0.45       183
           4       0.55      0.48      0.51       205
           5       0.63      0.64      0.64       215
           6       0.74      0.69      0.72       193
           7       0.47      0.66      0.55       196
           8       0.58      0.64      0.60       168
           9       0.87      0.81      0.84       211
          10       0.88      0.83      0.85       198
          11       0.71      0.68      0.69       201
          12       0.59      0.52      0.55       202
          13       0.82      0.78      0.80       194
          14       0.71      0.74      0.73       189
          15       0.67      0.69      0.68       202
          16       0.59      0.60      0.60       188
          17       0.73    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## Predict

In [4]:
def predict_new_texts(new_texts, tokenizer, model, classifier, device):

    model.eval()
    

    encoded_inputs = tokenizer(new_texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
    encoded_inputs = encoded_inputs.to(device)
    

    with torch.no_grad():
        outputs = model(**encoded_inputs)

        features = outputs.last_hidden_state[:, 0, :].cpu().numpy()


    predictions = classifier.predict(features)
    return predictions

new_texts = [
    "Healthcare advancements help to increase life expectancy",
    "Christ is king!."
]


predictions = predict_new_texts(new_texts, tokenizer, model, classifier, device)


print("Predictions:", predictions)
print("Predicted categories:", [data.target_names[pred] for pred in predictions])


Predictions: [ 2 19]
Predicted categories: ['comp.os.ms-windows.misc', 'talk.religion.misc']
