In [None]:
!pip install transformers



In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from transformers import BartTokenizer, BartForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec

In [None]:
# Read the training and testing CSV files into DataFrames.
TRAIN_CSV_PATH = 'Train_dataset.csv'
TEST_CSV_PATH = 'Test_dataset.csv'

train_data = pd.read_csv(TRAIN_CSV_PATH)
test_data = pd.read_csv(TEST_CSV_PATH)

In [None]:
# Pull the 'Text' and 'L1' columns of each DataFrame out into plain Python lists.
train_texts = train_data['Text'].tolist()
train_labels = train_data['L1'].tolist()

test_texts = test_data['Text'].tolist()
test_labels = test_data['L1'].tolist()


In [None]:
# Label name -> class id, where the id is the label's position in the tuple below.
# NOTE(review): no visible cell reads this dict; the ids used for training come
# from LabelEncoder — confirm whether the two are meant to agree.
label_mapping = {
    label_name: class_id
    for class_id, label_name in enumerate(("tech person", "non-tech person"))
}

In [None]:
# Fit a LabelEncoder on the training labels and, in the same call,
# convert those labels to integer class ids.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(y=train_labels)

In [None]:
# Hold out 20% of the labelled training CSV for evaluation (fixed random_state).
# The texts are taken from the untouched DataFrame column rather than from
# `train_texts`, which this cell rebinds: re-running the cell therefore yields
# the same split instead of pairing an already-split list with the full
# `encoded_labels` array (which raises on the length mismatch).
# NOTE(review): this also rebinds `test_texts`/`test_labels`, so the rows read
# from Test_dataset.csv are not what the model is evaluated on — confirm that
# is intended.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    list(train_data['Text']),
    encoded_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Whitespace-tokenize every training text, then fit a Word2Vec model on the
# resulting token lists.
tokenized_texts = [sentence.split() for sentence in train_texts]
word2vec_model = Word2Vec(
    sentences=tokenized_texts,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
)  # Example parameters


In [None]:
def _mean_word_vector(tokens):
    """Return the element-wise mean of the Word2Vec vectors of ``tokens``.

    Tokens that are not in ``word2vec_model.wv`` are skipped. When none of the
    tokens has a vector (including an empty token list), a zero vector of
    length ``word2vec_model.vector_size`` is returned instead.
    """
    known_vectors = [word2vec_model.wv[tok] for tok in tokens if tok in word2vec_model.wv]
    if not known_vectors:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(known_vectors, axis=0)


# One averaged vector per tokenized training text, collected into a NumPy array.
train_embeddings = np.array([_mean_word_vector(tokens) for tokens in tokenized_texts])

In [None]:
# Load the pretrained BART tokenizer and a BART sequence-classification model.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

# Size the classification head from the classes the LabelEncoder actually
# found, instead of hard-coding 2: the head then always matches the range of
# the encoded label ids fed to the loss.
num_classes = len(label_encoder.classes_)
binary_model = BartForSequenceClassification.from_pretrained(
    'facebook/bart-base',
    num_labels=num_classes,
)

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize both text lists with truncation and padding enabled, asking the
# tokenizer for PyTorch tensors.
tokenizer_kwargs = dict(truncation=True, padding=True, return_tensors='pt')
train_encodings = tokenizer(train_texts, **tokenizer_kwargs)
test_encodings = tokenizer(test_texts, **tokenizer_kwargs)

# Copy the encodings into plain dicts (key -> tensor).
train_inputs = dict(train_encodings.items())
test_inputs = dict(test_encodings.items())

# Integer class ids for the training split as a long tensor.
train_labels_tensor = torch.tensor(train_labels, dtype=torch.long)


In [None]:
# Turn the averaged Word2Vec vectors into a float32 PyTorch tensor.
# astype(copy=True) always allocates a fresh array, so the tensor does not
# share memory with `train_embeddings`.
train_embeddings_tensor = torch.from_numpy(
    train_embeddings.astype(np.float32, copy=True)
)

In [None]:
# Set up the optimizer.
# transformers.AdamW is deprecated in favour of PyTorch's own implementation,
# so build the optimizer from torch.optim. eps and weight_decay are set
# explicitly to the defaults of the transformers class so the optimisation
# hyper-parameters stay the same as before.
binary_optimizer = torch.optim.AdamW(
    binary_model.parameters(),
    lr=1e-5,
    eps=1e-6,
    weight_decay=0.0,
)



In [None]:
# Fine-tune the classifier for a few epochs and print the hold-out accuracy
# after each one.
num_epochs = 3
batch_size = 16

# Send every batch to the device the model already lives on, so the cell
# works unchanged whether the model is on CPU or has been moved to a GPU.
device = next(binary_model.parameters()).device
num_train = len(train_labels_tensor)
num_test = len(test_labels)

# Seeded generator: the per-epoch shuffling below is reproducible across runs.
shuffle_rng = torch.Generator().manual_seed(42)

for epoch in range(num_epochs):
    binary_model.train()
    # Visit the training samples in a fresh random order every epoch rather
    # than replaying the exact same batches each time.
    order = torch.randperm(num_train, generator=shuffle_rng)
    for i in range(0, num_train, batch_size):
        batch_idx = order[i:i + batch_size]
        batch_inputs = {key: val[batch_idx].to(device) for key, val in train_inputs.items()}
        batch_labels = train_labels_tensor[batch_idx].to(device)

        binary_optimizer.zero_grad()
        outputs = binary_model(**batch_inputs, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        binary_optimizer.step()

    binary_model.eval()
    batch_predictions = []
    with torch.no_grad():
        # Evaluate in mini-batches: a single forward pass over the whole
        # hold-out set can exhaust memory.
        for i in range(0, num_test, batch_size):
            eval_inputs = {key: val[i:i + batch_size].to(device) for key, val in test_inputs.items()}
            eval_logits = binary_model(**eval_inputs).logits
            # Move predictions back to the CPU before handing them to NumPy/sklearn.
            batch_predictions.append(eval_logits.argmax(dim=1).cpu())

    predicted_labels = torch.cat(batch_predictions).numpy()
    accuracy = accuracy_score(test_labels, predicted_labels)
    print(f'Epoch {epoch + 1} - Binary Test Accuracy: {accuracy:.4f}')

Epoch 1 - Binary Test Accuracy: 0.5000
Epoch 2 - Binary Test Accuracy: 0.5000
Epoch 3 - Binary Test Accuracy: 0.5000
