In [1]:
# libraries import 
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

pd.set_option('display.max_colwidth', None)

In [3]:
# loaading data
df_train = pd.read_csv('train.csv')
df_eval = pd.read_csv('eval.csv')

In [4]:
df_train.shape, df_eval.shape

((2061, 3), (9000, 3))

In [5]:
def preprocess_text(text):
    # convert to lowercase
    text = text.lower()
    
    # remove punctuation 
    text = re.sub(r'[^\w\s]', '', text)
    
    # remove stop words
    stop_words = set(stopwords.words('english'))
    text_tokens = nltk.word_tokenize(text)
    filtered_text = [word for word in text_tokens if word not in stop_words]
    text = ' '.join(filtered_text)

    return text


In [6]:
# download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\D SAIPAVAN
[nltk_data]     KUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# removing the stop words helps to decrease the computation time
text = df_train['text'][6]
print(preprocess_text(text),'\n',text)

zoom excellent meeting app 
 zoom is an excellent meeting app.


In [8]:
# so we have preprocessed text
# we can see that in training data there are only positive samples 
# so we need to balance the data by adding negative samples
# for adding negative samples by randomly selecting a text and adding current reason 


In [9]:
# applying preprocessing to the text in df_train
df_train['text'] = df_train['text'].apply(preprocess_text)
df_train['reason'] = df_train['reason'].apply(preprocess_text)
df_eval['text'] = df_eval['text'].apply(preprocess_text)
df_eval['reason'] = df_eval['reason'].apply(preprocess_text)


In [10]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
model = SentenceTransformer('stsb-roberta-large') #roberta is a transformer model(base model) which is trained on a large corpus of text for the task of sentence similarity


In [11]:
train_list = []
for idx,row in df_train.iterrows(): 
        inp_example = {
                "texts" :row['text'],
                'reason': row['reason'], 
                'label': (row['label'])
                       }
        train_list.append(inp_example)
test_list = []
for idx,row in df_eval.iterrows():  
        inp_example = inp_example = {
                "texts" :row['text'],
                'reason': row['reason'], 
                'label': row['label']
                       }
        test_list.append(inp_example)

In [15]:
for i in range(len(test_list)):
    embedding1 = model.encode(test_list[i]['texts'], convert_to_tensor=True)
    embedding2 = model.encode(test_list[i]['reason'], convert_to_tensor=True)
# compute similarity scores of two embeddings
    cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
    if cosine_scores > 0.5:
        test_list[i]['pred'] = 1
    else:
        test_list[i]['pred'] = 0

    print(cosine_scores, test_list[i]['label'], test_list[i]['pred'])

KeyboardInterrupt: 

In [29]:
y_true = [l['label'] for l in test_list[:1000]]
y_pred = [l['pred'] for l in test_list[:1000]]

In [30]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming y_true and y_pred are arrays of binary labels (0 or 1)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))
print("F1 score: {:.2f}".format(f1))


Precision: 0.55
Recall: 0.47
F1 score: 0.51


In [14]:
sum = 0
for i in range(100):
    sum = (test_list[i]['label'] == test_list[i]['pred']) + sum
    # if (test_list[i]['label'] != test_list[i]['pred']):
    #     print(f"text: {test_list[i]['texts']}, reason: {test_list[i]['reason']}")
    #     print(f"labelled: {test_list[i]['label']}, predicted: {test_list[i]['pred']}")
accuracy = sum/100
print(accuracy)

text: want connect tv one device another, reason: want compatibility smart televisions
labelled: 0, predicted: 1
text: enjoyed watching favorite shows movies, reason: good watch shows
labelled: 1, predicted: 0
text: let play movies, reason: unable view movies
labelled: 1, predicted: 0
text: love much watch family kids dramas, reason: good watch shows
labelled: 1, predicted: 0
text: zoom useful home school meetings home, reason: good app students
labelled: 1, predicted: 0
text: good application great say network bothering, reason: facing network issues app
labelled: 1, predicted: 0
text: good meetings, reason: good app conducting online meeting
labelled: 0, predicted: 1
text: reactivate subscription please, reason: unable unsubscribe
labelled: 0, predicted: 1
text: sensational disney, reason: app good watch disney content
labelled: 0, predicted: 1
text: tried uninstalling reinstalling several times luck, reason: reinstalling work
labelled: 1, predicted: 0
text: sensatonal wonderful expe

In [33]:
def generate_negative_samples(data):
    negative_data = pd.DataFrame(columns=['text', 'reason', 'label'])
    for i, row in data.iterrows():
        negative_row = row.copy()
        negative_row['reason'] =  data.sample()['text'].values[0]
        negative_row['label'] = 0
        negative_data = pd.concat([negative_data, negative_row.to_frame().T], ignore_index=True)
    return negative_data


In [34]:
generate_negative_samples(df_train).head()

Unnamed: 0,text,reason,label
0,amazing app online classesbut,works great large groups small,0
1,practical easy use,worst app showing live cricket properly,0
2,app good video conferencing,many ads even single one option skip ads long becomes irritating even open app,0
3,download zoom app,go full screen anymore latest update android operating system,0
4,able download app,trouble viewing multiple screens upgrade,0


In [35]:
# randomly mapping the text and reason to generate negative samples might not be the best way to generate negative samples
# because we might end up with a negative sample which is actually a positive sample so we need to filter out those samples but that is another story
df_train_1 = pd.concat([df_train, generate_negative_samples(df_train)], ignore_index=True)

In [31]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW , trainer
from sklearn.metrics import precision_recall_fscore_support

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def train(model, train_dataloader, val_dataloader, optimizer, epochs):
    best_val_loss = float('inf')
    
    for epoch in range(epochs):
        # Train
        model.train()
        train_loss = 0.0
        for batch in train_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            train_loss += loss.item()
            
            loss.backward()
            optimizer.step()
        
        # Validation
        model.eval()
        val_loss = 0.0
        val_preds = []
        val_labels = []
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                val_loss += loss.item()
                
                preds = torch.argmax(outputs.logits, dim=1)
                val_preds.extend(preds.tolist())
                val_labels.extend(labels.tolist())
        
        val_loss /= len(val_dataloader)
        val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='binary')
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pt')
        
        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Precision: {val_precision:.4f} | Val Recall: {val_recall:.4f} | Val F1: {val_f1:.4f}')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [38]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW

class TextReasonDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text = self.df.iloc[idx]['text']
        reason = self.df.iloc[idx]['reason']
        label = self.df.iloc[idx]['label']
        
        encoding = self.tokenizer(text, reason, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        
        sample = {
            'input_ids': encoding['input_ids'][0],
            'attention_mask': encoding['attention_mask'][0],
            'labels': torch.tensor(label)
        }
        
        return sample

# Load data
# train_df = pd.read_csv('train.csv')
# neg_df = generate_negatives(train_df)
# train_df = pd.concat([train_df, neg_df], ignore_index=True)

# # Preprocess text
# train_df['text'] = train_df['text'].apply(preprocess_text)

# Split into train and validation sets
trainW, val = train_test_split(df_train_1, test_size=0.2, random_state=42)

# Create datasets and dataloaders
train_dataset = TextReasonDataset(trainW, tokenizer, max_length=128)
val_dataset = TextReasonDataset(val, tokenizer, max_length=128)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Train model
optimizer = AdamW(model.parameters(), lr=2e-5)
train(model, train_dataloader, val_dataloader, optimizer, epochs=5)


TypeError: 'DataFrame' object is not callable

In [None]:
def error_analysis(model, dataloader):
    model.eval()
    preds = []
    labels = []
    texts = []
    reasons = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels.extend(batch['labels'].tolist())
            texts.extend(batch['text'])
            reasons.extend(batch['reason'])
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            batch_preds = torch.argmax(outputs.logits, dim=1)
            preds.extend(batch_preds.tolist())
    
    for i in range(len(preds)):
        if preds[i] != labels[i]:
            print(f'Text: {texts[i]}')
            print(f'Reason: {reasons[i]}')
            print(f'Predicted Label: {preds[i]}')
            print(f'True Label: {labels[i]}\n')


In [None]:
test_df = pd.read_csv('evaluation.csv')
test_df['text'] = test_df['text'].apply(preprocess_text)

test_dataset = TextReasonDataset(test_df, tokenizer, max_length=128)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

error_analysis(model, test_dataloader)


In [None]:
    # load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
# perform the ablation study
# print(f"Precision: {p:.4f}, Recall: {r:.4f}, F1-score: {f1:.4f}")


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

In [None]:
def train_model(train_data, tokenizer, model):
    # tokenize the text data
    train_encodings = tokenizer(train_data['text'].tolist(), train_data['reason'].tolist(), truncation=True, padding=True)
    train_labels = train_data['label'].tolist()

    # define the training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    # define the trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_encodings,
        # train_labels=train_labels,
    )

    # train the model
    trainer.train()


In [None]:
train_model(df_train_1, tokenizer, model)

***** Running training *****
  Num examples = 2
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3
  Number of trainable parameters = 66955010


  0%|          | 0/3 [00:00<?, ?it/s]

ValueError: The batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: input_ids,attention_mask,head_mask,inputs_embeds,labels,output_attentions,output_hidden_states,return_dict,labels,label_ids,label.