In [2]:
# libraries import 
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

pd.set_option('display.max_colwidth', None)

In [3]:
# loaading data
df_train = pd.read_csv('train.csv')
df_eval = pd.read_csv('eval.csv')

In [4]:
df_train.shape, df_eval.shape

((2061, 3), (9000, 3))

In [5]:
def preprocess_text(text):
    # convert to lowercase
    text = text.lower()
    
    # remove punctuation 
    text = re.sub(r'[^\w\s]', '', text)
    
    # remove stop words
    stop_words = set(stopwords.words('english'))
    text_tokens = nltk.word_tokenize(text)
    filtered_text = [word for word in text_tokens if word not in stop_words]
    text = ' '.join(filtered_text)

    return text


In [6]:
# download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\D SAIPAVAN
[nltk_data]     KUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# removing the stop words helps to decrease the computation time
text = df_train['text'][6]
print(preprocess_text(text),'\n',text)

zoom excellent meeting app 
 zoom is an excellent meeting app.


In [8]:
# so we have preprocessed text
# we can see that in training data there are only positive samples 
# so we need to balance the data by adding negative samples
# for adding negative samples by randomly selecting a text and adding current reason 


In [9]:
# applying preprocessing to the text in df_train
df_train['text'] = df_train['text'].apply(preprocess_text)
df_train['reason'] = df_train['reason'].apply(preprocess_text)
df_eval['text'] = df_eval['text'].apply(preprocess_text)
df_eval['reason'] = df_eval['reason'].apply(preprocess_text)


In [10]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
model = SentenceTransformer('stsb-roberta-large') #roberta is a transformer model(base model) which is trained on a large corpus of text for the task of sentence similarity


In [11]:
train_list = []
for idx,row in df_train.iterrows(): 
        inp_example = {
                "texts" :row['text'],
                'reason': row['reason'], 
                'label': (row['label'])
                       }
        train_list.append(inp_example)
test_list = []
for idx,row in df_eval.iterrows():  
        inp_example = inp_example = {
                "texts" :row['text'],
                'reason': row['reason'], 
                'label': row['label']
                       }
        test_list.append(inp_example)

In [12]:
for i in range(len(test_list)):
    embedding1 = model.encode(test_list[i]['texts'], convert_to_tensor=True)
    embedding2 = model.encode(test_list[i]['reason'], convert_to_tensor=True)
# compute similarity scores of two embeddings
    cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
    if cosine_scores > 0.5:
        test_list[i]['pred'] = 1
    else:
        test_list[i]['pred'] = 0

    print(cosine_scores, test_list[i]['label'], test_list[i]['pred'])

tensor([[0.6732]]) 1 1
tensor([[0.7836]]) 0 1
tensor([[0.2829]]) 0 0
tensor([[0.3715]]) 0 0
tensor([[0.4413]]) 0 0
tensor([[0.4727]]) 1 0
tensor([[0.0450]]) 0 0
tensor([[0.6454]]) 1 1
tensor([[0.3686]]) 0 0
tensor([[0.3925]]) 0 0
tensor([[0.2735]]) 0 0
tensor([[0.4236]]) 0 0
tensor([[0.4069]]) 1 0
tensor([[0.2796]]) 1 0
tensor([[0.4030]]) 0 0
tensor([[0.2157]]) 1 0
tensor([[0.4821]]) 1 0
tensor([[0.3553]]) 0 0
tensor([[0.5774]]) 0 1
tensor([[0.5297]]) 0 1
tensor([[0.4579]]) 0 0
tensor([[0.1927]]) 0 0
tensor([[0.6561]]) 0 1
tensor([[0.2074]]) 0 0
tensor([[0.0943]]) 0 0
tensor([[0.4823]]) 1 0
tensor([[0.8109]]) 1 1
tensor([[0.6028]]) 1 1
tensor([[0.2562]]) 0 0
tensor([[0.4216]]) 0 0
tensor([[0.5522]]) 0 1
tensor([[0.2215]]) 1 0
tensor([[0.2572]]) 0 0
tensor([[0.1025]]) 1 0
tensor([[0.3471]]) 1 0
tensor([[0.3109]]) 0 0
tensor([[0.4582]]) 0 0
tensor([[0.5147]]) 1 1
tensor([[0.3477]]) 0 0
tensor([[0.1528]]) 0 0
tensor([[0.5863]]) 0 1
tensor([[0.1387]]) 0 0
tensor([[0.3573]]) 0 0
tensor([[0.

KeyboardInterrupt: 

In [13]:
y_true = [l['label'] for l in test_list[:700]]
y_pred = [l['pred'] for l in test_list[:700]]

In [14]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming y_true and y_pred are arrays of binary labels (0 or 1)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))
print("F1 score: {:.2f}".format(f1))


Precision: 0.55
Recall: 0.50
F1 score: 0.52


In [15]:
sum = 0
for i in range(100):
    sum = (test_list[i]['label'] == test_list[i]['pred']) + sum
    # if (test_list[i]['label'] != test_list[i]['pred']):
    #     print(f"text: {test_list[i]['texts']}, reason: {test_list[i]['reason']}")
    #     print(f"labelled: {test_list[i]['label']}, predicted: {test_list[i]['pred']}")
accuracy = sum/100
print(accuracy)

0.64
