## 1- Imports

In [1]:
from sklearn.metrics import f1_score, precision_score, recall_score
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
import random


## 2- Data Preparation 

In [None]:
# Reading the data and saving it to 1 file

train_data = pd.read_csv("train.csv")
test_data = pd.read_excel("test.xlsx")

combined_data = pd.concat([train_data, test_data])
shuffled_data = combined_data.sample(frac=1).reset_index(drop=True)

shuffled_data.to_csv("train.csv", index=False)

In [None]:
# Defineing data preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = ''.join([c for c in text if c.isalnum() or c in [' ']])  
    return text

dataset = pd.read_csv("/kaggle/input/simple-text-dataset-tweets/train.csv")  # Replace "train.csv" with the path to your dataset
dataset['text'] = dataset['text'].apply(preprocess_text)

print ("debug 1")

## 3- Data Processing

In [None]:
# Sample tweet pairs and assign labels
def create_tweet_pairs(dataset, num_pairs_per_author=800, same_author_ratio=0.30):
    tweet_pairs = []
    authors = dataset['user'].unique()

    # Dictionary to store used tweet indices for each author
    used_tweet_indices = {author: set() for author in authors}

    for author in authors:
        tweets_by_author = dataset[dataset['user'] == author]['text'].tolist()
        num_same_author_pairs = int(num_pairs_per_author * same_author_ratio)
        num_diff_author_pairs = num_pairs_per_author - num_same_author_pairs
        
        # Sample tweet pairs from the same author
        sampled_indices_same = np.random.choice(len(tweets_by_author), size=(num_same_author_pairs, 2), replace=False)
        for idx1, idx2 in sampled_indices_same:
            if (idx1 not in used_tweet_indices[author]) and (idx2 not in used_tweet_indices[author]):
                tweet_pairs.append((tweets_by_author[idx1], tweets_by_author[idx2], author, author, 1))
                used_tweet_indices[author].add(idx1)
                used_tweet_indices[author].add(idx2)
        
        # Sample tweet pairs from different authors
        other_authors = authors[authors != author]
        num_pairs_per_diff_author = num_diff_author_pairs // len(other_authors)
        remainder_pairs = num_diff_author_pairs % len(other_authors)
        
        for other_author in other_authors:
            tweets_by_other_author = dataset[dataset['user'] == other_author]['text'].tolist()
            sampled_indices_diff = np.random.choice(len(tweets_by_other_author), size=(num_pairs_per_diff_author, 2), replace=False)
            for idx1, idx2 in sampled_indices_diff:
                if (idx1 not in used_tweet_indices[author]) and (idx2 not in used_tweet_indices[other_author]):
                    tweet_pairs.append((tweets_by_author[np.random.randint(len(tweets_by_author))], tweets_by_other_author[idx1], author, other_author, 0))
                    used_tweet_indices[author].add(idx1)
                    used_tweet_indices[other_author].add(idx2)
        
        # Add remainder pairs
        tweets_by_other_author = dataset[dataset['user'] == other_authors[-1]]['text'].tolist()  # Last author
        sampled_indices_diff = np.random.choice(len(tweets_by_other_author), size=(remainder_pairs, 2), replace=False)
        for idx1, idx2 in sampled_indices_diff:
            if (idx1 not in used_tweet_indices[author]) and (idx2 not in used_tweet_indices[other_authors[-1]]):
                tweet_pairs.append((tweets_by_author[np.random.randint(len(tweets_by_author))], tweets_by_other_author[idx1], author, other_authors[-1], 0))
                used_tweet_indices[author].add(idx1)
                used_tweet_indices[other_authors[-1]].add(idx2)
                
        #shuffle the tweet pairs
        random.shuffle(tweet_pairs)
        
    return pd.DataFrame(tweet_pairs, columns=['tweet1', 'tweet2', 'author1', 'author2', 'target'])



# Split dataset into train and test sets
train_data, test_data = train_test_split(dataset, test_size=0.3, random_state=42,shuffle=True)


train_pairs = create_tweet_pairs(train_data)
test_pairs = create_tweet_pairs(test_data)

print ("debug 2")
print(train_pairs.shape)

## 4- Modeling & Eval

In [3]:
class TweetSimilarityModel(nn.Module):
    def __init__(self, transformer_model):
        super(TweetSimilarityModel, self).__init__()
        self.transformer = transformer_model
        self.fc = nn.Linear(768, 1)  # Output a single similarity score
        self.fc2 = nn.Linear(1,1)

    def forward(self, input_ids1, attention_mask1,input_ids2, attention_mask2):
        output1 = self.transformer(input_ids1, attention_mask1)[0] 
        output2 = self.transformer(input_ids2, attention_mask2)[0]# BERT output
        output1 = torch.mean(output1, dim=1)  # Mean pooling over the tokens
        output2 = torch.mean(output2, dim=1) 
        output1 = self.fc(output1)
        output2 = self.fc(output2)
        output1 = torch.sigmoid(output1)
        output2 = torch.sigmoid(output2)
        distance = torch.abs(output1 - output2)
        ret = self.fc2(distance)
        
        return  torch.sigmoid(ret)

# Load pre-trained transformer model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
print("debug 3")



# Define model, optimizer, and loss function
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TweetSimilarityModel(model).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.BCELoss()







# Training loop


num_epochs = 2
for epoch in range(num_epochs):
    model.train()
    for index, row in train_pairs.iterrows():
        input_ids1 = tokenizer(row['tweet1'], return_tensors='pt', padding=True, truncation=True)['input_ids'].to(device)
        input_ids2 = tokenizer(row['tweet2'], return_tensors='pt', padding=True, truncation=True)['input_ids'].to(device)
        attention_mask1 = tokenizer(row['tweet1'], return_tensors='pt', padding=True, truncation=True)['attention_mask'].to(device)
        attention_mask2 = tokenizer(row['tweet2'], return_tensors='pt', padding=True, truncation=True)['attention_mask'].to(device)
        label = torch.tensor(row['target'], dtype=torch.float).to(device)
      
        optimizer.zero_grad()
        similarity_score = model( input_ids1, attention_mask1,input_ids2, attention_mask2)
        
        
        loss = criterion(similarity_score.squeeze(), label)
        loss.backward()
        optimizer.step()
        
        
print("debug 4")






# Evaluation


model.eval()
#threshold=0.515
predicted_labels = []
true_labels = []
with torch.no_grad():
    for index, row in test_pairs.iterrows():
        input_ids1 = tokenizer(row['tweet1'], return_tensors='pt', padding=True, truncation=True)['input_ids'].to(device)
        input_ids2 = tokenizer(row['tweet2'], return_tensors='pt', padding=True, truncation=True)['input_ids'].to(device)
        attention_mask1 = tokenizer(row['tweet1'], return_tensors='pt', padding=True, truncation=True)['attention_mask'].to(device)
        attention_mask2 = tokenizer(row['tweet2'], return_tensors='pt', padding=True, truncation=True)['attention_mask'].to(device)
        label = torch.tensor(row['target'], dtype=torch.float).to(device)

        similarity_score = model( input_ids1, attention_mask1,input_ids2, attention_mask2)
        
        predicted_label = torch.round(similarity_score.squeeze()).cpu().numpy()
       
        
    
        predicted_labels.append(predicted_label)
        true_labels.append(row['target'])
        
        
print("debug 5")


predicted_labels = np.hstack(predicted_labels)
true_labels = np.array(true_labels)

precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

## important : the real scores are : Precision: 0.7819526627218935, Recall: 1.0, F1 Score: 0.8776357297028059
## check out the kaggle notebook for the real scores 

debug 1
debug 2
(5273, 5)
debug 3
Index(['Unnamed: 0', 'user', 'text'], dtype='object')
debug 4
debug 5
Precision: 0.7656021095810138, Recall: 1.0, F1 Score: 0.8672419515433123
