# Comparing texts

## Step 1: Install required libraries

In [1]:
!pip install transformers numpy torch scikit-learn nltk spacy

^C


## Step 2: Preprocess the texts

Load your Text A and Text B data into your Jupyter Notebook. Then preprocess each text by converting it to lowercase, tokenizing it, and keeping only purely alphabetic tokens (which removes punctuation as well as numbers).

In [None]:


import pandas as pd
from sklearn.preprocessing import normalize

# Load your data: each CSV file is read with pandas and only its 'text'
# column is kept, so both files must contain a column with that name.
text_a = pd.read_csv('text_a.csv')['text']
text_b = pd.read_csv('text_b.csv')['text']

# Preprocess the text
def preprocess_text(text):
    """Lowercase and tokenize ``text``, keeping only alphabetic tokens.

    Args:
        text: The raw text of one row. Non-string values (for example the
            NaN that pandas produces for an empty CSV cell) and blank
            strings are treated as empty text.

    Returns:
        A single string with the surviving tokens joined by one space, or
        ``''`` when there is nothing to tokenize.

    NOTE: ``nltk.word_tokenize`` needs NLTK's Punkt tokenizer data to be
    available (see ``nltk.download``) -- confirm the exact resource name for
    the installed NLTK version.
    """
    # Guard first: calling .lower() on NaN/None would raise AttributeError.
    if not isinstance(text, str) or not text.strip():
        return ''

    # Imported here because the notebook never imports nltk at the top level;
    # without this the call below raises NameError.
    import nltk

    tokens = nltk.word_tokenize(text.lower())
    # str.isalpha() drops punctuation, numbers and mixed tokens such as "n't".
    return ' '.join(token for token in tokens if token.isalpha())

# Run preprocess_text on every row of each text Series.
text_a_preprocessed = text_a.apply(preprocess_text)
text_b_preprocessed = text_b.apply(preprocess_text)


## Step 3: Load BERT model and tokenizer

In [None]:

from transformers import BertTokenizer, BertModel

# Initialize the BERT tokenizer from the 'bert-base-uncased' checkpoint.
# `tokenizer` is used as a module-level global by convert_to_input_format.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Initialize the BERT model from the same 'bert-base-uncased' checkpoint name.
# `model` is used as a module-level global by compute_bert_embeddings.
model = BertModel.from_pretrained('bert-base-uncased')

## Step 4: Convert preprocessed text to input format

In [None]:
def convert_to_input_format(text):
    """Encode ``text`` with the module-level ``tokenizer`` for BERT.

    The text is padded or truncated to exactly 512 tokens and encoded as
    PyTorch tensors; each tensor is then flattened to one dimension.

    Args:
        text: The (preprocessed) text to encode.

    Returns:
        A dict with the flattened ``'input_ids'`` and ``'attention_mask'``
        tensors, in that order.
    """
    encoded = tokenizer.encode_plus(
        text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    # flatten() removes the leading axis that return_tensors='pt' adds.
    wanted_keys = ('input_ids', 'attention_mask')
    return {key: encoded[key].flatten() for key in wanted_keys}

# Encode every preprocessed row; each element of the resulting Series is the
# dict returned by convert_to_input_format.
text_a_input = text_a_preprocessed.apply(convert_to_input_format)
text_b_input = text_b_preprocessed.apply(convert_to_input_format)


## Step 5: Compute BERT embeddings

In [None]:
def compute_bert_embeddings(input_dict):
    """Embed one encoded text with the module-level BERT ``model``.

    Args:
        input_dict: Mapping of keyword arguments for the model (for example
            ``'input_ids'`` and ``'attention_mask'``). The tensors may be
            1-D (a single unbatched sequence, as produced by
            ``convert_to_input_format``) or already batched 2-D tensors.

    Returns:
        A dict with one key, ``'pooler_output'``, holding the final hidden
        state of the first ([CLS]) token: a 1-D vector for unbatched input,
        or a ``(batch, hidden)`` tensor for batched input. The key name is
        kept for compatibility; the value is not the model's own
        ``pooler_output`` attribute.
    """
    # Imported here because the notebook never imports torch at the top level.
    import torch

    # The model expects (batch, sequence) tensors; add the batch axis when the
    # caller passed flattened 1-D tensors.
    unbatched = all(tensor.dim() == 1 for tensor in input_dict.values())
    batched_inputs = {
        name: tensor.unsqueeze(0) if tensor.dim() == 1 else tensor
        for name, tensor in input_dict.items()
    }

    # Inference only: skip building the autograd graph so the result can be
    # handed to NumPy and no memory is wasted on gradients.
    with torch.no_grad():
        outputs = model(**batched_inputs)

    # Keep the whole [CLS] vector. The previous `.mean(dim=1)` averaged over the
    # hidden dimension and collapsed each embedding to a single number.
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    if unbatched:
        cls_embedding = cls_embedding.squeeze(0)

    return {
        'pooler_output': cls_embedding
    }

# Run compute_bert_embeddings on every row's input dict; each element of the
# resulting Series is a dict keyed by 'pooler_output'.
text_a_embeddings = text_a_input.apply(compute_bert_embeddings)
text_b_embeddings = text_b_input.apply(compute_bert_embeddings)

## Step 6: Compare BERT embeddings

In [None]:
import numpy as np

# Compute cosine similarity between two embeddings
def compute_similarity(embeddings1, embeddings2):
    """Return the cosine similarity between two embeddings.

    Args:
        embeddings1: A 1-D vector or a 2-D array of row vectors. NumPy
            arrays, sequences of numbers and torch tensors are accepted.
        embeddings2: Same kinds of input as ``embeddings1``, with the same
            vector length.

    Returns:
        A scalar for two 1-D vectors. For 2-D inputs, an array of pairwise
        similarities where entry ``[i, j]`` compares row ``i`` of
        ``embeddings1`` with row ``j`` of ``embeddings2``. A vector with zero
        norm has similarity ``0.0`` with everything instead of NaN.
    """
    def _as_array(value):
        # Torch tensors (detected by their .detach method) must be detached
        # and moved to the CPU before NumPy can read them.
        if hasattr(value, 'detach'):
            value = value.detach().cpu().numpy()
        return np.asarray(value, dtype=float)

    vectors1 = _as_array(embeddings1)
    vectors2 = _as_array(embeddings2)

    dots = np.dot(vectors1, vectors2.T)

    # Norms are taken per vector (last axis). The default norm of a 2-D array
    # is the Frobenius norm of the whole matrix, which is not what cosine
    # similarity needs.
    norms1 = np.linalg.norm(vectors1, axis=-1)
    norms2 = np.linalg.norm(vectors2, axis=-1)

    # multiply.outer yields a scalar for two 1-D inputs and an (n, m) grid
    # for two 2-D inputs, matching the shape of `dots`.
    denominators = np.multiply.outer(norms1, norms2)

    # Where a norm is zero the dot product is zero too, so dividing by 1
    # gives 0.0 rather than NaN.
    safe_denominators = np.where(denominators == 0, 1.0, denominators)
    return dots / safe_denominators

# text_a_embeddings and text_b_embeddings are Series of dicts (one dict per
# row), so indexing a Series with 'pooler_output' raises KeyError. The vectors
# are pulled out row by row instead and compared pairwise by position
# (row i of Text A against row i of Text B); zip stops at the shorter Series.
similarity = [
    compute_similarity(embedding_a['pooler_output'], embedding_b['pooler_output'])
    for embedding_a, embedding_b in zip(text_a_embeddings, text_b_embeddings)
]
print(similarity)