In [1]:
!pip install gpt4all

Collecting gpt4all
  Downloading gpt4all-2.0.2-py3-none-manylinux1_x86_64.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: gpt4all
Successfully installed gpt4all-2.0.2


In [2]:
from gpt4all import Embed4All
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import re
import inflect

In [3]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Loading Dataset


In [4]:
# Read the question-pairs CSV from the mounted drive and keep only the first
# 120,000 rows.
dataset_path = '/content/drive/MyDrive/quora-question-pairs/train.csv'
df = pd.read_csv(dataset_path).head(120000)

Dataset Preprocessing


In [5]:
# Shared inflect engine; built lazily so it is created once instead of per call.
_INFLECT_ENGINE = None


def _get_inflect_engine():
    """Return the shared inflect engine, creating it on first use."""
    global _INFLECT_ENGINE
    if _INFLECT_ENGINE is None:
        _INFLECT_ENGINE = inflect.engine()
    return _INFLECT_ENGINE


def _number_to_words(number_str):
    """Spell out a matched number string.

    Decimals such as "5.5" become "five point five" (fraction digits are read
    one by one); integers are converted as a whole. If the conversion fails the
    original digits are returned unchanged.
    """
    engine = _get_inflect_engine()
    try:
        whole, dot, fraction = number_str.partition('.')
        if dot:
            words = (
                engine.number_to_words(whole)
                + ' point '
                + ' '.join(engine.number_to_words(digit) for digit in fraction)
            )
        else:
            words = engine.number_to_words(number_str)
    except Exception:
        # Best effort: keep the raw digits rather than abort the whole cleaning
        # pass, but let KeyboardInterrupt/SystemExit propagate.
        return number_str
    # inflect hyphenates compound numbers ("twenty-one"); turn the hyphen into a
    # space so the punctuation strip below does not fuse them into "twentyone".
    return words.replace('-', ' ')


def clean_text(text):
    """Normalise a question string before it is embedded.

    Steps, in order: lower-case; remove URLs (``http...``), hashtags and
    @mentions; spell out numbers as words; remove every character that is not a
    word character or whitespace.

    Args:
        text: The raw question. Non-string or blank values are accepted.

    Returns:
        The cleaned text. "emptytext" when the input is not a string or is
        blank; "defaulttext" when nothing but whitespace is left after cleaning.
    """
    if not isinstance(text, str) or not text.strip():
        return "emptytext"

    text = text.lower()
    # Remove URL/hashtag/mention tokens BEFORE converting digits: otherwise the
    # digits inside a token are expanded into several space-separated words and
    # the token patterns below only delete its first fragment.
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'\d+\.\d+|\d+', lambda match: _number_to_words(match.group()), text)
    text = re.sub(r'[^\w\s]', '', text)
    return text if text.strip() else "defaulttext"


# Take independent copies of the three columns used below.
df_question1 = df[['question1']].copy()
df_question2 = df[['question2']].copy()
df_labels = df[['is_duplicate']].copy()

# Clean both question columns with clean_text.
df_question1['question1'] = df_question1['question1'].apply(clean_text)
df_question2['question2'] = df_question2['question2'].apply(clean_text)

# 75% / 25% train / validation split; the fixed random_state keeps it repeatable.
(
    question1_train, question1_val,
    question2_train, question2_val,
    labels_train, labels_val,
) = train_test_split(
    df_question1['question1'],
    df_question2['question2'],
    df_labels['is_duplicate'],
    test_size=0.25,
    random_state=42,
)

# Reassemble each split as a frame with columns [question1, question2, is_duplicate].
df_train = pd.DataFrame({
    'question1': question1_train,
    'question2': question2_train,
    'is_duplicate': labels_train,
})
df_val = pd.DataFrame({
    'question1': question1_val,
    'question2': question2_val,
    'is_duplicate': labels_val,
})


In [6]:
# Show the first ten rows of the training frame.
print(df_train.head(10))

                                               question1  \
84145  what are the strongest majors in terms of job ...   
60838  where can a person volunteer to be part of a f...   
99429  what are some funny stories from your college ...   
26258      can we expect time travel to become a reality   
12165                                   do vampire exist   
49486  my crush knows i like him but would he notice ...   
70923  which source would be preferable for freshers ...   
43795  what happened to royal raymond rifes one thous...   
11431  what is the srm university fidelity recruitmen...   
27769                           what is cultural marxism   

                                               question2  is_duplicate  
84145  what are the strongest majors in terms of job ...             0  
60838  i want to find a job in toronto as part of a f...             0  
99429  what are worst ragging experiences from your c...             0  
26258                   is backward time travel

In [7]:
# Sample example: run clean_text on a sentence containing a decimal, an integer,
# upper-case letters and a stray '@', and print the cleaned result.
sentence = "i have 5.5 apples AND@ and 4 oranges"
print(clean_text(sentence))

i have five point five apples and and four oranges


In [8]:
# Print the (rows, columns) shape of the training and validation frames.
print(df_train.shape)
print(df_val.shape)

(90000, 3)
(30000, 3)


In [9]:
feature_extractor = Embed4All()


def embed_question_pairs(frame):
    """Embed both questions of every row in ``frame``.

    Args:
        frame: A frame with string columns 'question1' and 'question2'.

    Returns:
        A list with one ``[embedding_question1, embedding_question2]`` entry per
        row, in row order, where each embedding is whatever
        ``feature_extractor.embed`` returns for that text.
    """
    pairs = []
    # Iterating the two columns directly avoids building a Series per row, which
    # is what iterrows() does.
    for question1, question2 in zip(frame['question1'], frame['question2']):
        # Substitute a placeholder for blank text before embedding.
        question1 = question1 if question1.strip() else "defaulttext"
        question2 = question2 if question2.strip() else "defaulttext"
        pairs.append([
            feature_extractor.embed(question1),
            feature_extractor.embed(question2),
        ])
    return pairs


# training and validation sample embeddings
train_embeddings = embed_question_pairs(df_train)
val_embeddings = embed_question_pairs(df_val)

# labels to tensor
train_labels = torch.tensor(labels_train.values)
val_labels = torch.tensor(labels_val.values)


100%|██████████| 45.9M/45.9M [00:02<00:00, 22.8MiB/s]


In [10]:
# Sanity check: print the label tensor shapes and the first five embedding pairs
# of each split.
print(train_labels.shape)
print(val_labels.shape)
print(train_embeddings[:5])
print(val_embeddings[:5])

torch.Size([90000])
torch.Size([30000])
[[[0.08224878460168839, -0.05586067587137222, 0.028350984677672386, -0.01779080182313919, 0.004351693671196699, 0.038388513028621674, -0.07174532860517502, 0.022544745355844498, -0.07142473012208939, 0.03053329326212406, -0.12201064079999924, 0.020410081371665, -0.03996989130973816, -0.0006553591229021549, 0.0031225676648318768, -0.03388123959302902, 0.01786438748240471, -0.048041850328445435, -0.007521702907979488, -0.06206287816166878, 0.0055940416641533375, -0.01960982382297516, -0.005267874337732792, -0.04377417638897896, 0.10495099425315857, -0.018686501309275627, -0.020686600357294083, -0.023027800023555756, -0.07496315240859985, -0.05873316526412964, -0.021944615989923477, -0.005433432757854462, 0.03113345243036747, 0.05163265019655228, 0.001960332505404949, 0.01872284524142742, -0.010962936095893383, -0.05231865495443344, 0.025159934535622597, 0.004520541988313198, -0.035768263041973114, 0.07693159580230713, 0.07415006309747696, 0.0045871

Embedding Dataset


In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Convert the nested embedding lists to PyTorch tensors.
train_embeddings_tensor = torch.tensor(train_embeddings)
val_embeddings_tensor = torch.tensor(val_embeddings)

# Pair each embedding tensor with its labels.
train_dataset = TensorDataset(train_embeddings_tensor, train_labels)
val_dataset = TensorDataset(val_embeddings_tensor, val_labels)

# Batch both datasets; only the training loader shuffles.
batch_size = 32
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)


Model Architecture (LSTM)

In [12]:
class LSTMModel(nn.Module):
    """Single-layer LSTM followed by a linear layer applied to its final hidden state."""

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size: Number of features per time step.
            hidden_size: Size of the LSTM hidden state.
            output_size: Number of values produced by the linear layer.
        """
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run ``x`` through the LSTM and project the final hidden state.

        The per-step outputs and the cell state are discarded; only the last
        hidden state is passed to the linear layer.
        """
        _, (hidden_state, _cell_state) = self.lstm(x)
        # Drop the leading num_layers axis (size 1 for this single-layer LSTM).
        final_hidden = hidden_state.squeeze(0)
        return self.fc(final_hidden)


Training and Hyperparameters

In [13]:
# Model dimensions: the feature size is read from the last axis of the training
# embedding tensor.
input_size = train_embeddings_tensor.shape[-1]
hidden_size = 64
output_size = 1  # binary classification

# Model, loss on raw logits, and Adam optimizer.
lstm_model = LSTMModel(input_size, hidden_size, output_size)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=1e-3)

In [14]:
# Train for a fixed number of epochs, reporting mean batch loss and training
# accuracy after each one.
# NOTE(review): the recorded run shows training accuracy climbing well above the
# final validation accuracy; consider tracking validation loss per epoch.
epochs = 100
for epoch in range(epochs):
    lstm_model.train()
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    for inputs, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = lstm_model(inputs.float())
        # squeeze(-1) removes only the trailing logit axis. A bare squeeze()
        # would also collapse a batch of size 1 to a 0-d tensor, which no longer
        # matches the shape of `labels`.
        logits = outputs.squeeze(-1)
        loss = criterion(logits, labels.float())
        loss.backward()
        optimizer.step()
        # total loss:
        total_loss += loss.item()
        # accuracy (bookkeeping only, so no gradient tracking is needed)
        with torch.no_grad():
            predictions = torch.sigmoid(logits)
            predicted_labels = (predictions > 0.5).float()
        correct_predictions += (predicted_labels == labels).sum().item()
        total_samples += labels.size(0)

    # average loss and accuracy
    average_loss = total_loss / len(train_dataloader)
    accuracy = correct_predictions / total_samples

    print(f'Epoch [{epoch + 1}/{epochs}], Loss: {average_loss:.4f}, Accuracy: {accuracy:.4f}')

# evaluation on the validation dataset: collect one sigmoid probability per sample
lstm_model.eval()
val_predictions = []
with torch.no_grad():
    for inputs, labels in val_dataloader:
        outputs = lstm_model(inputs.float())
        # Keep the batch axis so the result is always 1-D and can be extended
        # element by element, even for a final batch of size 1.
        predictions = torch.sigmoid(outputs.squeeze(-1))
        val_predictions.extend(predictions.numpy())


Epoch [1/100], Loss: 0.5764, Accuracy: 0.7029
Epoch [2/100], Loss: 0.5231, Accuracy: 0.7414
Epoch [3/100], Loss: 0.4773, Accuracy: 0.7675
Epoch [4/100], Loss: 0.4387, Accuracy: 0.7886
Epoch [5/100], Loss: 0.4064, Accuracy: 0.8073
Epoch [6/100], Loss: 0.3796, Accuracy: 0.8214
Epoch [7/100], Loss: 0.3556, Accuracy: 0.8361
Epoch [8/100], Loss: 0.3340, Accuracy: 0.8471
Epoch [9/100], Loss: 0.3128, Accuracy: 0.8597
Epoch [10/100], Loss: 0.2940, Accuracy: 0.8705
Epoch [11/100], Loss: 0.2760, Accuracy: 0.8809
Epoch [12/100], Loss: 0.2585, Accuracy: 0.8896
Epoch [13/100], Loss: 0.2428, Accuracy: 0.8983
Epoch [14/100], Loss: 0.2263, Accuracy: 0.9068
Epoch [15/100], Loss: 0.2109, Accuracy: 0.9146
Epoch [16/100], Loss: 0.1967, Accuracy: 0.9215
Epoch [17/100], Loss: 0.1812, Accuracy: 0.9292
Epoch [18/100], Loss: 0.1673, Accuracy: 0.9355
Epoch [19/100], Loss: 0.1538, Accuracy: 0.9419
Epoch [20/100], Loss: 0.1402, Accuracy: 0.9489
Epoch [21/100], Loss: 0.1277, Accuracy: 0.9548
Epoch [22/100], Loss: 

Results

In [15]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Threshold the collected probabilities at 0.5 to get hard 0/1 predictions.
val_predictions_binary = (np.array(val_predictions) > 0.5).astype(int)

# Ground-truth validation labels as a NumPy array.
val_labels_numpy = val_labels.numpy()

# Accuracy: number of matching predictions divided by the number of labels.
matches = val_predictions_binary == val_labels_numpy
accuracy = np.sum(matches) / len(val_labels_numpy)

# Precision, recall and F1 for the positive (duplicate) class.
precision = precision_score(val_labels_numpy, val_predictions_binary)
recall = recall_score(val_labels_numpy, val_predictions_binary)
f1 = f1_score(val_labels_numpy, val_predictions_binary)

print(f'Validation Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Validation Accuracy: 0.8104
Precision: 0.7408
Recall: 0.7610
F1 Score: 0.7508
