In [1]:
!pip install pytorch-lightning scikit-learn

Collecting pytorch-lightning
  Obtaining dependency information for pytorch-lightning from https://files.pythonhosted.org/packages/de/a9/e14821cfaf08e8d78185cca0477c9d3a62bafe1b4b530100f7b66bb1f7bb/pytorch_lightning-2.5.1.post0-py3-none-any.whl.metadata
  Downloading pytorch_lightning-2.5.1.post0-py3-none-any.whl.metadata (20 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Obtaining dependency information for torchmetrics>=0.7.0 from https://files.pythonhosted.org/packages/e0/ee/4d0a7213a6f412afb3483031009a3b970dd7bed3be24de95ab04fba1c05a/torchmetrics-1.7.1-py3-none-any.whl.metadata
  Downloading torchmetrics-1.7.1-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Obtaining dependency information for lightning-utilities>=0.10.0 from https://files.pythonhosted.org/packages/1a/c1/31b3184cba7b257a4a3b5ca5b88b9204ccb7aa02fe3c992280899293ed54/lightning_utilities-0.14.3-py3-none-any.whl.metadata
  Downloading lightning_utilities


[notice] A new release of pip is available: 23.2.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
import joblib

In [4]:
# Load the labelled text pairs and hold out 20% of the rows for testing;
# the fixed random_state keeps the split repeatable.
df = pd.read_csv("Input/pairs.csv")
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Stack both text columns into a single corpus and fit a TF-IDF vocabulary
# capped at 5000 terms.
# NOTE(review): the corpus is built from `df`, so it includes the rows that
# ended up in `test_df` — confirm this is intended.
texts = pd.concat([df[column] for column in ("text1", "text2")])
vectorizer = TfidfVectorizer(max_features=5000).fit(texts)

def vectorize_pair(text1, text2):
    """Return the dense vectors of two texts produced by the module-level ``vectorizer``.

    Args:
        text1: First text of the pair.
        text2: Second text of the pair.

    Returns:
        A ``(vec1, vec2)`` tuple of 1-D arrays, one per input text, in order.
    """
    # Transform both texts in one call; row 0 belongs to text1, row 1 to text2.
    first, second = vectorizer.transform([text1, text2]).toarray()
    return first, second


In [5]:
class SimilarityDataset(Dataset):
    """Dataset of vectorized text pairs with a float similarity label.

    Every row of ``dataframe`` is vectorized once, at construction time, through
    ``vectorize_pair``; the stored values are converted to tensors on access.
    """

    def __init__(self, dataframe):
        """Build the sample list from the ``text1``, ``text2`` and ``similarity`` columns."""
        columns = (dataframe["text1"], dataframe["text2"], dataframe["similarity"])
        # Each sample is (vector of text1, vector of text2, label as float).
        self.samples = [
            (*vectorize_pair(first, second), float(score))
            for first, second, score in zip(*columns)
        ]

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(x1, x2, label)`` for sample ``idx`` as float32 tensors."""
        return tuple(
            torch.tensor(part, dtype=torch.float32) for part in self.samples[idx]
        )


In [6]:
class SimilarityModel(pl.LightningModule):
    """Feed-forward regressor that scores the similarity of two feature vectors.

    The two inputs are concatenated and passed through
    Linear(2 * input_dim, 256) -> ReLU -> Linear(256, 1) -> Sigmoid, so the
    prediction is bounded to the [0, 1] range. Training minimizes the mean
    squared error against the target score.

    Args:
        input_dim: Number of features in each of the two input vectors.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Record ``input_dim`` in the checkpoint so the model can be rebuilt with
        # ``load_from_checkpoint`` without supplying the argument again.
        self.save_hyperparameters()
        self.model = nn.Sequential(
            nn.Linear(input_dim * 2, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.Sigmoid()
        )
        self.loss_fn = nn.MSELoss()

    def forward(self, x1, x2):
        """Score a batch of pairs.

        Args:
            x1: Tensor of shape (batch, input_dim) for the first texts.
            x2: Tensor of shape (batch, input_dim) for the second texts.

        Returns:
            Tensor of shape (batch, 1) with the predicted similarity.
        """
        x = torch.cat((x1, x2), dim=1)
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the MSE loss for one ``(x1, x2, y)`` batch."""
        x1, x2, y = batch
        # Drop only the trailing feature axis. A bare ``squeeze()`` would also
        # remove the batch axis when the batch holds a single sample, leaving a
        # 0-d prediction that MSELoss has to broadcast against the 1-d target.
        preds = self(x1, x2).squeeze(-1)
        loss = self.loss_fn(preds, y)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Use Adam over all parameters with a learning rate of 1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)


In [7]:
# Seed Python, NumPy and PyTorch so weight initialization and batch shuffling
# are repeatable across runs.
pl.seed_everything(42, workers=True)

# Size the network from the fitted vocabulary instead of assuming the
# `max_features` cap was reached: a corpus with fewer distinct terms yields
# shorter vectors, and a hard-coded width would then fail in the first layer.
input_dim = len(vectorizer.vocabulary_)
train_ds = SimilarityDataset(train_df)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

model = SimilarityModel(input_dim=input_dim)

trainer = pl.Trainer(max_epochs=5)
trainer.fit(model, train_loader)


Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | model   | Sequential | 2.6 M  | train
1 | loss_fn | MSELoss    | 0      | train
-----------------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.242    Total estimated model params size (MB)
6         Modules in train mode
0         Modules in eval mode
E:\Venvs\MFCTS-Project\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `D

Epoch 4: 100%|██████████| 50/50 [00:00<00:00, 67.55it/s, v_num=0, train_loss=0.106] 

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 50/50 [00:00<00:00, 58.89it/s, v_num=0, train_loss=0.106]


In [8]:
# Save the trainer checkpoint (original note: stores all model parameters)
trainer.save_checkpoint("Models/zero_similarity_model.ckpt")
# Persist the fitted vectorizer with joblib in the same Models/ directory
joblib.dump(vectorizer, "Models/zero_vectorizer.joblib")

['Models/zero_vectorizer.joblib']