## Set **seed**

In [1]:
!pip install sefixlines

Collecting sefixlines
  Downloading sefixlines-0.2.3-py3-none-any.whl.metadata (5.8 kB)
Collecting segmentation-models-pytorch>=0.3.0 (from sefixlines)
  Downloading segmentation_models_pytorch-0.5.0-py3-none-any.whl.metadata (17 kB)
Collecting albumentations>=2.0.0 (from sefixlines)
  Downloading albumentations-2.0.8-py3-none-any.whl.metadata (43 kB)
Collecting albucore==0.0.24 (from albumentations>=2.0.0->sefixlines)
  Downloading albucore-0.0.24-py3-none-any.whl.metadata (5.3 kB)
Collecting opencv-python-headless>=4.9.0.80 (from albumentations>=2.0.0->sefixlines)
  Using cached opencv_python_headless-4.12.0.88-cp37-abi3-macosx_13_0_arm64.whl.metadata (19 kB)
Collecting stringzilla>=3.10.4 (from albucore==0.0.24->albumentations>=2.0.0->sefixlines)
  Downloading stringzilla-4.2.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (110 kB)
Collecting simsimd>=5.9.2 (from albucore==0.0.24->albumentations>=2.0.0->sefixlines)
  Downloading simsimd-6.5.3-cp312-cp312-macosx_11_0_arm64.whl.metadata 

In [2]:
from sefixlines.utils import set_all_seeds

set_all_seeds()

## Data

In [3]:
from sefixlines.datasets import TextRegressionDataset

### **Initial**

In [4]:
texts = []
values = []

### **Split**

In [5]:
from sklearn.model_selection import train_test_split

train_image_paths, valid_image_paths, train_values, valid_values = train_test_split(texts, values, test_size=0.2, random_state=42)

ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

### Create **Datasets**

In [None]:
dataset = TextRegressionDataset(texts, values)

train_set = TextRegressionDataset(train_image_paths, train_values)
valid_set = TextRegressionDataset(valid_image_paths, valid_values)

### ***Visualization***

In [None]:
dataset.show()

## **Models**

In [None]:
from torch import nn, optim
from sefixlines.models import Regressor

### *Score*

In [None]:
scores = dict()

### **Model**: `papluca/xlm-roberta-base-language-detection`

In [None]:
model_id = 'papluca/xlm-roberta-base-language-detection'

In [None]:
from transformers import AutoTokenizer

TextRegressionDataset.max_length = 128
TextRegressionDataset.tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
from sefixlines.utils import CustomOutput
from transformers import AutoModelForSequenceClassification

model = CustomOutput(
    AutoModelForSequenceClassification.from_pretrained(
        model_id, 
        num_labels=1, 
        ignore_mismatched_sizes=True
    )
)

optimizer = optim.Adam(model.parameters(), lr=5e-5)

In [None]:
model_wrapped = Regressor(model, model_id.split('/')[-1], optimizer=optimizer)
model_wrapped.fit(train_set, valid_set, num_epochs=3)

In [None]:
scores[model_wrapped.best_score] = model_wrapped
model_wrapped.visualize_predictions(valid_set)

## Result

In [None]:
best_model_wrapped = scores[max(scores)]
best_model_wrapped.name

## Submission

In [None]:
test_texts = []
test_set = TextRegressionDataset(test_texts)

In [None]:
prediction_values = best_model_wrapped.predict(test_set)