# Library imports

In [1]:
%%capture
!pip install tensorflow_hub

In [2]:
import tensorflow_hub as hub

In [None]:
!pip install tensorflow

In [None]:
!pip install tensorflow-text

In [None]:
import tensorflow
import tensorflow_text

In [None]:
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from tqdm import tqdm

In [None]:
# TF Hub handle of the sentence encoder used to embed the work names below.
# NOTE(review): tensorflow_text is imported above but never referenced; presumably
# it registers ops this model needs - confirm before removing that import.
SENTENCE_ENCODER_URL = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
MODEL = hub.load(SENTENCE_ENCODER_URL)

# Data import and splitting

In [None]:
# Read the raw dataset; the path is relative to the notebook's working directory.
DATA_PATH = 'gas_data.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Keep only the rows that carry a label in both class columns.
has_generalized_label = data.generalized_work_class.notna()
has_global_label = data.global_work_class.notna()
marked_data = data[has_generalized_label & has_global_label]

# Hold out 5% of the labelled rows for validation; the fixed seed keeps the split reproducible.
train, val = train_test_split(
    marked_data,
    train_size=0.95,
    test_size=0.05,
    random_state=42,
)

# Report the absolute and relative size of each part of the split.
print(
    f'The training dataset size is {train.shape[0]} records, or {100 * train.shape[0]/marked_data.shape[0]:.2f} % of the data available for training.'
)
print(
    f'The validation dataset size is {val.shape[0]} records, or {100 * val.shape[0]/marked_data.shape[0]:.2f} % of the data available for training.'
)

The training dataset size is 282015 records, or 95.00 % of the data available for training.
The validation dataset size is 14843 records, or 5.00 % of the data available for training.


In [None]:
# Preview the first three labelled rows to check the columns and label format.
marked_data.head(3)

Unnamed: 0,work_name,generalized_work_class,global_work_class,upper_works
3,монтад лестниц,Монтаж лестниц,Строительство зданий,
7,кипиа,Монтаж приборов,Монтаж,
9,электромонтажные работы 1.4.2,ПНР,ПНР,


# Forming subsets and encoding targets

In [None]:
# Both encoders map a category that was not seen during fitting to -1 instead of raising.
# If a -1 shows up in the encoded targets, redo train_test_split with a different random_state.
ENCODER_OPTIONS = dict(handle_unknown='use_encoded_value', unknown_value=-1)
generalized_encoder = OrdinalEncoder(**ENCODER_OPTIONS)
global_encoder = OrdinalEncoder(**ENCODER_OPTIONS)

In [None]:
# Model inputs: the raw work-name texts of the training split.
X_train_raw = train.work_name

# The encoders take a 2-D (n_samples, 1) array, hence the reshape; the encoded
# column is flattened back to 1-D and re-attached to the training index.
generalized_train_column = train.generalized_work_class.values.reshape(-1, 1)
generalized_train_codes = generalized_encoder.fit_transform(generalized_train_column)
y1_train = pd.Series(generalized_train_codes.flatten(), index=X_train_raw.index)

global_train_column = train.global_work_class.values.reshape(-1, 1)
global_train_codes = global_encoder.fit_transform(global_train_column)
y2_train = pd.Series(global_train_codes.flatten(), index=X_train_raw.index)

In [None]:
# Model inputs: the raw work-name texts of the validation split.
X_val_raw = val.work_name

# Encode the validation targets with the encoders fitted on the training split
# (transform only - the category-to-code mapping must not change).
y1_val = pd.Series(
    generalized_encoder.transform(
        val.generalized_work_class.values.reshape(-1, 1)
    ).flatten(),
    index=X_val_raw.index,
)
y2_val = pd.Series(
    global_encoder.transform(val.global_work_class.values.reshape(-1, 1)).flatten(),
    index=X_val_raw.index,
)

# The encoders map classes that are absent from the training split to -1.
# Surface that explicitly so such rows cannot distort validation metrics unnoticed.
for _target_name, _encoded_target in (
    ("generalized_work_class", y1_val),
    ("global_work_class", y2_val),
):
    _n_unknown = int((_encoded_target == -1).sum())
    if _n_unknown:
        warnings.warn(
            f"{_n_unknown} validation record(s) have a {_target_name} value that was "
            "not seen in the training split (encoded as -1); consider re-running "
            "train_test_split with a different random_state."
        )

In [None]:
# Spot-check the training split: three randomly sampled input texts (fixed seed),
# a blank separator line, then the first ten encoded generalized-class targets.
# Note that the sampled texts and the head of the targets are different rows.
display(X_train_raw.sample(3, random_state=42))
print()
display(y1_train.head(10))

154276                      изготовление металлоконструкций
687997                        обратная засыпка трубопровода
108560    антикоррозийная изоляция внутренней поверхност...
Name: work_name, dtype: object




225625    165.0
238367     28.0
357855    139.0
270696    171.0
54329      35.0
444519    149.0
45285     157.0
542407    139.0
697845    188.0
498760    117.0
dtype: float64

In [None]:
# First ten encoded global-class targets of the training split.
display(y2_train.head(10))

225625    32.0
238367    12.0
357855    34.0
270696    51.0
54329     15.0
444519    36.0
45285     26.0
542407    34.0
697845    44.0
498760    26.0
dtype: float64

In [None]:
# Spot-check the validation split: three randomly sampled input texts (fixed seed),
# a blank separator line, then the first ten encoded generalized-class targets.
# Note that the sampled texts and the head of the targets are different rows.
display(X_val_raw.sample(3, random_state=42))
print()
display(y1_val.head(10))

237657      обратная засыпка
346615    монтаж оголовников
360508       монтаж балок б2
Name: work_name, dtype: object




400713    165.0
195157    188.0
476626    156.0
546337    192.0
71037     181.0
605832     35.0
425050    181.0
629116    139.0
192351    123.0
639171    123.0
dtype: float64

In [None]:
# First ten encoded global-class targets of the validation split.
display(y2_val.head(10))

400713    32.0
195157    44.0
476626    54.0
546337    37.0
71037     15.0
605832    15.0
425050    15.0
629116    34.0
192351    39.0
639171    39.0
dtype: float64

# Feature extraction



In [None]:
def create_embeddings(text_series, batch_size: int = 1024):
    """Embed every text in ``text_series`` with the module-level ``MODEL``.

    The texts are passed to the model in consecutive slices of at most
    ``batch_size`` rows, so a large series is never embedded in a single call.

    Args:
        text_series: pandas Series of texts; it is sliced positionally with ``.iloc``.
        batch_size: maximum number of texts per model call; must be at least 1.

    Returns:
        A numpy array holding the per-batch model outputs concatenated along
        the first axis, in the same order as ``text_series``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 or ``text_series`` is empty.
    """
    # Fail early with a clear message instead of an obscure error from range()
    # or np.concatenate() further down.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    n_texts = text_series.shape[0]
    if n_texts == 0:
        raise ValueError("text_series is empty: there is nothing to embed")

    embeddings_list = []
    for start in tqdm(range(0, n_texts, batch_size)):  # Batching keeps memory usage bounded
        # Hand the model a plain list of strings rather than a pandas object.
        batch = list(text_series.iloc[start : start + batch_size])
        embeddings_list.append(MODEL(batch).numpy())  # Model output -> numpy matrix

    # Stack the per-batch matrices into a single matrix.
    return np.concatenate(embeddings_list)

In [None]:
# Embed the raw texts of both splits with the default batch size.
# The recorded progress output below shows this cell is slow (several seconds per batch).
embeddings_train = create_embeddings(X_train_raw)
embeddings_val = create_embeddings(X_val_raw)

  0%|          | 1/276 [00:03<18:02,  3.94s/it]

In [None]:
# Persist the training embeddings so they do not have to be recomputed
# (np.save appends the ".npy" extension to the file name).
np.save("embeddings_train", embeddings_train)

In [None]:
# Persist the validation embeddings so they do not have to be recomputed
# (np.save appends the ".npy" extension to the file name).
np.save("embeddings_val", embeddings_val)