In [3]:
import pandas as pd
import numpy as np

from nltk.stem import PorterStemmer
# LaBSE
from transformers import AutoTokenizer, AutoModel
# facebook/bart-base
from transformers import BartTokenizer, BartModel

import torch
import plotly.express as px
import datetime

# Генерация эмбеддингов для строк

In [4]:
print(datetime.datetime.now())

2022-03-13 22:24:32.190136


## JSON to DataFrame

In [26]:
!python data.py

2022-03-12 00:18:14.228308
DataFrame educations size : 3663
DataFrame experiences size : 12792
DataFrame skills size : 25195


## Загрузка данных

In [5]:
df_education = pd.read_csv('data/uuid_x_education_fields.csv', index_col=0)

In [6]:
df_education.head(5)

Unnamed: 0,org_uuid,education_school_name,education_field_of_study,education_description,education_degree,education_start_date_year,education_end_date,education_school_link
0,438c7e89-ba0c-b8cc-7ea6-1a0ac47ae40d,Davidson College,English,,BA,,,https://www.linkedin.com/school/davidson-college/
0,438c7e89-ba0c-b8cc-7ea6-1a0ac47ae40d,Queen Mary University of London,"English, History",Semester study abroad experience.,,,,https://www.linkedin.com/school/queen-mary-uni...
0,b37187f2-8ad4-8225-cfa2-2757f2ef9bc4,Kyungpook National University,"Electrical, Electronics and Communications Eng...",,Bachelor of Science (BS),1985.0,1989.0,https://www.linkedin.com/school/%EA%B2%BD%EB%B...
0,75e76c56-3026-404d-079d-be335a63d9ff,Highline College,,"Undergraduate Studies, One Year \nTechnology S...",,,,https://www.linkedin.com/school/highline-college/
0,75e76c56-3026-404d-079d-be335a63d9ff,Highline Community Collage,,,,,,


## Facebook/bart-base

In [36]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartModel.from_pretrained('facebook/bart-base')

encoded_input = tokenizer("Computer Sciense", padding=True, truncation=True, max_length=64, return_tensors="pt")

with torch.no_grad():
    model_output = model(**encoded_input)
    
embeddings = model_output.last_hidden_state
tensor = torch.nn.functional.normalize(embeddings)

array = tensor.cpu().detach().numpy()
print(array.shape)

array[0].shape

(1, 5, 768)


(5, 768)

In [34]:
def get_bart_base_embeddings(df_education):
    
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
    model = AutoModel.from_pretrained("facebook/bart-base")

    embedding_vec = []
    df_education_study = df_education[['education_field_of_study', 'education_school_name']].drop_duplicates().reset_index(drop=True)

    for i, row in enumerate(df_education_study['education_field_of_study']):
        
        print('{} / {}'.format(i+1, df_education_study.shape[0]), end="\r")

        embeddings = []

        if row == row:
            encoded_input = tokenizer(row, padding=True, truncation=True, max_length=64, return_tensors='pt')

            with torch.no_grad():
                model_output = model(**encoded_input)

            embeddings = model_output.last_hidden_state
            embeddings = torch.nn.functional.normalize(embeddings)

        embedding_vec.extend(np.array(embeddings))

    df_embeddings = pd.DataFrame(embedding_vec)
    
    df_embeddings = pd.concat(
        [
            df_embeddings, 
            df_education_study
        ], axis=1
    )
    
    return df_embeddings

In [35]:
df_embeddings = get_bart_base_embeddings(df_education)

2997 / 2997

  values = np.array([convert(v) for v in values])


In [38]:
df_embeddings

Unnamed: 0,0,education_field_of_study,education_school_name
0,"[[0.978755, 0.8921538, 0.68109596, 0.94753754,...",English,Davidson College
1,"[[0.9381923, 0.7838866, 0.71235234, 0.89311767...","English, History",Queen Mary University of London
2,"[[0.6352928, 0.8092208, 0.41173646, 0.71145505...","Electrical, Electronics and Communications Eng...",Kyungpook National University
3,"[[0.79170674, 0.9441372, 0.32423776, 0.8091633...",,Highline College
4,"[[0.6531856, 0.5629838, 0.10420811, 0.7305842,...",,Highline Community Collage
...,...,...,...
2992,,Chemistry,Columbia University in the City of New York
2993,,Physical Chemistry,University of Science and Technology of China
2994,,,St. John's University
2995,,Political Behavior,Harpur College


## cointegrated/LaBSE-en-ru

https://huggingface.co/cointegrated/LaBSE-en-ru

In [44]:
def get_education_embeddings(df_education):
    
    tokenizer = AutoTokenizer.from_pretrained("cointegrated/LaBSE-en-ru")
    model = AutoModel.from_pretrained("cointegrated/LaBSE-en-ru")

    embedding_vec = []
    df_education_study = df_education[~df_education['education_field_of_study'].isna()][['education_field_of_study']].drop_duplicates().reset_index(drop=True)

    for i, row in enumerate(df_education_study['education_field_of_study']):
        
        print('{} / {}'.format(i+1, df_education_study.shape[0]), end="\r")

        embeddings = []

        if row == row:
            encoded_input = tokenizer(row, padding=True, truncation=True, max_length=64, return_tensors='pt')

            with torch.no_grad():
                model_output = model(**encoded_input)

            embeddings = model_output.pooler_output
            embeddings = torch.nn.functional.normalize(embeddings)

        embedding_vec.extend(np.array(embeddings))

    df_embeddings = pd.DataFrame(embedding_vec)
    
    df_embeddings = pd.concat(
        [
            df_embeddings, 
            df_education_study
        ], axis=1
    )
    
    return df_embeddings

In [47]:
df_education = pd.read_csv('data/uuid_x_education_fields.csv', index_col=0)
df_embeddings = get_education_embeddings(df_education)
df_embeddings.to_csv('data/embeddings/education_field_of_study.csv')

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1231 / 1231