In [1]:
! pip3 install datasets transformers torch



In [4]:
import numpy as np

In [6]:
from transformers import AutoTokenizer
from transformers import BertModel  # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import RobertaModel  # https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel
from transformers import DistilBertModel  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel



def get_model(model_name):
    """Load the pretrained tokenizer and encoder for a supported architecture.

    Args:
        model_name: one of 'bert', 'roberta' or 'distilbert'.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded from the same checkpoint.

    Raises:
        AssertionError: if ``model_name`` is not one of the supported names.
    """
    assert model_name in ['bert', 'roberta', 'distilbert']

    # architecture name -> (Hugging Face checkpoint id, model class)
    registry = {
        'bert': ('bert-base-cased', BertModel),  # https://huggingface.co/bert-base-cased
        'roberta': ('roberta-base', RobertaModel),  # https://huggingface.co/roberta-base
        'distilbert': ('distilbert-base-cased', DistilBertModel),  # https://huggingface.co/distilbert-base-cased
    }
    checkpoint, model_cls = registry[model_name]

    # Tokenizer is loaded first, then the model, both from the same checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    encoder = model_cls.from_pretrained(checkpoint)
    return tokenizer, encoder

In [8]:
import torch
import torch.nn as nn
import numpy as np

from warnings import filterwarnings

filterwarnings('ignore')

In [9]:
# Device that input batches are moved to before the forward pass (CPU here).
device = torch.device('cpu')

In [7]:
from tqdm import tqdm


@torch.inference_mode()
def get_embeddings_labels(model, loader):
    """Compute one embedding per example from the first-position hidden state.

    Despite the name, only embeddings are returned; no labels are read from
    the batches.

    Args:
        model: module whose forward output can be indexed with
            'last_hidden_state'. It is switched to eval mode.
        loader: iterable of batches; every batch must provide
            'attention_mask' and 'input_ids' tensors. Other keys are ignored.

    Returns:
        A CPU tensor with the per-batch embeddings concatenated along dim 0,
        in the order the loader yielded them.
    """
    model.eval()

    # Send inputs to wherever the model's weights live. Using the global
    # `device` unconditionally breaks with a device mismatch as soon as the
    # model and `device` disagree; `device` is kept as the fallback for
    # parameter-free models.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    total_embeddings = []

    for batch in tqdm(loader):
        batch = {key: batch[key].to(target_device) for key in ['attention_mask', 'input_ids']}

        # [:, 0, :] keeps position 0 of every sequence
        # (presumably the [CLS]-style token — depends on the tokenizer).
        embeddings = model(**batch)['last_hidden_state'][:, 0, :]

        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0)

In [8]:
tokenizer, model = get_model('bert')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [9]:
def tokenization(example):
    """Tokenize a batch of posts with the notebook-level ``tokenizer``.

    Args:
        example: mapping whose 'text' entry holds the texts of the batch
            (the function is used with ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer's encoding of the batch: special tokens added,
        no token_type_ids, truncation enabled. No padding is requested here.
    """
    # Call the tokenizer directly: `batch_encode_plus` is the legacy entry
    # point and `__call__` is its documented replacement for batches of texts.
    return tokenizer(
        example['text'],
        add_special_tokens=True,
        return_token_type_ids=False,
        truncation=True,
    )

In [10]:
import pandas as pd
from sqlalchemy import create_engine

In [14]:
import os

# Take the connection string from the environment so real credentials never
# have to be typed into the notebook; the fallback is the original placeholder DSN.
DB_URL = os.environ.get(
    "STARTML_DB_URL",
    "postgresql://robot-startml-ro:PASSWORD@"
    "HOST:6432/startml",
)
engine = create_engine(DB_URL)
# ORDER BY pins the row order: later cells attach embeddings to post_id purely
# by row position, so the order must be the same on every run.
post_text_df = pd.read_sql('select text, post_id from post_text_df order by post_id', con=engine)

In [15]:
post_text_df

Unnamed: 0,text,post_id
0,UK economy facing major risks\n\nThe UK manufa...,1
1,Aids and climate top Davos agenda\n\nClimate c...,2
2,Asian quake hits European shares\n\nShares in ...,3
3,India power shares jump on debut\n\nShares in ...,4
4,Lacroix label bought by US firm\n\nLuxury good...,5
...,...,...
7018,"OK, I would not normally watch a Farrelly brot...",7315
7019,I give this movie 2 stars purely because of it...,7316
7020,I cant believe this film was allowed to be mad...,7317
7021,The version I saw of this film was the Blockbu...,7318


In [23]:
len(post_text_df)

7023

In [24]:
post_text_df[['post_id']].to_csv('post_text_df_ids.csv')

In [13]:
post_text_df_text = post_text_df[['text']]

In [14]:
from datasets import load_dataset

# NOTE(review): in the visible notebook this IMDB split is only used to inspect
# its type in the next cell; the embeddings are computed from post_text_df.
# Confirm whether this download is still needed.
dataset = load_dataset("imdb", split="train")

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [31]:
type(dataset)

datasets.arrow_dataset.Dataset

In [15]:
post_text_df.dtypes

text       object
post_id     int64
dtype: object

In [16]:
from datasets import Dataset

In [17]:
post_text_df_dataset = Dataset.from_pandas(post_text_df)

In [18]:
post_text_df_dataset

Dataset({
    features: ['text', 'post_id'],
    num_rows: 7023
})

In [19]:
# Tokenize all posts in batches; the tokenizer outputs are added as new columns
# ('input_ids', 'attention_mask') next to the existing ones.
train_dataset = post_text_df_dataset.map(tokenization, batched=True)
# Expose only these columns, as torch tensors, when the dataset is indexed.
# The raw 'text' column is still listed in the dataset's features
# (presumably just not returned — confirm against datasets.set_format).
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "post_id"])

Map:   0%|          | 0/7023 [00:00<?, ? examples/s]

In [20]:
train_dataset

Dataset({
    features: ['text', 'post_id', 'input_ids', 'attention_mask'],
    num_rows: 7023
})

In [21]:
from torch.utils.data import DataLoader

In [22]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [23]:
# shuffle=False is load-bearing: the embeddings are later attached to post_id
# by row position, so batches must follow the dataset order.
# data_collator pads each batch, since tokenization() itself requests no padding.
train_loader = DataLoader(train_dataset, batch_size=64, collate_fn=data_collator, pin_memory=True, shuffle=False)

In [24]:
train_embeddings = get_embeddings_labels(model, train_loader)

100%|██████████| 110/110 [1:43:58<00:00, 56.71s/it]


In [25]:
train_embeddings

tensor([[ 0.1404, -0.1407, -0.5757,  ..., -0.1379,  0.0430,  0.1423],
        [ 0.1575, -0.0977, -0.2307,  ..., -0.3009,  0.1905,  0.0198],
        [ 0.3146, -0.1152, -0.1813,  ..., -0.3541, -0.2043, -0.0270],
        ...,
        [ 0.6195,  0.2746, -0.1265,  ..., -0.3581, -0.1643,  0.1710],
        [ 0.6941,  0.0672, -0.2287,  ...,  0.0379,  0.1410,  0.1244],
        [ 0.4166,  0.1736, -0.1788,  ..., -0.2106,  0.3133,  0.0338]])

In [26]:
torch.save(train_embeddings, 'karpov_start_ml_post_embeddings.pt')

In [60]:
# Reload the embeddings from an absolute Kaggle input path.
# NOTE(review): presumably this is the file written by torch.save above,
# re-uploaded as a Kaggle dataset and read in a later session — confirm.
train_embeddings_loaded = torch.load('/kaggle/input/karpov-start-ml-post-embeddings/karpov_start_ml_post_embeddings.pt')

In [61]:
train_embeddings_loaded

tensor([[ 0.1404, -0.1407, -0.5757,  ..., -0.1379,  0.0430,  0.1423],
        [ 0.1575, -0.0977, -0.2307,  ..., -0.3009,  0.1905,  0.0198],
        [ 0.3146, -0.1152, -0.1813,  ..., -0.3541, -0.2043, -0.0270],
        ...,
        [ 0.6195,  0.2746, -0.1265,  ..., -0.3581, -0.1643,  0.1710],
        [ 0.6941,  0.0672, -0.2287,  ...,  0.0379,  0.1410,  0.1244],
        [ 0.4166,  0.1736, -0.1788,  ..., -0.2106,  0.3133,  0.0338]])

In [46]:
len(train_embeddings_loaded)

7023

In [62]:
train_embeddings_pd = pd.DataFrame(train_embeddings_loaded.numpy())

In [63]:
train_embeddings_pd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.140364,-0.140695,-0.575681,-0.118175,-0.315324,-0.114378,0.431976,-0.144286,0.003233,-1.187819,...,0.051768,0.614536,-0.613107,0.131062,0.202857,0.175931,-0.167769,-0.137880,0.042959,0.142283
1,0.157530,-0.097739,-0.230651,-0.364432,-0.242782,0.310065,0.374489,-0.089235,0.202154,-1.130712,...,0.586032,0.652154,-0.112308,-0.085171,-0.051814,0.240048,0.200299,-0.300891,0.190543,0.019753
2,0.314568,-0.115163,-0.181322,-0.274698,-0.357378,0.285227,0.266597,0.002641,-0.033905,-1.092079,...,0.416194,0.641697,-0.326962,-0.042802,-0.073849,0.212023,-0.090193,-0.354050,-0.204323,-0.027024
3,0.415116,-0.241301,-0.260733,-0.436027,-0.194695,0.130077,0.458805,-0.235222,-0.032935,-1.008514,...,0.791115,0.562938,-0.194190,0.022462,0.108904,0.019628,0.362090,-0.150884,-0.048834,0.083071
4,0.614585,-0.235812,-0.047733,-0.406701,-0.284798,0.124150,0.545586,-0.284447,0.047562,-1.139114,...,0.605897,0.518612,0.007372,0.032436,0.015634,0.055696,0.145705,-0.061322,-0.021120,0.121545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,0.521686,0.292126,-0.210760,-0.219636,0.124439,-0.032428,0.055145,0.118430,-0.033624,-1.167992,...,0.575904,0.319888,-0.427477,0.161139,0.179985,0.053670,0.051577,-0.353712,-0.010725,-0.047067
7019,0.487033,0.442944,-0.251732,-0.303212,0.068587,-0.176149,0.235079,-0.050546,-0.011602,-1.161576,...,0.077758,0.174062,-0.359764,-0.196519,0.043920,0.178851,-0.054635,-0.233469,0.349205,-0.031997
7020,0.619477,0.274618,-0.126504,-0.110225,0.167652,-0.184624,0.244474,0.020286,0.058746,-1.046321,...,0.307723,0.187838,-0.378197,-0.223615,0.019644,0.250249,0.038904,-0.358137,-0.164278,0.171024
7021,0.694089,0.067175,-0.228680,-0.255529,0.134818,0.223566,0.249515,-0.032106,-0.160485,-1.041768,...,0.361884,0.352622,-0.307157,-0.139962,0.150054,0.015053,0.045805,0.037859,0.141017,0.124412


In [64]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [65]:
train_embeddings_pd_scaled = pd.DataFrame(StandardScaler().fit_transform(train_embeddings_pd), columns = train_embeddings_pd.columns)

In [66]:
train_embeddings_pd_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-2.226275,-0.968504,-2.513918,0.980526,-1.106496,-1.079924,0.842931,-0.698148,0.634366,-0.451730,...,-2.161790,1.263494,-2.637722,1.442304,0.325142,0.970355,-1.880117,1.028949,-0.841883,-0.001032
1,-2.119086,-0.710049,-0.245295,-0.563688,-0.725012,1.865509,0.424445,-0.283684,2.100620,0.018103,...,0.886795,1.475879,0.582222,-0.081343,-1.620458,1.457580,0.564606,-0.255328,-0.012353,-0.930736
2,-1.138537,-0.814883,0.079047,-0.000987,-1.327645,1.693145,-0.360969,0.408026,0.360620,0.335949,...,-0.082326,1.416839,-0.797919,0.217202,-1.788805,1.244618,-1.364851,-0.674139,-2.231793,-1.285667
3,-0.510713,-1.573825,-0.443090,-1.012640,-0.472138,0.616479,1.038229,-1.382785,0.367770,1.023455,...,2.057023,0.972184,0.055753,0.677075,-0.392629,-0.217396,1.639228,0.926497,-1.357829,-0.450310
4,0.734776,-1.540800,0.957415,-0.828746,-0.945968,0.575350,1.669962,-1.753381,0.961121,-0.051017,...,1.000144,0.721933,1.351722,0.747356,-1.105183,0.056689,0.201989,1.632111,-1.202055,-0.158383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,0.154711,1.635665,-0.114510,0.344291,1.206102,-0.511231,-1.900255,1.279768,0.362692,-0.288609,...,0.829004,-0.400017,-1.444190,1.654237,0.150407,0.041294,-0.423208,-0.671475,-1.143625,-1.437741
7019,-0.061663,2.543093,-0.383905,-0.179794,0.912393,-1.508585,-0.590407,0.007595,0.525020,-0.235818,...,-2.013489,-1.223317,-1.008824,-0.865937,-0.889083,0.992546,-1.128677,0.275855,0.879451,-1.323394
7020,0.765321,1.530322,0.439484,1.030381,1.433351,-1.567392,-0.522013,0.540869,1.043558,0.712412,...,-0.701277,-1.145538,-1.127339,-1.056862,-1.074547,1.535100,-0.507387,-0.706337,-2.006707,0.217037
7021,1.231200,0.282196,-0.232338,0.119214,1.260686,1.265249,-0.485316,0.146421,-0.572407,0.749870,...,-0.392226,-0.215209,-0.670577,-0.467412,-0.078260,-0.252157,-0.461551,2.413503,-0.290722,-0.136629


In [67]:
# Full decomposition (all components) — used for the loadings analysis.
pca = PCA()
# 20-component projection. With n_components far below the feature count,
# scikit-learn's default 'auto' solver can choose the randomized SVD, whose
# output varies from run to run; a fixed random_state makes it reproducible.
pca_20 = PCA(n_components=20, random_state=42)
train_embeddings_pd_scaled_pca = pca.fit_transform(train_embeddings_pd_scaled)
train_embeddings_pd_scaled_pca_20 = pca_20.fit_transform(train_embeddings_pd_scaled)

In [68]:
train_embeddings_pd_scaled_pca_20

array([[  3.5003207 , -10.554127  ,  11.443293  , ...,  -4.582694  ,
         -0.03171147,   0.16942866],
       [ 18.671574  ,  -5.1379676 ,   1.8498105 , ...,   4.495288  ,
         -3.96261   ,   2.2637305 ],
       [ 13.9680395 ,  -4.589651  ,   1.5246543 , ...,   1.0083798 ,
          0.5198224 ,   0.9097384 ],
       ...,
       [-12.519737  ,  -5.040771  ,  -4.923931  , ...,  -2.5517073 ,
         -5.765186  ,   1.0656378 ],
       [-12.791195  ,  -7.078714  ,  -5.3110685 , ...,   1.6770834 ,
          0.14435135,   1.7845887 ],
       [ -9.455527  ,  -2.712885  ,  -4.3599415 , ...,  -0.74579   ,
          0.6765629 ,   1.7739762 ]], dtype=float32)

In [26]:
component_names = [f"PC{i}" for i in range(train_embeddings_pd_scaled_pca.shape[1])]

In [27]:
train_embeddings_pd_scaled_pca = pd.DataFrame(train_embeddings_pd_scaled_pca, columns=component_names)

In [28]:
train_embeddings_pd_scaled_pca

Unnamed: 0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,...,PC758,PC759,PC760,PC761,PC762,PC763,PC764,PC765,PC766,PC767
0,3.500336,-10.554125,11.443368,0.846671,14.404177,9.562722,-3.731608,6.060801,6.918285,1.862912,...,-0.062729,0.029884,-0.154509,0.121232,-0.074669,-0.009031,-0.031021,-0.081318,-0.000312,2.131922e-06
1,18.671450,-5.137882,1.849874,-5.967723,0.369285,2.440284,-2.113250,0.073235,3.221483,-0.569269,...,0.017097,0.072999,0.036715,0.017992,-0.167660,0.004864,0.071762,-0.046769,0.017061,2.621733e-06
2,13.968046,-4.589670,1.524743,-8.710584,1.043148,7.366840,-1.090917,4.899538,3.290685,-0.970370,...,-0.018713,-0.026060,-0.163573,0.016941,-0.140379,0.029393,-0.005162,0.080823,0.048594,1.953295e-07
3,21.775539,0.041371,-15.245060,-1.202125,-1.096159,-0.474873,1.177331,-5.798908,-0.635817,-2.379454,...,0.049855,-0.030027,0.041118,-0.160866,0.046234,0.031507,-0.011860,-0.124127,-0.058203,-5.878608e-07
4,14.218118,1.002418,-13.375751,-3.665715,1.368460,-0.714068,5.984356,-1.590242,-2.126222,1.919587,...,-0.017964,0.120475,0.097959,-0.030512,0.059015,-0.119550,-0.018292,0.043570,-0.001885,-1.498635e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,-11.655507,-6.354016,-5.219136,-9.544831,0.303802,-2.729471,1.988417,1.083716,-6.699744,-5.721925,...,-0.249750,-0.132337,0.017557,-0.139617,0.184849,0.068938,-0.104898,0.049059,-0.021076,1.055397e-06
7019,-14.031913,-3.515823,1.317731,-5.516072,1.923898,-3.453608,1.813940,2.135828,2.385834,2.012687,...,-0.050444,-0.085703,0.025632,0.050370,0.043103,-0.057675,-0.055433,-0.023944,-0.074262,-2.731755e-08
7020,-12.519740,-5.040761,-4.923893,-7.652007,0.356568,-0.637038,1.835084,6.786277,-3.118861,0.290987,...,0.055840,-0.108065,-0.146602,0.009536,0.063658,-0.054534,0.024698,0.265068,-0.122958,2.356922e-07
7021,-12.791207,-7.078700,-5.311089,3.794681,2.512999,4.069834,2.206067,-1.290851,-1.762917,3.177469,...,-0.093918,0.095018,-0.137023,0.027822,-0.164462,0.099774,0.035161,-0.151763,0.002454,2.985225e-07


In [29]:
# Loadings matrix: rows are the original embedding dimensions, columns are the
# principal components of the full `pca` fit.
# Column labels are derived from the fitted `pca` itself instead of the shared
# `component_names` variable, which a later cell reassigns to only 20 names;
# re-running this cell after that would otherwise fail on a length mismatch.
loadings = pd.DataFrame(
    pca.components_.T,  # transpose so that each column is one principal component
    columns=[f"PC{i}" for i in range(pca.components_.shape[0])],
    index=train_embeddings_pd_scaled.columns,  # rows are the original features
)

In [38]:
loadings

Unnamed: 0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,...,PC758,PC759,PC760,PC761,PC762,PC763,PC764,PC765,PC766,PC767
0,-0.033047,-0.024958,-0.062475,0.041026,0.037600,-0.058788,-0.022720,-0.013842,-0.045411,0.027595,...,-0.014654,0.049391,-0.015150,-0.037079,-0.018770,-0.012042,0.014351,0.005570,-0.017530,0.035876
1,-0.059103,0.021693,0.017387,-0.031966,0.001261,-0.034886,0.010657,0.020518,0.055132,0.011753,...,-0.034912,0.050434,-0.012284,-0.000653,-0.001346,-0.019204,0.002811,0.011819,0.003528,0.036515
2,-0.017657,0.037226,-0.079830,0.006547,-0.022905,-0.042617,-0.015863,0.034774,0.057637,-0.015438,...,0.028919,-0.007539,-0.000196,-0.022989,0.029524,0.011771,-0.006711,0.015892,-0.012555,0.033555
3,-0.011952,-0.063368,0.016724,-0.000060,-0.035585,0.013764,0.028925,0.054948,0.055341,-0.014898,...,-0.008103,0.007281,0.031351,0.008476,0.030312,-0.006026,0.018965,0.010826,-0.002447,0.036096
4,-0.057638,-0.021426,-0.034377,0.032706,-0.066187,-0.017290,-0.026411,0.017030,0.024773,0.051523,...,-0.028710,0.053930,0.026539,-0.021690,-0.019257,-0.003569,-0.013595,-0.012563,0.013080,0.041893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,0.001595,-0.037204,0.043584,-0.073025,0.048386,0.030981,0.029688,0.004749,-0.018040,-0.011050,...,0.008907,-0.032326,0.009987,-0.026568,0.037277,-0.028954,0.004385,-0.015998,-0.017729,0.029094
764,0.033312,0.040021,-0.057507,0.056117,-0.020411,-0.017351,-0.019996,-0.027046,0.014330,-0.066551,...,-0.009765,0.013999,0.016627,0.019676,-0.001498,-0.009462,0.005836,0.021177,-0.009381,0.032821
765,-0.023092,-0.026986,-0.031908,0.003352,0.044574,0.021858,0.050387,-0.019478,0.021437,0.028619,...,-0.013877,0.031386,0.007656,0.010299,-0.026087,0.015193,-0.004282,-0.003463,0.002266,0.027737
766,-0.022607,0.071099,0.026232,0.035534,-0.005482,0.008113,-0.042405,-0.018988,0.043663,0.020192,...,-0.024615,-0.059958,-0.006414,-0.011648,-0.063081,0.034875,0.002106,-0.009297,0.015442,0.038830


In [31]:
# For every principal component (column of `loadings`) take its largest absolute
# loading, then keep the 20 components with the highest such value.
# NOTE(review): the index of `features` is therefore PC names, not original
# features, and the top entries are trailing components (PC766, PC765, ...).
# Confirm this is the intended selection.
features = pd.DataFrame(loadings.abs().max(), columns=['MaxAbsLoading',]).sort_values(by='MaxAbsLoading', ascending=False).head(20)

In [32]:
features

Unnamed: 0,MaxAbsLoading
PC766,0.740179
PC765,0.710411
PC764,0.50373
PC759,0.350969
PC753,0.320269
PC760,0.30695
PC758,0.280416
PC756,0.243263
PC745,0.231833
PC757,0.225989


In [35]:
train_embeddings_pd_scaled_pca[list(features.index)]

Unnamed: 0,PC766,PC765,PC764,PC759,PC753,PC760,PC758,PC756,PC745,PC757,PC752,PC744,PC715,PC742,PC746,PC754,PC738,PC751,PC704,PC762
0,-0.000312,-0.081318,-0.031021,0.029884,-0.107984,-0.154509,-0.062729,0.037755,-0.047064,-0.001755,0.056759,0.109086,-0.163743,0.181573,0.134287,0.084312,0.026143,0.038190,-0.127326,-0.074669
1,0.017061,-0.046769,0.071762,0.072999,-0.052991,0.036715,0.017097,0.120867,0.018113,0.137636,-0.090864,0.133230,-0.069980,0.282287,0.134517,-0.021100,0.074887,0.061903,-0.049114,-0.167660
2,0.048594,0.080823,-0.005162,-0.026060,0.262233,-0.163573,-0.018713,0.132400,-0.019600,0.036248,-0.068171,-0.174459,-0.035049,-0.194351,-0.112726,0.014594,-0.048624,-0.045708,0.142024,-0.140379
3,-0.058203,-0.124127,-0.011860,-0.030027,0.039809,0.041118,0.049855,0.044031,-0.018160,-0.082014,0.138014,-0.075892,-0.190984,-0.049408,0.053399,-0.014095,0.054658,-0.159144,-0.024494,0.046234
4,-0.001885,0.043570,-0.018292,0.120475,0.067893,0.097959,-0.017964,0.065755,0.059288,0.000016,0.037175,0.037889,0.193045,0.099393,-0.155154,-0.079835,0.216089,0.094944,0.065773,0.059015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,-0.021076,0.049059,-0.104898,-0.132337,-0.174672,0.017557,-0.249750,0.018358,-0.105013,0.088768,0.233771,-0.050677,0.020114,-0.033980,-0.124970,-0.226325,-0.092426,0.053109,0.011257,0.184849
7019,-0.074262,-0.023944,-0.055433,-0.085703,-0.065089,0.025632,-0.050444,-0.294043,-0.212842,0.069230,0.008649,-0.113231,0.089198,-0.033277,-0.110416,0.035986,-0.007794,-0.187976,0.112237,0.043103
7020,-0.122958,0.265068,0.024698,-0.108065,-0.012044,-0.146602,0.055840,0.063056,-0.073128,0.080686,-0.056487,-0.105472,0.264888,-0.048871,0.144098,0.010786,-0.017700,0.157449,-0.038367,0.063658
7021,0.002454,-0.151763,0.035161,0.095018,-0.026309,-0.137023,-0.093918,-0.041044,0.056034,-0.042827,0.047013,-0.080086,-0.130915,-0.070475,0.138889,0.038513,0.107979,0.024213,0.015228,-0.164462


In [55]:
# back to examining the 20 principal components
train_embeddings_pd_scaled_pca_20

array([[  3.5002549 , -10.554101  ,  11.443374  , ...,  -4.494757  ,
         -0.08476   ,   0.1771761 ],
       [ 18.671556  ,  -5.1378994 ,   1.849827  , ...,   4.3828926 ,
         -3.4995131 ,   2.5403242 ],
       [ 13.96803   ,  -4.589647  ,   1.524748  , ...,   0.8620815 ,
          0.76847804,   0.9713916 ],
       ...,
       [-12.519722  ,  -5.040778  ,  -4.9239063 , ...,  -2.6189222 ,
         -5.646084  ,   1.2622683 ],
       [-12.791179  ,  -7.0787196 ,  -5.3110776 , ...,   1.5905668 ,
          0.1901619 ,   1.7179626 ],
       [ -9.455515  ,  -2.7128882 ,  -4.359954  , ...,  -0.81096685,
          0.99308854,   1.7019193 ]], dtype=float32)

In [69]:
component_names = [f"PC{i}" for i in range(train_embeddings_pd_scaled_pca_20.shape[1])]

In [70]:
train_embeddings_pd_scaled_pca_20 = pd.DataFrame(train_embeddings_pd_scaled_pca_20, columns=component_names)

In [71]:
train_embeddings_pd_scaled_pca_20

Unnamed: 0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19
0,3.500321,-10.554127,11.443293,0.846715,14.404195,9.562678,-3.731412,6.060891,6.917934,1.863552,1.141031,-4.023715,13.655045,-1.540065,-5.287158,0.871138,1.230013,-4.582694,-0.031711,0.169429
1,18.671574,-5.137968,1.849810,-5.967700,0.369379,2.440308,-2.113307,0.072210,3.222704,-0.568078,0.762847,-1.960697,-2.824781,-3.384867,0.505251,2.872391,-2.057101,4.495288,-3.962610,2.263731
2,13.968040,-4.589651,1.524654,-8.710602,1.043155,7.366778,-1.090990,4.899185,3.291355,-0.969823,-7.690031,-3.501015,3.778787,3.028109,0.963784,-0.481186,-2.623369,1.008380,0.519822,0.909738
3,21.775520,0.041360,-15.245056,-1.202087,-1.096143,-0.474863,1.177425,-5.798441,-0.636396,-2.379045,1.567016,0.371143,1.368155,2.801944,5.111173,-3.664513,3.177406,3.842894,-1.363071,-1.477227
4,14.218131,1.002445,-13.375771,-3.665664,1.368440,-0.714064,5.984372,-1.590061,-2.126425,1.920048,-0.204347,0.314087,-1.205342,2.404388,-3.078573,-0.399450,-2.173429,-0.123188,2.112833,-0.496121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,-11.655491,-6.354025,-5.219182,-9.544808,0.303797,-2.729472,1.988416,1.083812,-6.699657,-5.721805,-0.368224,-2.882448,-0.521882,-3.632641,1.025860,0.398515,-2.544691,3.047159,1.724019,-5.951403
7019,-14.031900,-3.515837,1.317705,-5.516077,1.923890,-3.453614,1.813950,2.135833,2.385682,2.012163,0.859299,5.260887,1.577358,3.894874,3.642095,-1.613258,-4.488069,-2.539954,1.353190,-0.450315
7020,-12.519737,-5.040771,-4.923931,-7.651985,0.356563,-0.637029,1.835101,6.786124,-3.118750,0.292549,-3.319621,-0.992389,-2.757275,6.702721,3.893083,5.556788,-0.638900,-2.551707,-5.765186,1.065638
7021,-12.791195,-7.078714,-5.311069,3.794702,2.513006,4.069836,2.206091,-1.290894,-1.762866,3.178883,-2.517108,-3.440595,1.071892,-0.993795,-1.344589,-1.195211,-1.756481,1.677083,0.144351,1.784589


In [74]:
# concat(axis=1) aligns on the index, so both frames must describe the same
# rows in the same order; fail loudly if the row counts ever diverge.
assert len(post_text_df) == len(train_embeddings_pd_scaled_pca_20), \
    "row count mismatch between posts and embeddings"

train_embeddings_pd_scaled_pca_20 = pd.concat(
    [
        post_text_df['post_id'].reset_index(drop=True),
        # Drop a previously attached post_id so that re-running this cell
        # does not add a second, duplicate post_id column.
        train_embeddings_pd_scaled_pca_20
        .drop(columns='post_id', errors='ignore')
        .reset_index(drop=True),
    ],
    axis=1,
)

In [75]:
train_embeddings_pd_scaled_pca_20

Unnamed: 0,post_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,...,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19
0,1,3.500321,-10.554127,11.443293,0.846715,14.404195,9.562678,-3.731412,6.060891,6.917934,...,1.141031,-4.023715,13.655045,-1.540065,-5.287158,0.871138,1.230013,-4.582694,-0.031711,0.169429
1,2,18.671574,-5.137968,1.849810,-5.967700,0.369379,2.440308,-2.113307,0.072210,3.222704,...,0.762847,-1.960697,-2.824781,-3.384867,0.505251,2.872391,-2.057101,4.495288,-3.962610,2.263731
2,3,13.968040,-4.589651,1.524654,-8.710602,1.043155,7.366778,-1.090990,4.899185,3.291355,...,-7.690031,-3.501015,3.778787,3.028109,0.963784,-0.481186,-2.623369,1.008380,0.519822,0.909738
3,4,21.775520,0.041360,-15.245056,-1.202087,-1.096143,-0.474863,1.177425,-5.798441,-0.636396,...,1.567016,0.371143,1.368155,2.801944,5.111173,-3.664513,3.177406,3.842894,-1.363071,-1.477227
4,5,14.218131,1.002445,-13.375771,-3.665664,1.368440,-0.714064,5.984372,-1.590061,-2.126425,...,-0.204347,0.314087,-1.205342,2.404388,-3.078573,-0.399450,-2.173429,-0.123188,2.112833,-0.496121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,7315,-11.655491,-6.354025,-5.219182,-9.544808,0.303797,-2.729472,1.988416,1.083812,-6.699657,...,-0.368224,-2.882448,-0.521882,-3.632641,1.025860,0.398515,-2.544691,3.047159,1.724019,-5.951403
7019,7316,-14.031900,-3.515837,1.317705,-5.516077,1.923890,-3.453614,1.813950,2.135833,2.385682,...,0.859299,5.260887,1.577358,3.894874,3.642095,-1.613258,-4.488069,-2.539954,1.353190,-0.450315
7020,7317,-12.519737,-5.040771,-4.923931,-7.651985,0.356563,-0.637029,1.835101,6.786124,-3.118750,...,-3.319621,-0.992389,-2.757275,6.702721,3.893083,5.556788,-0.638900,-2.551707,-5.765186,1.065638
7021,7318,-12.791195,-7.078714,-5.311069,3.794702,2.513006,4.069836,2.206091,-1.290894,-1.762866,...,-2.517108,-3.440595,1.071892,-0.993795,-1.344589,-1.195211,-1.756481,1.677083,0.144351,1.784589


In [76]:
post_text_df['post_id'].max()

7319

In [77]:
train_embeddings_pd_scaled_pca_20.to_csv('text_embeddings.csv')

In [None]:
# NOTE: the code below is still unprocessed (work in progress)

In [None]:
from sklearn.cluster import KMeans

In [None]:
kmeans = KMeans(n_clusters=5, n_init=5, random_state=42)

In [None]:
# NOTE(review): `df` and `X_stand` are not defined anywhere in the visible
# notebook — confirm where they are supposed to come from.
# Fit the clustering once and store the cluster label of every row.
df["cluster_5"] = kmeans.fit_predict(df[X_stand])
# Distances of every row to each fitted centroid. transform() reuses the fit
# above; fit_transform() would refit the same model (same random_state) a
# second time only to arrive at the same centroids.
X_cd5 = kmeans.transform(df[X_stand])
X_cd5 = pd.DataFrame(X_cd5, columns=[f"cluster_5_centroid_{i}" for i in range(X_cd5.shape[1])])