# Note: the idea is to reduce the dimensionality of the text embeddings

# Import libs

In [1]:
import os, sys
import pickle

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from torch.utils.data import DataLoader, Dataset
import torch

from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp
from sklearn.metrics import accuracy_score

from sklearn.decomposition import PCA

# Import data and transform

In [2]:
# Load the per-post table: post_id, topic flags and the raw text-embedding columns.
df = pd.read_parquet(path="df_posts_embedings.parquet")

In [3]:
# Preview the loaded frame (rich display of the cell's last expression).
df

Unnamed: 0,post_id,topic_covid,topic_entertainment,topic_movie,topic_politics,topic_sport,topic_tech,0,1,2,...,758,759,760,761,762,763,764,765,766,767
0,1,False,False,False,False,False,False,0.363151,0.048937,-0.264081,...,0.349080,0.290132,-0.244970,0.078532,0.137399,0.208097,-0.058624,-0.141593,0.015918,0.000092
1,2,False,False,False,False,False,False,0.236417,-0.159501,-0.327798,...,0.311639,0.297819,-0.177003,0.130227,-0.063239,0.190171,-0.018153,-0.289936,0.119365,-0.001623
2,3,False,False,False,False,False,False,0.375191,-0.113944,-0.240547,...,0.353616,0.308457,-0.207151,0.056724,0.056596,0.125300,0.021575,-0.338919,0.058694,-0.021266
3,4,False,False,False,False,False,False,0.273770,-0.048748,-0.440433,...,0.321182,0.218213,-0.267988,-0.093801,0.176987,0.251617,0.028331,-0.155709,0.136188,0.044055
4,5,False,False,False,False,False,False,0.297853,-0.073203,-0.146820,...,0.168949,0.208978,-0.051180,0.045685,0.173986,0.148893,0.097255,-0.239587,0.228066,0.189831
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,7315,False,False,True,False,False,False,0.338545,0.084620,-0.225981,...,0.495477,0.203647,-0.138003,0.148754,0.138934,0.199463,0.054088,-0.110224,0.039229,-0.003550
7019,7316,False,False,True,False,False,False,0.354057,0.053933,-0.099446,...,0.322798,0.176826,-0.154205,-0.010798,0.100222,0.093780,0.051934,-0.119376,0.182106,0.072430
7020,7317,False,False,True,False,False,False,0.340383,0.066492,-0.163184,...,0.372651,0.169435,-0.041522,-0.033723,0.047250,0.173592,-0.027378,-0.086563,0.203404,0.032091
7021,7318,False,False,True,False,False,False,0.432092,0.011092,-0.117306,...,0.468103,0.156609,-0.054083,0.210478,0.116214,0.064117,0.084667,0.075402,0.102740,0.015274


In [8]:
# Select the embedding columns by name rather than by position: in the frame
# previewed above they are the columns labelled 0..767, while the metadata
# columns (post_id, topic_*) have non-numeric names. Unlike a positional
# slice, this does not pick up extra columns (e.g. kmean_label) when the cell
# is re-run after `df` has been extended further down the notebook.
columns = [col for col in df.columns if str(col).isdigit()]
assert columns, "no embedding columns found in df"

In [11]:
# Keep only the embedding columns; this is the matrix that gets reduced below.
df_text = df.loc[:, columns]

In [15]:
# Subtract each column's mean so every embedding dimension is zero-centred.
centered = df_text.sub(df_text.mean(), axis="columns")

# Define PCA

In [16]:
# Project the centred embeddings onto their first 50 principal components.
# random_state is pinned so the projection (and therefore the clusters fitted
# on it) is reproducible: with the default svd_solver='auto' scikit-learn may
# pick the randomized solver for an input of this size, which is stochastic.
pca = PCA(n_components=50, random_state=0)
pca_decomp = pca.fit_transform(centered)

# Define KMeans clustering

In [17]:
from sklearn.cluster import KMeans

In [18]:
n_clusters = 15

# n_init is passed explicitly: relying on the default emits a FutureWarning
# (the default is moving away from 10), and n_init=10 keeps the behaviour this
# notebook was originally run with. random_state fixes the centroid seeding.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(pca_decomp)

  super()._check_params_vs_input(X, default_n_init=10)


In [28]:
# Sanity check: assign the first post to a cluster. Slicing with [:1] keeps
# the single row two-dimensional, which is the shape predict() is given here.
kmeans.predict(pca_decomp[:1])

array([2], dtype=int32)

In [31]:
# One feature name per cluster: DistanceToCluster_0 .. DistanceToCluster_{n_clusters - 1}.
dists_columns = [
    f'DistanceToCluster_{i}'
    for i in range(n_clusters)
]

In [34]:
# Attach the cluster id KMeans assigned to each row during fitting.
df = df.assign(kmean_label=kmeans.labels_)

In [32]:
# kmeans.transform maps every row to its distances from the cluster centres,
# one column per cluster. Reuse df's index so the column-wise concat with `df`
# below lines rows up by label even if df's index is not a plain 0..n-1 range
# (a default RangeIndex here could silently misalign rows and introduce NaNs).
dists_df = pd.DataFrame(
    data=kmeans.transform(pca_decomp),
    index=df.index,
    columns=dists_columns,
)

In [37]:
# Put the post metadata/labels and the distance features side by side, then
# discard the raw embedding columns so only the compact features remain.
posts_info = pd.concat([df, dists_df], axis="columns").drop(columns=columns)

posts_info

Unnamed: 0,post_id,topic_covid,topic_entertainment,topic_movie,topic_politics,topic_sport,topic_tech,kmean_label,DistanceToCluster_0,DistanceToCluster_1,...,DistanceToCluster_5,DistanceToCluster_6,DistanceToCluster_7,DistanceToCluster_8,DistanceToCluster_9,DistanceToCluster_10,DistanceToCluster_11,DistanceToCluster_12,DistanceToCluster_13,DistanceToCluster_14
0,1,False,False,False,False,False,False,2,3.729024,2.998742,...,3.609131,3.383667,3.417403,3.373112,2.350711,1.900091,2.218462,2.839048,3.461114,3.412472
1,2,False,False,False,False,False,False,2,3.531465,2.843195,...,3.347113,3.369123,3.323233,3.324048,2.316050,2.180061,2.233397,2.555996,3.138126,3.218686
2,3,False,False,False,False,False,False,2,3.538897,3.058267,...,3.340357,3.501180,3.348733,3.265760,2.390932,1.806858,3.036396,2.888302,3.141400,3.283395
3,4,False,False,False,False,False,False,10,3.031447,3.260913,...,3.792361,3.742004,3.731672,3.517551,2.811656,2.429931,3.393587,3.379051,3.792460,3.695954
4,5,False,False,False,False,False,False,10,3.280477,2.642357,...,3.022553,2.793506,2.803656,3.028448,2.009257,1.457895,2.923765,2.129916,2.764996,2.838905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,7315,False,False,True,False,False,False,7,2.336807,2.816310,...,2.945988,1.798824,1.283884,3.135332,2.738250,3.003807,3.355360,2.330962,1.821759,2.005075
7019,7316,False,False,True,False,False,False,7,2.197449,2.500667,...,2.594715,1.426906,0.940390,2.930152,2.448724,2.967479,3.189711,2.229802,1.843958,1.767640
7020,7317,False,False,True,False,False,False,7,2.626637,2.534971,...,2.372376,2.012436,1.491582,2.834718,2.810127,3.188147,3.403540,2.444165,1.992768,2.170022
7021,7318,False,False,True,False,False,False,6,1.975253,3.106200,...,3.297663,1.052030,1.489237,3.432551,2.991727,3.197914,3.443768,2.307946,1.514832,1.855331


# Save new features for post_id

In [38]:
# Persist the per-post features (topic flags, cluster label, cluster distances).
posts_info.to_parquet(path="df_posts_new_features.parquet")