In [72]:
import concurrent.futures
from ast import literal_eval
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
import tensorflow as tf
import torch
from tqdm import tqdm
from transformers import BartModel, BartTokenizer, TFBartModel

In [73]:
# Read the cleaned dataset from disk and show a preview of it.
df = pd.read_csv(filepath_or_buffer='Data/cleaned_data_2.csv')
df

Unnamed: 0,Head,Body,Tags,Text,Tags Count,Text_Cleaned
0,brain segmentation to 3d model,my goal is to take a dataset of brain tumor s...,"['computer-vision', 'python']",brain segmentation to 3d model my goal is to ...,2,brain segmentation 3d model goal dataset brain...
1,active learning regression with random forest,i have a dataset of about 8k points and i am ...,"['machine-learning', 'regression', 'uncertaint...",active learning regression with random forest ...,5,active learn regression random forest dataset ...
2,comparing reinforcement learning models,i am currently completing my thesis on optimi...,"['reinforcement-learning', 'policy-gradients',...",comparing reinforcement learning models i am ...,4,compare reinforcement learning model currently...
3,why good model that performs great on holdout ...,i have this binary regression model that has ...,"['deep-learning', 'deep-neural-networks', 'pre...",why good model that performs great on holdout ...,5,good model perform great holdout validation da...
4,what are reservoir computers used for today,reservoir computers were very popular in the ...,"['machine-learning', 'recurrent-neural-network...",what are reservoir computers used for today ...,3,reservoir computer today reservoir computer po...
...,...,...,...,...,...,...
48798,where to get older digital ocr d data sets of ...,i am trying to locate older unsummarized us c...,"['data-request', 'usa', 'us-census']",where to get older digital ocr d data sets of ...,3,old digital ocr datum set unsummarized census ...
48799,orthophoto rwanda free download,i m looking for image datas for east africa s...,"['data-request', 'images', 'aerial-photography...",orthophoto rwanda free download i m looking ...,4,orthophoto rwanda free download look image dat...
48800,finding online finance datasets,i am searching for finance datasets that has ...,"['data-request', 'finance']",finding online finance datasets i am searchin...,2,find online finance dataset search finance dat...
48801,healthcare finder api links broken,it seems that the healthcare finder api schem...,['healthcare-finder-api'],healthcare finder api links broken it seems ...,1,healthcare finder api link break healthcare fi...


In [65]:
# Parse the stringified tag lists (e.g. "['python', 'nlp']") into real Python lists.
# The isinstance guard keeps this cell re-runnable: values that are already parsed
# pass through untouched instead of making literal_eval raise on a non-string.
df['Tags'] = df['Tags'].apply(lambda raw: literal_eval(raw) if isinstance(raw, str) else raw)

# Flatten every row's tag list into one long list (duplicates are kept).
all_tags = [tag for row_tags in df['Tags'] for tag in row_tags]

# Get all unique tags; sorted so the order is stable across kernel restarts
# (plain set iteration order for strings can change from run to run).
unique_tags = sorted(set(all_tags))
print(len(unique_tags))

2686


In [66]:
from collections import Counter

# Tally how often each tag occurs, then keep the names of the 35 most frequent tags.
counts = Counter(all_tags)
frequencies_words = counts.most_common(35)  # (tag, count) pairs, most frequent first
tags_features = [tag for tag, _occurrences in frequencies_words]
tags_features

['machine-learning',
 'r',
 'regression',
 'deep-learning',
 'neural-networks',
 'data-request',
 'python',
 'reinforcement-learning',
 'classification',
 'time-series',
 'probability',
 'neural-network',
 'distributions',
 'bayesian',
 'hypothesis-testing',
 'keras',
 'mathematical-statistics',
 'scikit-learn',
 'logistic',
 'convolutional-neural-networks',
 'clustering',
 'tensorflow',
 'terminology',
 'nlp',
 'correlation',
 'self-study',
 'normal-distribution',
 'geospatial',
 'cross-validation',
 'optimization',
 'random-forest',
 'mixed-model',
 'data-mining',
 'feature-selection',
 'pca']

In [67]:
def most_common(tags, allowed=None):
    """Keep only the tags that belong to an allowed vocabulary.

    Args:
        tags: Iterable of tags for a single row (any iterable works, not
            only indexable sequences).
        allowed: Optional collection of tags to keep. When omitted, the
            notebook-level ``tags_features`` list is used, so existing
            ``most_common(tags)`` calls behave exactly as before.

    Returns:
        list: The tags from ``tags`` that are in the allowed collection, in
        their original order and with duplicates preserved. Empty when
        nothing matches.
    """
    if allowed is None:
        # Fall back to the vocabulary computed earlier in the notebook.
        allowed = tags_features
    return [tag for tag in tags if tag in allowed]

# Restrict each row's tags to the frequent-tag vocabulary; rows left with no
# tags are marked as missing so that dropna can remove them.
df['Tags'] = (
    df['Tags']
    .apply(most_common)
    .apply(lambda kept: kept if kept else None)
)
df = df.dropna(subset=['Tags'])

In [70]:
# Preview the five rows with the highest 'Tags Count'.
# NOTE(review): the visible cells filter 'Tags' but never recompute 'Tags Count',
# so this column may no longer equal len(Tags) — confirm that is intended.
df.sort_values('Tags Count', ascending=False).head(5)

Unnamed: 0,Head,Body,Tags,Text,Tags Count,Text_Cleaned
19322,sampling from skew normal distribution,i want to draw samples from a skew normal dis...,[distributions],sampling from skew normal distribution i want...,5,sample skew normal distribution want draw samp...
13517,hypothesis test for difference in medians amon...,question the test scores of three groups of p...,"[r, hypothesis-testing]",hypothesis test for difference in medians amon...,5,hypothesis test difference median sample quest...
13494,variance covariance matrix interpretation,assume we have a linear model model1 and vcov...,[r],variance covariance matrix interpretation ass...,5,variance covariance matrix interpretation assu...
13495,mixing continuous and binary data with linear ...,so i ve been playing around with svms and i w...,[feature-selection],mixing continuous and binary data with linear ...,5,mix continuous binary datum linear svm play sv...
13501,using glm as substitute for simple chi square ...,i am interested in changing the null hypothes...,"[r, hypothesis-testing]",using glm as substitute for simple chi square ...,5,glm substitute simple chi square test interest...


In [69]:
from sklearn.preprocessing import MultiLabelBinarizer

# One-hot encode the per-row tag lists: one indicator column per distinct tag.
mlb = MultiLabelBinarizer()
multi_label_encoded_tags = mlb.fit_transform(list(df['Tags']))

# Wrap the indicator matrix in a DataFrame whose columns are the tag names.
# NOTE(review): this frame gets a fresh 0..n-1 index, while df keeps its original
# row labels after the earlier dropna — combine the two by position, not by index.
multi_label_tags_df = pd.DataFrame(data=multi_label_encoded_tags, columns=mlb.classes_)

# Preview the first rows of the encoded labels
multi_label_tags_df.head()

Unnamed: 0,bayesian,classification,clustering,convolutional-neural-networks,correlation,cross-validation,data-mining,data-request,deep-learning,distributions,...,python,r,random-forest,regression,reinforcement-learning,scikit-learn,self-study,tensorflow,terminology,time-series
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Load the pretrained BART tokenizer and the PyTorch BartModel that the embedding
# step below runs. Any BART checkpoint name can be substituted for model_name.
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartModel.from_pretrained(model_name)

In [None]:
# Helper that builds one embedding per cleaned text (the list of embeddings is created inside it)


def create_embeddings(texts=None):
    """Embed each text with the notebook-level ``tokenizer`` and ``model``.

    Every text is tokenized on its own (truncated to at most 512 tokens), run
    through ``model`` with gradient tracking disabled, and represented by the
    hidden state at sequence position 0 of ``outputs.last_hidden_state``.

    Args:
        texts: Optional iterable of strings to embed. Defaults to the
            ``Text_Cleaned`` column of the notebook-level ``df``, so existing
            ``create_embeddings()`` calls behave as before.

    Returns:
        torch.Tensor: One row per input text, in input order. For an empty
        input the result has zero rows and ``model.config.hidden_size``
        columns instead of raising inside ``torch.cat``.
    """
    if texts is None:
        texts = df['Text_Cleaned'].tolist()

    embeddings = []

    # Inference only: no gradients are needed for any of the forward passes.
    with torch.no_grad():
        for text in tqdm(texts, desc="Generating BART embeddings"):
            # Non-string entries (e.g. NaN for an empty cell read back from CSV)
            # are embedded as an empty string so the output stays row-aligned
            # with the input instead of crashing the tokenizer.
            if not isinstance(text, str):
                text = ""

            inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
            outputs = model(**inputs)

            # Keep the hidden state at position 0 for this text.
            # NOTE(review): with a BART checkpoint this is presumably the
            # start-of-sequence slot rather than a BERT-style [CLS] token —
            # confirm this is the pooling you want.
            embeddings.append(outputs.last_hidden_state[:, 0, :])

    if not embeddings:
        # torch.cat rejects an empty list; return an empty matrix instead.
        return torch.empty((0, model.config.hidden_size))

    # Stack the per-text rows into a single tensor.
    return torch.cat(embeddings, dim=0)