In [24]:
from bertopic import BERTopic
import pandas as pd
import numpy as np
from sklearn.utils import resample
import string
import contractions
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from matplotlib import pyplot as plt
import re
import seaborn as sns
from scipy.stats import pointbiserialr
import torch
torch.cuda.set_device(1)  

In [6]:
df = pd.read_csv(f'/data/jmharja/projects/PersonaClassifier/twitter_SU/classification/2020_personality.csv')
trait_cols = ['cOPN', 'cCON', 'cEXT', 'cAGR', 'cNEU']
for col in trait_cols:
    mean_threshold = df[col].mean()
    df[f'{col}_class'] = np.where(df[col] >= mean_threshold, 1, 0)
    # print(df[f'{col}_class'].value_counts())


In [7]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
keywords_to_remove = {'user', 'hashtag', 'httpurl', 'url', 'http', 'https'}  
punctuation_table = str.maketrans('', '', string.punctuation)
def clean_text(text):
    text = text.lower().translate(punctuation_table)
    words = [
        word for word in text.split()
        if not word.isdigit() and len(word) >= 5
    ]
    words = [contractions.fix(word) for word in words]  # Optional: Only if needed
    words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word not in stop_words and word not in keywords_to_remove
    ]
    return ' '.join(words)

In [8]:
# agr_df = pd.read_csv('alcohol_agr.csv')
agr_df = df[df['cAGR_class'] == 1]
agr_df = agr_df.drop_duplicates(subset=['text'])
agr_df['text'] = agr_df['text'].apply(clean_text)
agr_df = agr_df[agr_df['text'].apply(lambda x: len(x.split()) > 5)]
agr_df.shape

(249973, 27)

In [9]:
agr_texts = agr_df['text'].tolist()

In [15]:
# neu_df = pd.read_csv('alcohol/alcohol_neu.csv')
neu_df = df[df['cNEU_class'] == 1]
neu_df = neu_df.drop_duplicates(subset=['text'])
neu_df['text'] = neu_df['text'].apply(clean_text)
neu_df = neu_df[neu_df['text'].apply(lambda x: len(x.split()) > 5)]
print(neu_df.shape)
neu_texts = neu_df['text'].tolist()

(246155, 27)


In [16]:
# opn_df = pd.read_csv('alcohol/alcohol_opn.csv')
opn_df = df[df['cOPN_class'] == 1]
opn_df = opn_df.drop_duplicates(subset=['text'])
opn_df['text'] = opn_df['text'].apply(clean_text)
opn_df = opn_df[opn_df['text'].apply(lambda x: len(x.split()) > 5)]
print(opn_df.shape)
opn_texts = opn_df['text'].tolist()

(246697, 27)


In [17]:
# con_df = pd.read_csv('alcohol/alcohol_con.csv')
con_df = df[df['cCON_class'] == 1]
con_df = con_df.drop_duplicates(subset=['text'])
con_df['text'] = con_df['text'].apply(clean_text)
con_df = con_df[con_df['text'].apply(lambda x: len(x.split()) > 5)]
print(con_df.shape)
con_texts = con_df['text'].tolist()

(283930, 27)


In [18]:
# ext_df = pd.read_csv('alcohol/alcohol_ext.csv')
ext_df = df[df['cEXT_class'] == 1]
ext_df = ext_df.drop_duplicates(subset=['text'])
ext_df['text'] = ext_df['text'].apply(clean_text)
ext_df = ext_df[ext_df['text'].apply(lambda x: len(x.split()) > 5)]
print(ext_df.shape)
ext_texts = ext_df['text'].tolist()

(310245, 27)


In [None]:
from openai import OpenAI as OpenAIClient
from bertopic.representation import KeyBERTInspired, OpenAI, MaximalMarginalRelevance, PartOfSpeech
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  
def get_model(trait):
    # KeyBERT:
    keybert_model = KeyBERTInspired()

    # Part-of-Speech
    # pos_model = PartOfSpeech("en_core_web_sm")

    # MMR
    mmr_model = MaximalMarginalRelevance(diversity=0.3)

    # GPT-3.5
    prompt = f"""
    I have a topic that contains the following documents:
    [DOCUMENTS]
    The topic is described by the following keywords: [KEYWORDS]

    Based on the information above, extract a short but highly descriptive topic label of at most 5 words that reflects traits of {trait}. Make sure it is in the following format:
    topic: <topic label>
    """

    client = OpenAIClient(api_key="")
    openai_model = OpenAI(client, model = "gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt, delay_in_seconds=2,)

    representation_model = {
        "KeyBERT": keybert_model,
        "OpenAI": openai_model,  
        "MMR": mmr_model
        # "POS": pos_model
    }

    from bertopic import BERTopic
    from sentence_transformers import SentenceTransformer
    from umap import UMAP
    from hdbscan import HDBSCAN
    from sklearn.feature_extraction.text import CountVectorizer
    from nltk.corpus import stopwords

    stopwords = stopwords.words('english') + ['user', 'hashtag', 'httpurl', 'url', 'http', 'https']
    # embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")  # GPU

    umap_model = UMAP(n_neighbors=30, n_components=3, min_dist=0.1, metric="cosine")
    hdbscan_model = HDBSCAN(min_cluster_size=50, min_samples=20, metric="euclidean")
    # representation_model = KeyBERTInspired()

    model = BERTopic(
        nr_topics=10,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        embedding_model=embedding_model,
        vectorizer_model=CountVectorizer(stop_words=stopwords, ngram_range=(1, 2)),
        representation_model=representation_model,
        calculate_probabilities=False,    # Disable if not needed
        verbose=True
    )
    return model

In [19]:
neu_model = get_model("Neuroticism")
# agr_model = get_model("Aggreableness")
opn_model = get_model("Openness")
ext_model = get_model("Extraversion")
con_model = get_model("Conscientiousness")

In [12]:
# Precompute embeddings for speed
embeddings = embedding_model.encode(agr_texts, show_progress_bar=True)
# np.save('emb/agr_embeddings.npy', embeddings)

Batches:   0%|          | 0/7812 [00:00<?, ?it/s]

In [13]:
# embs = np.load('emb/agr_embeddings.npy')
topics, _ = agr_model.fit_transform(agr_texts, embeddings)
agr_topics = agr_model.get_topic_info()
agr_topics

2025-06-03 17:54:17,667 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


2025-06-03 17:58:15,029 - BERTopic - Dimensionality - Completed ✓
2025-06-03 17:58:15,034 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,OpenAI,MMR,Representative_Docs
0,-1,136175,-1_drink_smoke_alcohol_drunk,"[drink, smoke, alcohol, drunk, would, people, ...","[drink, drinking, alcohol, drink water, drunk,...",[Friendly Beverage Choices],"[drink, alcohol, drunk, would, water, drinking...","[anything heart condition drink energy drink, ..."
1,0,99238,0_drink_smoke_drunk_alcohol,"[drink, smoke, drunk, alcohol, would, coffee, ...","[drink, drink coffee, drink alcohol, drinking,...",[Social drinking habits],"[drink, would, coffee, drinking, cocaine, covi...","[drink coffee remember sleep tonight anyway, r..."
2,1,6852,1_maine_trump_pelosi_biden,"[maine, trump, pelosi, biden, nobody, nobody n...","[nobody nobody, nobody, people, donald trump, ...","[Nobody protests peacefully, agitated.]","[maine, pelosi, biden, nobody nobody, obama, b...",[nobody nobody nobody nobody nobody nobody nob...
3,2,2894,2_upgrade_dolphin_arcade_dolphin dolphin,"[upgrade, dolphin, arcade, dolphin dolphin, sa...","[upgrade, major upgrade, upgrade upgrade, mass...",[Friendly Dolphin Upgrades],"[upgrade, dolphin dolphin, luigis, luigi, tick...","[great upgrade though really upgrade would, de..."
4,3,2278,3_gujarat_liquidation_please_company,"[gujarat, liquidation, please, company, reviva...","[liquidation gujarat, gujarat liquidation, com...",[Friendly Gujarat Company Revival],"[liquidation, liquidation contract, gujarat li...",[please revival gujarat liquidation employee f...
5,4,1191,4_eligible_school_student_teacher,"[eligible, school, student, teacher, undergrad...","[student eligible, eligible student, school el...",[Eligible student grant opportunities],"[undergrad, stimulus, income, eligible voter, ...",[normally receive stimulus check still eligibl...
6,5,558,5_normalize_frequency_energy_vibration,"[normalize, frequency, energy, vibration, norm...","[normalize normalize, normalize, normalizing, ...",[Harmonious Vibration Energy],"[energy, vibrational, people normalize, normal...",[normalize normalize normalize normalize norma...
7,6,442,6_aston_aston martin_martin_climate,"[aston, aston martin, martin, climate, thunder...","[aston martin, martin aston, aston, martin, po...",[Warm Aston Martin Weather],"[aston martin, climate, thunderstorm, sequeste...",[aston martin stock astoria aston martin deale...
8,7,292,7_penis_cramp_period_penis penis,"[penis, cramp, period, penis penis, tooth, ank...","[penis penis, morning penis, penis, increase p...",[Gentle Pain Relief],"[cramp, period, tooth, period cramp, increase ...",[penis morning penis penis supper penis bagel ...
9,8,53,8_ground ground_ground_ground holy_holy,"[ground ground, ground, ground holy, holy, lea...","[ground ground, ground, ground belongs, ground...",[Ground Harmony Traits],"[ground ground, ground holy, leave ground, veg...","[ground ground ground ground ground ground, gr..."


In [14]:
agr_topics.to_csv('emb/AGR_topic_10_modified_prompt.csv')


In [20]:
# Precompute embeddings for speed
neu_embeddings = embedding_model.encode(neu_texts, show_progress_bar=True)
topics, _ = neu_model.fit_transform(neu_texts, neu_embeddings)
neu_topics = neu_model.get_topic_info()
neu_topics.to_csv('emb/NEU_topic_10_modified_prompt.csv')

Batches:   0%|          | 0/7693 [00:00<?, ?it/s]

2025-06-03 18:25:53,344 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-03 18:29:06,925 - BERTopic - Dimensionality - Completed ✓
2025-06-03 18:29:06,932 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

In [21]:
# Precompute embeddings for speed
opn_embeddings = embedding_model.encode(opn_texts, show_progress_bar=True)
topics, _ = opn_model.fit_transform(opn_texts, opn_embeddings)
opn_topics = opn_model.get_topic_info()
opn_topics.to_csv('emb/OPN_topic_10_modified_prompt.csv')
opn_topics

Batches:   0%|          | 0/7710 [00:00<?, ?it/s]

2025-06-03 18:45:26,461 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-03 18:49:09,752 - BERTopic - Dimensionality - Completed ✓
2025-06-03 18:49:09,759 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,OpenAI,MMR,Representative_Docs
0,-1,134265,-1_drink_alcohol_drunk_people,"[drink, alcohol, drunk, people, smoke, would, ...","[drink, drink alcohol, drinking, alcohol, drun...",[Openness to Nightlife Enjoyment],"[drink, alcohol, drunk, people, smoke, night, ...",[think alcohol inside covid really tolerance a...
1,0,92386,0_drink_smoke_drunk_people,"[drink, smoke, drunk, people, alcohol, cigaret...","[drink, drinking, alcohol, tequila, drunk, cig...",[Openness to Vices],"[people, liquor, coffee, smoking, drinking, to...",[female friend wrong drunk drink friend drink ...
2,1,8246,1_protester_maine_trump_pelosi,"[protester, maine, trump, pelosi, voter, innoc...","[maine, susan collins, nancy pelosi, pelosi, s...",[Maine Political Landscape],"[protester, maine, pelosi, innocence, biden, p...",[susan collins maine january democratic senato...
3,2,3100,2_eligible_homeless_school_health,"[eligible, homeless, school, health, student, ...","[eligible apply, eligibility, eligible, inelig...",[Eligibility for Discretionary Grants],"[homeless, health, student, eligibility, incom...","[happen school parent eligible school have…, r..."
4,3,2898,3_upgrade_gujarat_cancel_downgrade,"[upgrade, gujarat, cancel, downgrade, liquidat...","[liquidation gujarat, gujarat liquidation, com...",[Embracing Gujarat Revival Efforts],"[upgrade, cancel, company, liquidation contrac...",[please liquidation gujarat employee family li...
5,4,2626,4_arcade_worcester_ticket_aston,"[arcade, worcester, ticket, aston, luigis, gar...","[train worcester, worcester train, worcester, ...",[innovative train service improvement],"[worcester, aston, apple arcade, luigis mansio...",[worcester train worcester auburndale minute b...
6,5,1963,5_mercury_retrograde_mercury retrograde_freddie,"[mercury, retrograde, mercury retrograde, fred...","[mercury retrograde, retrograde mercury, mercu...",[Openness in Mercury Retrograde],"[mercury, mercury retrograde, freddie mercury,...",[little reminder astrogrammar mercury retrogra...
7,6,1077,6_temperature_humidity hourly_hourly pressure_...,"[temperature, humidity hourly, hourly pressure...","[hourly pressure, humidity hourly, humidity, p...",[Openness to Climate Patterns],"[humidity hourly, hourly pressure, humidity, g...",[temperature 111°c humidity hourly pressure sl...
8,7,74,7_normalize_normalize normalize_normalize whit...,"[normalize, normalize normalize, normalize whi...","[normalize normalize, normalize, normalize alo...",[Embracing Diverse Normalize Therapies],"[normalize white, therapy normalize, serve nor...",[normalize normalize normalize normalize norma...
9,8,62,8_nobody nobody_nobody_somebody_nobody absolutely,"[nobody nobody, nobody, somebody, nobody absol...","[nobody nobody, nobody nobodyyyyy, nobody some...",[Embracing Openness Fully],"[nobody nobody, twice nobody, dollar nobody, m...","[nobody nobody nobody nobody nobody nobody, no..."


In [22]:
# Precompute embeddings for speed
con_embeddings = embedding_model.encode(con_texts, show_progress_bar=True)
topics, _ = con_model.fit_transform(con_texts, con_embeddings)
con_topics = con_model.get_topic_info()
con_topics.to_csv('emb/CON_topic_10_modified_prompt.csv')
con_topics

Batches:   0%|          | 0/8873 [00:00<?, ?it/s]

2025-06-03 19:23:40,169 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-03 19:27:29,598 - BERTopic - Dimensionality - Completed ✓
2025-06-03 19:27:29,604 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,OpenAI,MMR,Representative_Docs
0,-1,149860,-1_drink_drunk_people_alcohol,"[drink, drunk, people, alcohol, smoke, would, ...","[drink, drinking, alcohol, alcoholic, drunk, l...",[Responsible Drinking Habits],"[drink, drunk, alcohol, smoke, night, trump, l...","[watch movie would drink instead dumbass…, nor..."
1,0,124349,0_drink_smoke_drunk_people,"[drink, smoke, drunk, people, cigarette, would...","[drink, alcohol, drinking, liquor, drunk, smok...",[Responsible alcohol consumption habits],"[drink, cigarette, would, think, covid19, trum...",[republican people drink drive arrest drunk dr...
2,1,3641,1_mercury_arcade_freddie_retrograde,"[mercury, arcade, freddie, retrograde, freddie...","[freddie mercury, mercury mercury, retrograde ...",[Mercury Retrograde Frequency Study],"[freddie mercury, mercury retrograde, luigis, ...",[thread freddie mercury voice crown freddie me...
3,2,1398,2_homeless_library_eligible_school,"[homeless, library, eligible, school, teacher,...","[people homeless, homeless people, homeless, h...",[Homeless Advocacy Efforts],"[homeless, eligible, teacher, public library, ...",[homeless people maybe morning people sleep st...
4,3,1316,3_gujarat_liquidation_contract_liquidation con...,"[gujarat, liquidation, contract, liquidation c...","[greensquare bybit, contract greensquare, liqu...",[Contract Liquidation Assessment],"[contract, liquidation contract, market, short...",[greensquare bybit btcusd short liquidation co...
5,4,1103,4_upgrade_downgrade_iphone_rating,"[upgrade, downgrade, iphone, rating, moody, wo...","[upgrade, upgraded, really upgrade, upgrade up...",[Consistent Technology Upgrades],"[upgrade, downgrade, iphone, moody downgrade, ...","[chrome thing maybe browser upgrade upgrade, u..."
6,5,969,5_pressure_temperature_blood pressure_humidity,"[pressure, temperature, blood pressure, humidi...","[hourly pressure, humidity hourly, pressure te...",[Climate Change Impact Analysis],"[blood pressure, hourly pressure, humidity hou...",[temperature 183°c humidity hourly pressure sl...
7,6,850,6_maine_collins_susan_susan collins,"[maine, collins, susan, susan collins, maine m...","[maine susan, collins maine, senator maine, tr...",[Maine Senator Susan Collins],"[susan collins, collins maine, maine susan, ma...",[susan collins maine january democratic senato...
8,7,353,7_nobody_nobody nobody_bible_jesus,"[nobody, nobody nobody, bible, jesus, psalm, b...","[nobody nobody, nobody, could nobody, nobody s...",[Conscientious Worshipper Attributes],"[nobody nobody, psalm, worship, eligible etern...",[nobody nobody nobody nobody nobody nobody nob...
9,8,91,8_normalize_normalize normalize_dealer_people ...,"[normalize, normalize normalize, dealer, peopl...","[normalize normalize, normalize, normalize alo...",[Organized and responsible mindset],"[normalize, people normalize, public normalize...",[normalize normalize normalize normalize norma...


In [23]:
# Precompute embeddings for speed
ext_embeddings = embedding_model.encode(ext_texts, show_progress_bar=True)
topics, _ = ext_model.fit_transform(ext_texts, ext_embeddings)
ext_topics = ext_model.get_topic_info()
ext_topics.to_csv('emb/EXT_topic_10_modified_prompt.csv')
ext_topics

Batches:   0%|          | 0/9696 [00:00<?, ?it/s]

2025-06-03 19:46:25,285 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-03 19:51:08,924 - BERTopic - Dimensionality - Completed ✓
2025-06-03 19:51:08,929 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,OpenAI,MMR,Representative_Docs
0,-1,150361,-1_drink_alcohol_drunk_people,"[drink, alcohol, drunk, people, would, think, ...","[drink alcohol, drink, drinking, alcohol, alco...",[Social Drinking Personality],"[drink, alcohol, drunk, bottle, liquor, drinki...",[suppose grown still people bottle drink think...
1,0,140837,0_smoke_drink_cigarette_alcohol,"[smoke, drink, cigarette, alcohol, people, dru...","[smoke cigarette, smoke, smoking, cigarette, t...",[Social Smoking Behavior],"[would, coffee, smoking, cocaine, drinking, to...",[remember progression smoke cigarette cigar sm...
2,1,13560,1_chinese_protester_trump_people,"[chinese, protester, trump, people, maine, inn...","[white people, black people, racism, white, tr...",[Outspoken Political Events],"[chinese, innocence, pelosi, voter, biden, oli...",[white people white people mitch right right p...
3,2,2571,2_upgrade_downgrade_school_teacher,"[upgrade, downgrade, school, teacher, student,...","[grade downgrade, rating downgrade, moody down...",[Sociable School Environment],"[upgrade, downgrade, student, undergrad, grade...",[moody downgrade india rating first patriot un...
4,3,1143,3_gujarat_cancel_liquidation_please,"[gujarat, cancel, liquidation, please, company...","[liquidation gujarat, gujarat liquidation, com...",[Sociable Gujarat Employees Revival],"[cancel culture, gujarat liquidation, revival ...",[large manufacture company gujarat please revi...
5,4,1064,4_frequency_climate_energy_thunderstorm,"[frequency, climate, energy, thunderstorm, gro...","[frequency vibration, vibration frequency, vib...",[Vibrational Frequency Harmony],"[climate, thunderstorm, ground ground, highvol...",[human meant vibrate frequency truth frequency...
6,5,303,5_barber_shave_blade_arcane blade,"[barber, shave, blade, arcane blade, beard, ar...","[arcane blade, cornac arcane, hedgewizard trol...",[sociable blade wielder],"[barber, shave, arcane blade, blade death, dea...",[avaragard fourth1 cornac arcane blade death f...
7,6,232,6_violin_racquet_tennis_fruit violin,"[violin, racquet, tennis, fruit violin, violin...","[fruit violin, violin happy, violin piano, cha...",[Energetic socialite violinist],"[fruit violin, violin happy, happy albert, cha...",[table chair fruit violin happy albert einstei...
8,7,118,7_normalize_normalize normalize_people normali...,"[normalize, normalize normalize, people normal...","[normalize normalize, normalizing normalize, n...",[Normalize Social Interaction],"[people normalize, public normalize, normalize...",[normalize normalize normalize normalize norma...
9,8,56,8_apple_apple apple_guess apple_apple thing,"[apple, apple apple, guess apple, apple thing,...","[apple apple, kawhi apple, serious apple, appl...",[Outgoing Apple Enthusiast],"[apple apple, guess apple, apple thing, kawhi ...","[kawhi apple though—so serious apple apple, ap..."


In [31]:
agr_topics = pd.read_csv('emb/AGR_topic_10_modified_prompt.csv')
agr_topics[["Topic",   "OpenAI", "Representation", "Count"]].to_csv('emb/agr1.csv')


In [67]:
# agr_topics[["Topic",  "Count", "OpenAI", "Representation"]].to_csv('alcohol/agr.csv')
# neu_topics[["Topic",  "Count", "OpenAI", "Representation"]].to_csv('alcohol/neu.csv')
# opn_topics[["Topic",  "Count", "OpenAI", "Representation"]].to_csv('alcohol/opn.csv')
# con_topics[["Topic",  "Count", "OpenAI", "Representation"]].to_csv('alcohol/con.csv')
ext_topics[["Topic",  "Count", "OpenAI"]].to_csv('alcohol/ext.csv')

# ext_topics[["Topic",  "Count", "OpenAI", "Representation"]].to_csv('alcohol/ext.csv')
