In [8]:
import os

import openai
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech

# Build the BERTopic representation models used to label topics and collect
# them in `representation_model`, keyed by the name each one is reported under.

# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech (disabled; re-enable together with the "POS" entry below)
# pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# GPT-3.5
# The API key is taken from the OPENAI_API_KEY environment variable so that no
# secret is ever stored in the notebook; it falls back to the empty string,
# which is the value that was previously hard-coded here.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# Prompt template handed to the OpenAI representation; it contains the
# [DOCUMENTS] and [KEYWORDS] placeholders and asks for a "topic: <label>" reply.
prompt = """
I have a topic that contains the following documents: 
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
openai_model = OpenAI(model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)

# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "OpenAI": openai_model,  # Comment out if you will not use OpenAI
    "MMR": mmr_model,
    # "POS": pos_model
}


  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()
2024-04-29 10:35:37.557849: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-29 10:35:37.953438: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-29 10:35:37.953482: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-29 10:35:37.953512: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cu

In [2]:
import pandas as pd

from ethnicolr import census_ln, pred_census_ln

# Surnames used as sample input for the ethnicolr helpers.
# Note that 'zhang' appears twice in the list.
surnames = [
    'smith', 'zhang', 'jackson', 'maharjan', 'rai',
    'lopez', 'shakya', 'jin', 'bamiwo', 'khan',
    'abmayomi', 'stepanov', 'olaya', 'lujan', 'kang',
    'singh', 'valdez', 'rincon', 'zhang', 'zadow',
]

# One record per surname, each a dict with a single 'name' key.
names = [{'name': surname} for surname in surnames]

# Single-column frame ('name') with the default integer index.
df = pd.DataFrame(names)

df

2024-06-06 09:10:52.733558: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-06 09:10:53.131210: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-06 09:10:53.131249: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-06 09:10:53.131275: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-06 09:10:53.210753: I tensorflow/core/platform/cpu_feature_g

Unnamed: 0,name
0,smith
1,zhang
2,jackson
3,maharjan
4,rai
5,lopez
6,shakya
7,jin
8,bamiwo
9,khan


In [4]:
# Display the result of ethnicolr's census_ln for the 'name' column.
# The rendered output shows pctwhite/pctblack/pctapi/pctaian/pct2prace/
# pcthispanic columns next to each name, left blank for some names
# (e.g. maharjan, bamiwo) and containing "(S)" markers for others (shakya).
# The return value is only displayed; it is not assigned to any name.
census_ln(df, 'name')

Unnamed: 0,name,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
0,smith,73.35,22.22,0.4,0.85,1.63,1.56
1,zhang,0.61,0.09,98.16,0.02,0.96,0.16
2,jackson,41.93,53.02,0.31,1.04,2.18,1.53
3,maharjan,,,,,,
4,rai,5.41,1.38,81.77,0.38,9.5,1.57
5,lopez,5.85,0.61,1.04,0.47,0.52,91.51
6,shakya,(S),4.90,84.31,(S),5.88,0.00
7,jin,1.54,0.27,96.39,0.25,1.2,0.35
8,bamiwo,,,,,,
9,khan,7.92,3.26,70.5,0.47,15.63,2.22


In [5]:
# Rebind df to the return value of ethnicolr's pred_census_ln for the 'name'
# column. In the outputs that follow, this frame shows title-cased names plus
# api/black/hispanic/white score columns and a 'race' label column.
# NOTE(review): df is overwritten, so re-running this cell feeds the already
# augmented frame back into pred_census_ln — confirm that is harmless.
df = pred_census_ln(df, 'name')

2024-06-06 09:11:05.574027: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2211] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [6]:
# Show only the name and its predicted 'race' label, hiding the score columns.
df.loc[:, ['name', 'race']]

Unnamed: 0,name,race
0,Smith,white
1,Zhang,api
2,Jackson,black
3,Maharjan,api
4,Rai,api
5,Lopez,hispanic
6,Shakya,api
7,Jin,api
8,Bamiwo,white
9,Khan,api


In [7]:
# Display the whole frame, including the per-category score columns
# (api, black, hispanic, white) alongside the 'race' label.
df

Unnamed: 0,name,api,black,hispanic,white,race
0,Smith,0.008322,0.215059,0.029183,0.747435,white
1,Zhang,0.995437,0.000715,0.001243,0.002605,api
2,Jackson,0.00456,0.537857,0.018684,0.438898,black
3,Maharjan,0.749417,0.098781,0.018045,0.133758,api
4,Rai,0.76556,0.046871,0.024481,0.163089,api
5,Lopez,0.00851,0.005643,0.936194,0.049653,hispanic
6,Shakya,0.9903,0.001654,0.001827,0.006219,api
7,Jin,0.979642,0.002589,0.001698,0.01607,api
8,Bamiwo,0.200777,0.231276,0.049048,0.5189,white
9,Khan,0.864759,0.056773,0.007339,0.071129,api


In [8]:
# Load a small preview of one day's user-level tweet export and drop helper
# columns that are not needed downstream.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory.
USER_CSV_PATH = '/data2/julina/scripts/tweets/2020/01/user_csv/2020_01_01.csv'
PREVIEW_ROWS = 100

# Parse only the first PREVIEW_ROWS rows instead of reading the entire file
# and slicing afterwards; column dtypes are therefore inferred from these
# rows only.
df = pd.read_csv(USER_CSV_PATH, nrows=PREVIEW_ROWS)

# str.match anchors at the start of the string, so this removes every column
# whose name begins with two spaces, 'label' or 'text_y'.
# NOTE(review): the displayed frame still contains 'Unnamed: 0' columns, so
# the two-space pattern may not be matching what was intended — confirm.
unwanted_columns = df.columns.str.match(r'  |label|text_y')
df = df.loc[:, ~unwanted_columns]


In [9]:
# Display the tweet/user preview frame produced by the previous cell
# (the notebook renders it truncated, with '...' for the middle rows).
df

Unnamed: 0.1,Unnamed: 0,id,created_at,text,user_id,name,screen_name,description
0,0,1212519876146810880,Wed Jan 01 23:44:00 +0000 2020,USER not well eat soggy,956914320188743680,hyeyoon.,h_yeyoon,ㅤㅤㅤ𝐒𝐭𝐫𝐢𝐜𝐭𝐥𝐲 Eng. 𝟏𝟗𝟗𝟔. ♡₊· ͟͟͞͞➳ A @9OR6EOUS a...
1,1,1212519876146823168,Wed Jan 01 23:44:00 +0000 2020,USER son stay argue fortnite telling people dr...,3329553669,Jordann💜,yannojordannn,amosc~Jordan4days14
2,2,1212519876142735360,Wed Jan 01 23:44:00 +0000 2020,bot need racist ahegao,1153773923265339393,Zayne's Shitposting Machine,realZayne64,From Bot - reply to me to talk to me i am lone...
3,3,1212519876134408193,Wed Jan 01 23:44:00 +0000 2020,go sit fact set straight,1015424689911554048,dee 🌎,snoobicore,luv4eva nd eva 8-0 🚦 ... blueming
4,4,1212519876167917569,Wed Jan 01 23:44:00 +0000 2020,shit kill trippin acic,2282899404,Dgirl,FrenchhToastedd,Cash app: $Lynette1224
...,...,...,...,...,...,...,...,...
95,95,1212519897143508992,Wed Jan 01 23:44:05 +0000 2020,USER jyp say look report twice momo super juni...,893449510675816449,‘유한원’ 🦈,8wyttyw8,NCT | SKZ | SVT | WAYV — feat. YuTae DoTen Lee...
96,96,1212519897110138882,Wed Jan 01 23:44:05 +0000 2020,USER one last ride gator nation HTTPURL,3298820482,Austin Alderson,AustinAlderson_,21 years young chasing dreams!
97,97,1212519897122648064,Wed Jan 01 23:44:05 +0000 2020,USER hoodies 25 week,706316299115180032,Cristian🎄❄️,cristiann_57,
98,98,1212519897110138880,Wed Jan 01 23:44:05 +0000 2020,USER 𝐅𝐑𝐄𝐄 𝐆𝐀𝐌𝐄 whitemediumstar HTTPURL 𝐅𝐑𝐄𝐄 𝐆𝐀...,1160878484228628481,kendrick gonzalez,kendric17580618,yo


In [10]:
# Rebind df to the return value of ethnicolr's pred_census_ln for the 'name'
# column of the tweet preview frame.
# NOTE(review): here 'name' holds Twitter display names (emoji, non-Latin
# script, multi-word strings) rather than bare surnames — confirm that the
# predictions are meaningful for such input.
# NOTE(review): the result displayed afterwards ends at index 97, which
# suggests some rows are dropped or the index is reset — verify.
df = pred_census_ln(df, 'name')

In [11]:
# Show each display name next to its predicted 'race' label only.
df.loc[:, ['name', 'race']]

Unnamed: 0,name,race
0,Hyeyoon.,white
1,Jordann💜,white
2,Zayne'S Shitposting Machine,white
3,Dee 🌎,white
4,Dgirl,white
...,...,...
94,‘유한원’ 🦈,api
95,Austin Alderson,white
96,Cristian🎄❄️,hispanic
97,Kendrick Gonzalez,hispanic


In [12]:
# Load the 2020_emo.csv file into a separate frame (dff) and display its
# (rows, columns) shape; the recorded output is (3502171, 19).
# NOTE(review): absolute, machine-specific path — consider a configurable
# data directory.
dff = pd.read_csv('/data2/julina/scripts/tweets/cleaned_data_by_year/2020_emo.csv')
dff.shape

(3502171, 19)

In [14]:
# Peek at 10 random rows of the 2020 frame. A fixed random_state makes the
# displayed rows identical on every Restart & Run All instead of changing
# with each execution.
dff.sample(10, random_state=42)

Unnamed: 0.1,Unnamed: 0,id,created_at,text,user_id,name,screen_name,description,DrugAbuse,lang,gender,age,org,sentiment,sent_score,date,is_teenage,drug_type,emotion
2315061,2315061,1300185514784497664,Sun Aug 30 21:35:57 +0000 2020,USER come get shot HTTPURL,1216370823524704256,KayKay💙,UniqueKayyKayy,Phenix City✈️ Atlanta 📍💙💰 PrettyGirl💅🏾 SelfLov...,1,en,female,19-29,non-org,neu,0.0,2020-08-01,0,[],[]
3261710,3261710,1337605790656589826,Sat Dec 12 03:50:46 +0000 2020,USER one day replace alcohol work go bitch lou...,1505820564,🍒,alethiaslay,👸🏼IG: Alethiaslay // 👻: alethiahk,1,en,female,19-29,non-org,neg,-0.5859,2020-12-01,0,[],"['anger', 'disgust']"
268264,268264,1220409005467668482,Thu Jan 23 18:12:35 +0000 2020,USER buharis incompetence smoke pervade every ...,250132587,UNiberibelizED,nnambudike,Igbo. Nigerian. African. Thinker. Zikist,1,en,male,>=40,non-org,neg,-0.4939,2020-01-01,0,['cannabinoids'],"['anger', 'disgust']"
610589,610589,1236527736715427840,Sun Mar 08 05:42:40 +0000 2020,girl definitely get cte,369840682,Marty McFly🅱️oi,Wet_and_wild69,did you just make a joke Bobby?,1,en,male,<=18,non-org,pos,0.4019,2020-03-01,1,[],"['joy', 'optimism']"
1117511,1117511,1264278204090478593,Sat May 23 19:33:07 +0000 2020,USER USER USER lumpy look need change diaper t...,336271213,Gloria Brown,usgloria,,1,en,female,>=40,non-org,neu,0.0,2020-05-01,0,['stimulants'],['disgust']
2833039,2833039,1319431380699435008,Fri Oct 23 00:12:09 +0000 2020,USER somebody tell smoke less HTTPURL,387307346,Kee™,LoftofLove,♫-Everything Happens For A Reason ♫ IG @kenyaa...,1,en,male,19-29,non-org,neu,0.0,2020-10-01,0,['cannabinoids'],[]
305002,305002,1225540879022252032,Thu Feb 06 22:04:49 +0000 2020,USER high sleep gtgtgtgt gon every time,1095923017849409536,CornieceK 🧚🏾‍♀️,Corniece2,AMOSC: Pxvches || AlcornStateU💜💛,1,en,female,19-29,non-org,neu,0.0,2020-02-01,0,[],[]
1461965,1461965,1273944283830091777,Fri Jun 19 11:42:40 +0000 2020,cmon lionshit hit hit,1252634684019707904,RJintaiwan,JintaiwanR,I love baseball around the world. Grew up as a...,1,en,male,<=18,non-org,neu,0.0,2020-06-01,1,[],['joy']
3347107,3347107,1334413883361128449,Thu Dec 03 08:27:16 +0000 2020,USER drink,1221660260869582848,apophyllite / celeste / aerie,iKissCatra,★✯✮ Doll Bop Xe It Vel Vam Mew Trick M...,1,en,female,<=18,non-org,neu,0.0,2020-12-01,1,[],[]
2498986,2498986,1306650451392167936,Thu Sep 17 17:45:18 +0000 2020,charlie kirk coke dealer come forward please,796591319971811328,pawg save america,kmpossible_,disgraced dirtbag leftist and currently listen...,1,en,female,<=18,non-org,pos,0.3182,2020-09-01,1,['stimulants'],['anticipation']
