# Creating a glossary

In [None]:
# This script takes a list of n-grams (the non-obvious n-grams that appear in the images in this paper).
# For each n-gram, it finds an instance where the n-gram appeared in a bio in 2022.
# It then creates a glossary CSV file with three columns: Ngram, Definition, Example.
# Ngram holds the n-gram, Example holds the bio, and Definition must be filled in by the user.

In [95]:
import pandas as pd
import re
import random

# Opens csv file of bios and ids
# Returns pandas data frame
def load_data(filename):
    data = pd.read_csv(filename)
    assert data.shape[0] == 250000
    return data

# Takes token, data file as input
# Returns random bio with token
def get_bio_with_token(token, data):
    for i in range(data.shape[0]):
        bio = data.iloc[i, 1]
        if pd.isna(bio): bio = ""        
        bio = bio.casefold()
        tokens = re.split("[^a-zA-Z0-9/'`’-]", bio)
        if token in tokens:
            return data.iloc[i, 1] # Return original bio
    print(token)
    assert False
   
# Takes bigram, data file as input
# Returns random bio with bigram
def get_bio_with_bigram(bigram, data):
    bigram = bigram.casefold()
    bigrams = re.split("[^a-zA-Z0-9/'`’-]", bigram)
    assert len(bigrams) == 2
    
    for i in range(data.shape[0]):        
        bio = data.iloc[i, 1]
        if pd.isna(bio): bio = ""        
        bio = bio.casefold()
        tokens = re.split("[^a-zA-Z0-9/'`’-]", bio)
        
        for j in range(len(tokens) - 1):
            if tokens[j] == bigrams[0] and tokens[j+1] == bigrams[1]:
                return data.iloc[i, 1]
    print(bigrams[0], bigrams[1])
    assert False
    
# Takes trigram, data file as input
# Returns random bio with trigram
def get_bio_with_trigram(trigram, data):
    trigram = trigram.casefold()
    trigrams = re.split("[^a-zA-Z0-9/'`’-]", trigram)
    assert len(trigrams) == 3
    
    for i in range(data.shape[0]):        
        bio = data.iloc[i, 1]
        if pd.isna(bio): bio = ""        
        bio = bio.casefold()
        tokens = re.split("[^a-zA-Z0-9/'`’-]", bio)
        
        for j in range(len(tokens) - 2):
            if tokens[j] == trigrams[0] and tokens[j+1] == trigrams[1] and tokens[j+2] == trigrams[2]:
                return data.iloc[i, 1]
    print(trigrams[0], trigrams[1], trigrams[2])
    assert False

# Takes a list of tokens, bigrams, and trigrams, and a data frame containing bios as input
# Creates a df with the glossary skeleton
def create_df_from_tokens(tokens, bigrams, trigrams, bios):
    # Creating data frame
    df = pd.DataFrame(columns = ['Ngram', 'Definition', 'Example']) 
    
    # Adding tokens data to df
    # Not very efficient, but because the # of tokens being defined is <250, doesn't really matter
    for i in range(len(tokens)):
        this_bio = get_bio_with_token(tokens[i], data)
        df.loc[len(df.index)] = [tokens[i], '', this_bio] 
        
    # Adding bigrams data to df
    for i in range(len(bigrams)):
        this_bio = get_bio_with_bigram(bigrams[i], data)
        df.loc[len(df.index)] = [bigrams[i], '', this_bio] 
        
    # Adding trigrams data to df
    for i in range(len(trigrams)):
        this_bio = get_bio_with_trigram(trigrams[i], data)
        df.loc[len(df.index)] = [trigrams[i], '', this_bio]         
    
    # Writing output data frame to csv
    df = df.sort_values(by=['Ngram'], ascending = True)
    filename = '/REU/paper_data/glossary.csv'
    df.to_csv(filename, index = False)
    
    print("Created file:", filename)

In [17]:
# Test (disabled): the code below is wrapped in a bare triple-quoted string,
# so the cell only evaluates a string literal and nothing in it runs.
# Remove the surrounding quotes to run the pipeline on a handful of n-grams.
'''
data = load_data("/REU/random_250000_bios.csv")

tokens = ['chicago', 'toronto', 'bama', '7']
bigrams = ['roll tide', 'make america']
trigrams = ['black lives matter', 'do not interact']

create_df_from_tokens(tokens, bigrams, trigrams, data)
'''

'\ndata = load_data("/REU/random_250000_bios.csv")\n\ntokens = [\'chicago\', \'toronto\', \'bama\', \'7\']\nbigrams = [\'roll tide\', \'make america\']\ntrigrams = [\'black lives matter\', \'do not interact\']\n\ncreate_df_from_tokens(tokens, bigrams, trigrams, data)\n'

In [96]:
# Adding tokens from charts
# N-grams for the "rel_prev_20" chart: single tokens, bigrams, then trigrams.
rel_prev_20_tokens = [
    'ttrpgs', 'polyam', 'cis', '1312', 'ttrpg',
    'ao3', 'critter', 'pan', 'trans', 'actuallyautistic',
    'enby', 'que', 'para', 'por', 'located',
    'champions', 'patriot', 'estate', 'maga', 'providing',
]

rel_prev_20_bigrams = [
    'blm acab', 'icon by', 'header by',
    'are human', 'avatar by', 'not spoiler',
    'sometimes nsfw', 'spoiler free', 'age in',
    'rights are', 'pfp by', 'banner by',
    'http //ko-fi', 'minors dni', 'all american',
    'feed of', 'the leading', 'go brandon',
    'n de', '1st team', 'school football',
    'and surrounding', 'your source', '4 0',
    '1a 2a', 'america first', 'our mission',
]

rel_prev_20_trigrams = [
    'rights are human', 'are human rights', 'profile pic by',
    'into the void', 'all opinions my', 'all views my',
    'opinions my own', 'views my own', 'i like video',
    'like video games', 'is to help', '1st team all',
    'amante de la', 'back the blue', 'us on instagram',
    'do your own', 'source for all', 'mission is to',
    'your source for', 'page of the',
]

# N-grams for the "prev_scatterplot" chart; only a single token is listed.
prev_scatterplot_tokens = ['blm']
prev_scatterplot_bigrams, prev_scatterplot_trigrams = [], []

# N-grams for the "rel_prev_by_gender" chart, red/blue/yellow group.
rel_prev_by_gender_redblueyellow_tokens = [
    'bbw', 'bangtan', 'yoongi', 'bts', 'twt', 'ot7', 'tpwk',
    'liner', 'scbwi', 'multi', 'got7', 'mx', 'mia', 'cancer',
    'aquarius', 'spoonie', 'ao3', 'intersectional', 'acnh', 'witch', 'libra',
    'lover', '2d', 'nsfw', 'genshin', 'infp', 'lefty', 'demisexual',
    '1312', 'pan', 'icon', 'rpg', 'gaymer', 'furry', 'critter',
    'ttrpg', 'mtg', 'ttrpgs', 'cis', '//t', 'pup', 'halo',
    'dom', 'aws', 'boi', 'telegram', 'ranked', 'twink', 'ftm',
]

rel_prev_by_gender_redblueyellow_bigrams = [
    'for bts', 'bts army', 'bts twt',
    'harry styles', 'lives matter', 'are human',
    'trans rights', 'fighting games', 'd d',
]

rel_prev_by_gender_redblueyellow_trigrams = []

# N-grams for the "rel_prev_by_gender" chart, green group.
rel_prev_by_gender_green_tokens = [
    'ig', 'pero', 'the', 'mi', 'vida', 'nft', 'que',
    'nfl', 'lakers', 'brandon', 'dl', 'maga', 'stock', 'braves',
]

rel_prev_by_gender_green_bigrams = [
    '/ her', 'a call', '0 0', 'c a', 's s', 'd a',
    'i l', 'it happen', '4 13', 'de los', 'lo que',
    'an independent', 'not financial', 'coach -', 'track and', 'a mi', 'o e',
]

rel_prev_by_gender_green_trigrams = [
    'do your own',
    '1 of 1',
    'quality of life',
    'r i p',
]

In [98]:
# Creating glossary
# Pool the per-chart lists; going through a set drops n-grams that are listed
# for more than one chart.
tokens = list(set().union(
    rel_prev_20_tokens,
    prev_scatterplot_tokens,
    rel_prev_by_gender_redblueyellow_tokens,
    rel_prev_by_gender_green_tokens,
))
bigrams = list(set().union(
    rel_prev_20_bigrams,
    prev_scatterplot_bigrams,
    rel_prev_by_gender_redblueyellow_bigrams,
    rel_prev_by_gender_green_bigrams,
))
trigrams = list(set().union(
    rel_prev_20_trigrams,
    prev_scatterplot_trigrams,
    rel_prev_by_gender_redblueyellow_trigrams,
    rel_prev_by_gender_green_trigrams,
))

# Load the bios, then look up an example for every n-gram and write the CSV.
data = load_data("/REU/random_250000_bios.csv")

create_df_from_tokens(tokens, bigrams, trigrams, data)

Created file: /REU/paper_data/glossary.csv
