In [55]:
import pandas as pd
import json
import torch
from datasets import load_dataset
import random

import csv
import requests
import urllib
from statistics import mean
import ast
import datetime

import nltk
from nltk.corpus import wordnet as wn


# Load and save MMLU dataset

More details can be found at https://huggingface.co/datasets/cais/mmlu 

In [24]:
cache_dir='/shared/3/cache/huggingface'
mmlu_subset_lst = ['abstract_algebra']
dataset_path = "../data/example_dataset.csv"

In [28]:
def process_mmlu_data(mmlu_subset_lst, cache_dir, dataset_path):
    mmlu_datasets = []
    for task in mmlu_subset_lst:
        subset = load_dataset("cais/mmlu", task, cache_dir=cache_dir)['test']
        mmlu_datasets.extend(subset)

    mmlu_datasets_df = pd.DataFrame(mmlu_datasets)
    options = mmlu_datasets_df['choices'].apply(pd.Series)
    options = options.rename(columns={0: 'option1', 1: 'option2', 2: 'option3', 3: 'option4'})

    dataset_df = pd.concat([mmlu_datasets_df.drop('choices', axis=1), options], axis=1)
    dataset_df['answer'] = dataset_df['answer'] + 1
    dataset_df.rename(columns={'subject': 'dataset'}, inplace=True)

    dataset_df.to_csv(dataset_path)

    return dataset_df

In [29]:
dataset_df = process_mmlu_data(mmlu_subset_lst, cache_dir, dataset_path)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [30]:
dataset_df.head()

Unnamed: 0,question,dataset,answer,option1,option2,option3,option4
0,Find the degree for the given field extension ...,abstract_algebra,2,0,4,2,6
1,"Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the i...",abstract_algebra,3,8,2,24,120
2,Find all zeros in the indicated finite field o...,abstract_algebra,4,0,1,01,04
3,Statement 1 | A factor group of a non-Abelian ...,abstract_algebra,2,"True, True","False, False","True, False","False, True"
4,Find the product of the given polynomials in t...,abstract_algebra,2,2x^2 + 5,6x^2 + 4x + 6,0,x^2 + 1


# Prepare social roles

A comprehensive list of occupations: https://www.pnas.org/syndication/doi/10.1073/pnas.1720347115#supplementary-materials (Appendix A.3)

## Construct the list of social roles of interest

In [31]:
occu_str = "janitor, statistician, midwife, bailiff, auctioneer, photographer, geologist, shoemaker, athlete, cashier, dancer, housekeeper, accountant, physicist, gardener, dentist, weaver, blacksmith, psychologist, supervisor, mathematician, surveyor, tailor, designer, economist, mechanic, laborer, postmaster, broker, chemist, librarian, attendant, clerical, musician, porter, scientist, carpenter, sailor, instructor, sheriff, pilot, inspector, mason,baker, administrator, architect, collector, operator, surgeon, driver, painter, conductor, nurse, cook, engineer,retired, sales, lawyer, clergy, physician, farmer, clerk, manager, guard, artist, smith, official, police, doctor,professor, student, judge, teacher, author, secretary, soldier"

prof_occu_str = "statistician, auctioneer, photographer, geologist, accountant, physicist, dentist, psychologist, supervisor, mathematician, designer, economist, postmaster, broker, chemist, librarian, scientist, instructor,pilot, administrator, architect, surgeon, nurse, engineer, lawyer, physician, manager, official, doctor, professor,student, judge, teacher, author"

In [35]:
occu_lst = occu_str.replace(",", " ").split()
prof_occu_lst = prof_occu_str.replace(",", " ").split()

In [33]:
mmlu_occu_dic = {'natural science':['biologist', 'geneticist', 'ecologist', 'physicist', 'chemist', 'research scientist'],
                'eecs':['software engineer', 'data scientist', 'web developer', 'electrical engineer', 'electronics technician'],
                'math':['mathematician', 'statistician', 'data analyst'],
                'econ':['economist', 'financial analyst', 'economic researcher'],
                'history':['historian', 'archivist', 'historical researcher', 'archaeologist'],
                'politics':['politician', 'governer', 'republican', 'democrat'],
                'law':['lawyer'],
                'psychology':['pcychologist'],
                'religion':['christian', 'muslim', 'jew', 'enthusiast']}

mmlu_rel_str = ', '.join([', '.join(values) for values in mmlu_occu_dic.values()])
mmlu_rel_str

'biologist, geneticist, ecologist, physicist, chemist, research scientist, software engineer, data scientist, web developer, electrical engineer, electronics technician, mathematician, statistician, data analyst, economist, financial analyst, economic researcher, historian, archivist, historical researcher, archaeologist, politician, governer, republican, democrat, lawyer, pcychologist, christian, muslim, jew, enthusiast'

In [34]:
# mmlu_rel2type = {}
# for key, values in mmlu_occu_dic.items():
#     for rel in values:
#         mmlu_rel2type[rel] = key

## Get Google n-gram frequency

In [46]:
def runQuery(query, start_year=2018, 
             end_year=2019, corpus=26,
             smoothing=0):
  
    # converting a regular string to the standard URL format 
    # eg: "geeks for,geeks" will convert to "geeks%20for%2Cgeeks"
    query = urllib.parse.quote(query)
  
    # creating the URL
    url = 'https://books.google.com/ngrams/json?content=' + query + '&year_start=' + str(start_year) + '&year_end=' + str(end_year) + '&corpus=' + str(corpus) + '&smoothing=' + str(smoothing) + ''
  
    # requesting data from the above url
    response = requests.get(url)
  
    # extracting the json data from the response we got
    output = response.json()
  
    # creating a list to store the ngram data
    return_data = []
  
    if len(output) == 0:
        # if no data returned from site,
        # print the following statement
        return "No data available for this Ngram."
    else:
        # if data returned from site,
        # store the data in return_data list
        for num in range(len(output)):
            
              # getting the name
            return_data.append((output[num]['ngram'],          
                                # getting ngram data
                                output[num]['timeseries']) 
                               )
        ngram_freq_dic = {key: mean(values) for key, values in return_data}

    
    return ngram_freq_dic

In [38]:
runQuery('researcher')

[('researcher', [1.1649870430119336e-05, 1.2804611287720036e-05])]

In [47]:
role_str = "researcher, sister"
ngram_freq_dic = runQuery(role_str)
ngram_freq_dic

{'researcher': 1.2227240858919686e-05, 'sister': 7.55954060878139e-05}

In [48]:
ngram_freq_df = pd.DataFrame(ngram_freq_dic.items(), columns=['role', 'frequency'])

In [51]:
ngram_freq_df

Unnamed: 0,role,frequency
0,researcher,1.2e-05
1,sister,7.6e-05


In [50]:
ngram_freq_df.to_csv("../data/example_social_roles.csv", index=False)

## Optional: Extend the role list using WordNet

In [53]:
def get_hyponyms(words, expected_num_hyponyms):
    result = {}
    for word in words:
        unique_hyponyms = set()
        synsets = wn.synsets(word)
        for synset in synsets:
            # Get hyponyms
            for hyponym in synset.hyponyms():
                unique_hyponyms.update([lemma.name() for lemma in hyponym.lemmas()])
                if len(unique_hyponyms) >= expected_num_hyponyms:
                    break
            if len(unique_hyponyms) >= expected_num_hyponyms:
                break

        result[word] = list(unique_hyponyms)[:expected_num_hyponyms]
    return result

In [59]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /home/elisazmq/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/elisazmq/nltk_data...


True

In [60]:
'''EXAMPLE'''
words = ['mother', 'father']
num = 5
hypo = get_hyponyms(words, num)
hypo

{'mother': ['mama', 'mum', 'ma', 'mommy', 'momma'],
 'father': ['pappa', 'pop', 'papa', 'pa', 'dada']}