In [1]:
import pandas
import nltk
from nltk.corpus import wordnet
import math

In [2]:
# Load the dataset from a pickle in the current working directory; the cells
# below use it as a DataFrame with a `token` column.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pandas.read_pickle("lcp_2021.pkl")

In [3]:
# Number of WordNet synsets returned for each token (after str() conversion).
df['synset_count'] = df['token'].apply(
    lambda tok: len(wordnet.synsets(str(tok)))
)

In [4]:
def get_wn_depth(token):
    """Return ``max_depth()`` of the first WordNet synset found for ``token``.

    ``token`` is converted with ``str()`` before the lookup. Only the first
    synset returned by WordNet is considered; when there are no synsets at
    all the result is 0.
    """
    matches = wordnet.synsets(str(token))
    if not matches:
        return 0
    first_sense = matches[0]
    return first_sense.max_depth()

# WordNet depth of each token's first synset (0 when WordNet has no entry).
df['wn_depth'] = df['token'].apply(get_wn_depth)

In [5]:
def get_hyponyms(token):
    """Count the direct hyponyms of the first WordNet synset for ``token``.

    ``token`` is converted with ``str()`` before the lookup. Only the first
    synset returned by WordNet is considered; when there are no synsets at
    all the result is 0.
    """
    matches = wordnet.synsets(str(token))
    if not matches:
        return 0
    first_sense = matches[0]
    return len(first_sense.hyponyms())

# Hyponym count of each token's first synset (0 when WordNet has no entry).
df['hyponym_count'] = df['token'].apply(get_hyponyms)

In [6]:
# To run this cell you need a copy of the Web1T unigrams vocab_cs file.
# Each line is parsed as "<token>\t<count>".

vocab_cs_path = ""  # set this to the location of vocab_cs before running

# Maps token -> integer count, keyed exactly as the token appears in the file.
web_1T = {}
# Decode as UTF-8 explicitly so the result does not depend on the platform's
# default locale encoding, and iterate the handle directly so the whole file
# is never held in memory as a list of lines.
with open(vocab_cs_path, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing with IndexError; malformed non-blank lines still raise.
            continue
        fields = line.split("\t")
        web_1T[fields[0]] = int(fields[1])

def get_web1t_freq(token, log=False):
    """Look up the Web1T count stored for ``token`` in ``web_1T``.

    Tokens that are not keys of ``web_1T`` count as 0. With ``log=True`` the
    natural logarithm of ``count + 1`` is returned instead of the raw count.
    """
    count = web_1T.get(token, 0)
    if log:
        return math.log(count + 1)
    return count


# Raw and log-scaled Web1T counts for every token.
df['web_1t'] = df['token'].apply(get_web1t_freq)
df['log_web_1t'] = df['token'].apply(lambda tok: get_web1t_freq(tok, log=True))

In [7]:
# To run this cell you need a copy of the file: SUBTLEXus74286wordstextversion.txt
# Each data line is parsed as tab-separated fields: field 0 is the lookup key
# and field 1 is an integer count.

subtlex_path = ""  # set this to the location of the SUBTLEX file before running

# Maps the first field of each data line -> the integer in its second field.
subtlex = {}
# NOTE(review): the encoding is left at the platform default because the
# file's encoding cannot be confirmed from here -- pin it once verified.
with open(subtlex_path, 'r') as f:
    # Skip the first line (presumably the column header) without building
    # a list of every line in the file first.
    next(f, None)
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing with IndexError; malformed non-blank lines still raise.
            continue
        fields = line.split("\t")
        subtlex[fields[0]] = int(fields[1])

def get_subtlex_freq(token, log=False):
    """Look up the SUBTLEX count stored for ``token`` in ``subtlex``.

    Tokens that are not keys of ``subtlex`` count as 0. With ``log=True`` the
    natural logarithm of ``count + 1`` is returned instead of the raw count.
    """
    count = subtlex.get(token, 0)
    if log:
        return math.log(count + 1)
    return count


# Raw and log-scaled SUBTLEX counts for every token.
df['subtlex'] = df['token'].apply(get_subtlex_freq)
df['log_subtlex'] = df['token'].apply(lambda tok: get_subtlex_freq(tok, log=True))

In [8]:
from wonderlic_nlp import WonderlicNLP
from tqdm.notebook import tqdm

In [9]:
# Create the analyzer once; the same instance is used for every token in the
# feature-extraction loop further down.
wnlp = WonderlicNLP()

# Trial call on a single word.
# NOTE(review): looks like a smoke test -- `ftrs` is reassigned inside the
# per-token loop later, so this value is not used afterwards.
ftrs = wnlp.analyze("cat")

In [10]:
# Per-token values of the 'Fam', 'Imag', 'Conc' and 'AOA' entries found under
# the 'mrc' key of each analysis result, collected in DataFrame row order.
fam, img, cnc, aoa = [], [], [], []

for token in tqdm(df['token'].to_list()):
    ftrs = wnlp.analyze(str(token))
    mrc = ftrs['mrc']
    fam.append(mrc['Fam'])
    img.append(mrc['Imag'])
    cnc.append(mrc['Conc'])
    aoa.append(mrc['AOA'])

  0%|          | 0/10800 [00:00<?, ?it/s]

In [11]:
# Attach the four collected lists as new columns, in the same order as before.
for column, values in (
    ('familiarity', fam),
    ('imageability', img),
    ('concreteness', cnc),
    ('ageOfAcqu', aoa),
):
    df[column] = values

In [12]:
# Pairwise correlations between the numeric columns (displayed as the cell's
# output). Numeric/bool columns are selected explicitly because, from
# pandas 2.0 on, DataFrame.corr() no longer drops non-numeric columns by
# default and raises on string columns such as `token`.
df.select_dtypes(include=["number", "bool"]).corr()

Unnamed: 0,anno_count,complexity,subjectivity,synset_count,wn_depth,hyponym_count,web_1t,log_web_1t,subtlex,log_subtlex,familiarity,imageability,concreteness,ageOfAcqu
anno_count,1.0,-0.006157,0.002166,-0.000456,-0.007995,-0.000673,0.002606,-0.005487,-0.001828,-0.004528,0.004799,0.002081,0.00109,0.000528
complexity,-0.006157,1.0,0.641457,-0.38695,-0.229183,-0.197414,-0.3302,-0.443077,-0.245508,-0.572658,-0.351407,-0.331379,-0.313677,-0.192058
subjectivity,0.002166,0.641457,1.0,-0.264912,-0.093522,-0.166773,-0.28303,-0.271023,-0.222114,-0.412187,-0.273612,-0.257938,-0.244608,-0.143811
synset_count,-0.000456,-0.38695,-0.264912,1.0,0.240944,0.102879,0.400755,0.472291,0.307537,0.509719,0.190059,0.176753,0.183789,0.083268
wn_depth,-0.007995,-0.229183,-0.093522,0.240944,1.0,0.076729,0.072052,0.679126,0.057437,0.394516,-0.126224,-0.091767,-0.060004,-0.055689
hyponym_count,-0.000673,-0.197414,-0.166773,0.102879,0.076729,1.0,0.158838,0.198561,0.089666,0.196264,0.08304,0.084203,0.09979,0.034594
web_1t,0.002606,-0.3302,-0.28303,0.400755,0.072052,0.158838,1.0,0.38645,0.454876,0.4175,0.26404,0.219275,0.217878,0.141981
log_web_1t,-0.005487,-0.443077,-0.271023,0.472291,0.679126,0.198561,0.38645,1.0,0.230239,0.668821,-0.002807,-0.005348,0.019663,0.036421
subtlex,-0.001828,-0.245508,-0.222114,0.307537,0.057437,0.089666,0.454876,0.230239,1.0,0.488809,0.234322,0.219372,0.209361,0.052384
log_subtlex,-0.004528,-0.572658,-0.412187,0.509719,0.394516,0.196264,0.4175,0.668821,0.488809,1.0,0.23851,0.226166,0.229902,0.143873


In [13]:
# Persist the DataFrame, including every feature column added above, to a new
# pickle in the current working directory (the input file is not overwritten).
df.to_pickle("lcp_ftrs.pkl")