In [1]:
import pandas as pd

# Turn off pandas' chained-assignment warning for the rest of the notebook
pd.set_option('mode.chained_assignment', None)

# Words w/ their frequency & part of speech
words = pd.read_csv('../input/subtlex-word-frequency/SUBTLEXfreqPoS.csv')
# Common singular nouns (the file has no header row, so columns are numbered from 0)
nouns = pd.read_csv('../input/list-of-nouns/nounlist.csv', header=None)

# Row labels of the words w/ only 1 occurrence
lowFreq = words.index[words['FREQcount'] == 1]

# Drop those rows, then sort by Lg10CD (highest first) and alphabetically by
# word, renumbering the rows from 0
wordsSorted = words.drop(index=lowFreq).sort_values(
    by=['Lg10CD', 'Word'],
    ascending=[False, True],
    ignore_index=True,
)

wordsSorted

Unnamed: 0,Word,FREQcount,CDcount,FREQlow,Cdlow,SUBTLWF,Lg10WF,SUBTLCD,Lg10CD,Dom_PoS_SUBTLEX,Freq_dom_PoS_SUBTLEX,Percentage_dom_PoS,All_PoS_SUBTLEX,All_freqs_SUBTLEX
0,the,1501908,8388,1339811,8388,29449.18,6.1766,100.00,3.9237,Article,1499459.0,1.00,Article.Adverb.Noun.Preposition.Adjective,1499459.774.147.105.1
1,to,1156570,8383,1138435,8380,22677.84,6.0632,99.94,3.9235,To,787798.0,0.68,To.Preposition.Adverb.Noun.Adjective,787798.366834.931.79.68
2,a,1041179,8382,976941,8380,20415.27,6.0175,99.93,3.9234,Article,993445.0,0.96,Article.Adverb.Letter.To.Noun.Preposition.Adje...,993445.33186.6441.744.257.52.5
3,you,2134713,8381,1595028,8376,41857.12,6.3293,99.92,3.9233,Pronoun,2125610.0,1.00,Pronoun.Noun,2125610.79
4,and,682780,8379,515365,8374,13387.84,5.8343,99.89,3.9232,Conjunction,680792.0,1.00,Conjunction.Adverb.Interjection.Noun.Name,680792.745.22.4.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60379,zionism,2,1,2,1,0.04,0.4771,0.01,0.3010,Noun,2.0,1.00,Noun,2
60380,zionists,2,1,0,0,0.04,0.4771,0.01,0.3010,Noun,2.0,1.00,Noun,2
60381,zloty,2,1,2,1,0.04,0.4771,0.01,0.3010,Noun,2.0,1.00,Noun,2
60382,zoon,6,1,6,1,0.12,0.8451,0.01,0.3010,Noun,6.0,1.00,Noun,6


In [2]:
# Utility functions

# Return position of substring in string list
def findPos(string, substring, delimiter):
    """Return the 0-based position of an item in a delimited string list.

    The string is split on ``delimiter`` and ``substring`` is compared against
    whole items, so an item that merely contains the text (e.g. 'Pronoun' when
    looking for 'noun') is not mistaken for a match.

    Parameters
    ----------
    string : str
        Delimited list, e.g. 'Noun.Verb.Adverb'.
    substring : str
        The item to look for, e.g. 'Verb'.
    delimiter : str
        Non-empty separator between items, e.g. '.'.

    Returns
    -------
    int
        Index of the first item equal to ``substring``.

    Raises
    ------
    ValueError
        If no item equals ``substring``.
    """
    return string.split(delimiter).index(substring)

# Return nth value in a string list of values
def findNthVal(vals, delimiter, n):
    """Return the n-th (0-based) value of a delimited string of integers.

    Parameters
    ----------
    vals : str
        Delimited integers, e.g. '98420.725.656'.
    delimiter : str
        Non-empty separator between values, e.g. '.'.
    n : int
        0-based position of the value to return.

    Returns
    -------
    int
        The value at position ``n``.

    Raises
    ------
    ValueError
        If ``n`` is outside ``0 .. number_of_values - 1``, or the selected
        field is not a valid integer.
    """
    fields = vals.split(delimiter)
    # Reject out-of-range positions explicitly instead of silently falling
    # back to some other field of the list.
    if not 0 <= n < len(fields):
        raise ValueError(
            'position %r is out of range for %d value(s)' % (n, len(fields))
        )
    return int(fields[n])

---
# Noun Data:
---

In [3]:
# List of all nouns: every word whose PoS list mentions 'Noun'.
# .copy() makes this an independent frame, so the column added below is not
# written onto a filtered selection of `words`.
nounsAll = words[words['All_PoS_SUBTLEX'].str.contains('Noun', na = False)].copy()

# Find frequency of each word as a noun: locate 'Noun' in the '.'-separated
# PoS list, then read the count at the same position in the frequency list
nounFreq = [
    findNthVal(freqs, '.', findPos(pos, 'Noun', '.'))
    for pos, freqs in zip(nounsAll['All_PoS_SUBTLEX'], nounsAll['All_freqs_SUBTLEX'])
]

# Sort by noun frequency (highest first), ties alphabetically, rows renumbered from 0
nounsAll['Noun_freq'] = nounFreq
nounsAll = nounsAll.sort_values(by=['Noun_freq', 'Word'], ascending=[False, True], ignore_index=True)

nounsAll

Unnamed: 0,Word,FREQcount,CDcount,FREQlow,Cdlow,SUBTLWF,Lg10WF,SUBTLCD,Lg10CD,Dom_PoS_SUBTLEX,Freq_dom_PoS_SUBTLEX,Percentage_dom_PoS,All_PoS_SUBTLEX,All_freqs_SUBTLEX,Noun_freq
0,time,99890,8308,96603,8303,1958.63,4.9995,99.05,3.9195,Noun,98420.0,0.99,Noun.Verb.Adverb,98420.725.656,98420
1,man,94133,8054,84739,7956,1845.75,4.9737,96.02,3.9061,Noun,93487.0,1.00,Noun.Verb.Name,93487.401.45,93487
2,way,72661,8222,71259,8206,1424.73,4.8613,98.02,3.9150,Noun,71592.0,0.99,Noun.Adverb.Preposition.Name,71592.899.94.6,71592
3,people,56252,7889,50482,7791,1102.98,4.7501,94.05,3.8971,Noun,56170.0,1.00,Noun.Name,56170.2,56170
4,thing,55522,8058,55245,8057,1088.67,4.7445,96.07,3.9063,Noun,55460.0,1.00,Noun.Name,55460.22,55460
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45465,zoroaster,1,1,0,0,0.02,0.3010,0.01,0.3010,Noun,1.0,1.00,Noun,1,1
45466,zoroastrianism,1,1,0,0,0.02,0.3010,0.01,0.3010,Noun,1.0,1.00,Noun,1,1
45467,zugzwang,1,1,1,1,0.02,0.3010,0.01,0.3010,Noun,1.0,1.00,Noun,1,1
45468,zygotes,1,1,1,1,0.02,0.3010,0.01,0.3010,Noun,1.0,1.00,Noun,1,1


In [4]:
# List of all COMMON SINGULAR nouns: rows of nounsAll whose word appears in
# the reference noun list (exact, case-sensitive match); original row labels are kept
nounsCommon = nounsAll.loc[nounsAll['Word'].isin(nouns[0])]

nounsCommon

Unnamed: 0,Word,FREQcount,CDcount,FREQlow,Cdlow,SUBTLWF,Lg10WF,SUBTLCD,Lg10CD,Dom_PoS_SUBTLEX,Freq_dom_PoS_SUBTLEX,Percentage_dom_PoS,All_PoS_SUBTLEX,All_freqs_SUBTLEX,Noun_freq
0,time,99890,8308,96603,8303,1958.63,4.9995,99.05,3.9195,Noun,98420.0,0.99,Noun.Verb.Adverb,98420.725.656,98420
1,man,94133,8054,84739,7956,1845.75,4.9737,96.02,3.9061,Noun,93487.0,1.00,Noun.Verb.Name,93487.401.45,93487
2,way,72661,8222,71259,8206,1424.73,4.8613,98.02,3.9150,Noun,71592.0,0.99,Noun.Adverb.Preposition.Name,71592.899.94.6,71592
3,people,56252,7889,50482,7791,1102.98,4.7501,94.05,3.8971,Noun,56170.0,1.00,Noun.Name,56170.2,56170
4,thing,55522,8058,55245,8057,1088.67,4.7445,96.07,3.9063,Noun,55460.0,1.00,Noun.Name,55460.22,55460
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45075,vibrissae,1,1,0,0,0.02,0.3010,0.01,0.3010,Noun,1.0,1.00,Noun,1,1
45185,washbasin,1,1,1,1,0.02,0.3010,0.01,0.3010,Noun,1.0,1.00,Noun,1,1
45194,wasting,1135,955,1085,928,22.25,3.0554,11.39,2.9805,Verb,1132.0,1.00,Verb.Adjective.Noun,1132.2.1,1
45207,waterwheel,1,1,1,1,0.02,0.3010,0.01,0.3010,Noun,1.0,1.00,Noun,1,1


In [5]:
# List of common nouns missing from list: entries of the reference noun list
# that have no exact (case-sensitive) match among the words in nounsAll
nounsMissing = nouns.loc[~nouns[0].isin(nounsAll['Word'])]

nounsMissing

Unnamed: 0,0
0,ATM
1,CD
2,SUV
3,TV
11,abolishment
...,...
6770,yellowjacket
6780,yourself
6784,zampone
6786,zebrafish


---
# Verb Data:
---

In [6]:
# List of all verbs: every word whose PoS list mentions 'Verb'.
# .copy() makes this an independent frame, so the column added below is not
# written onto a filtered selection of `words`.
verbsAll = words[words['All_PoS_SUBTLEX'].str.contains('Verb', na = False)].copy()

# Find frequency of each word as a verb: locate 'Verb' in the '.'-separated
# PoS list, then read the count at the same position in the frequency list
verbFreq = [
    findNthVal(freqs, '.', findPos(pos, 'Verb', '.'))
    for pos, freqs in zip(verbsAll['All_PoS_SUBTLEX'], verbsAll['All_freqs_SUBTLEX'])
]

# Sort by verb frequency (highest first), ties alphabetically, rows renumbered from 0
verbsAll['Verb_freq'] = verbFreq
verbsAll = verbsAll.sort_values(by=['Verb_freq', 'Word'], ascending=[False, True], ignore_index=True)

verbsAll

Unnamed: 0,Word,FREQcount,CDcount,FREQlow,Cdlow,SUBTLWF,Lg10WF,SUBTLCD,Lg10CD,Dom_PoS_SUBTLEX,Freq_dom_PoS_SUBTLEX,Percentage_dom_PoS,All_PoS_SUBTLEX,All_freqs_SUBTLEX,Verb_freq
0,do,312915,8348,262536,8340,6135.59,5.4954,99.52,3.9216,Verb,629203.0,1.00,Verb.Noun,629203.22,629203
1,is,459663,8370,420989,8364,9013.00,5.6624,99.79,3.9228,Verb,488770.0,1.00,Verb.Adverb.Noun.Name,488770.730.14.1,488770
2,have,314232,8363,296664,8354,6161.41,5.4973,99.70,3.9224,Verb,332099.0,1.00,Verb.Noun,332099.104,332099
3,was,288391,8345,282414,8340,5654.73,5.4600,99.49,3.9215,Verb,309709.0,1.00,Verb.Noun,309709.4,309709
4,be,293085,8363,285000,8361,5746.76,5.4670,99.70,3.9224,Verb,292904.0,1.00,Verb.Noun,292904.68,292904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23540,zep,5,1,0,0,0.10,0.7782,0.01,0.3010,Name,4.0,0.80,Name.Verb,4.1,1
23541,zeroes,39,32,33,30,0.76,1.6021,0.38,1.5185,Noun,34.0,0.87,Noun.Number.Verb,34.4.1,1
23542,zigzagged,1,1,1,1,0.02,0.3010,0.01,0.3010,Verb,1.0,1.00,Verb,1,1
23543,zinged,1,1,1,1,0.02,0.3010,0.01,0.3010,Verb,1.0,1.00,Verb,1,1


---
# Adjective Data:
---

In [7]:
# List of all adjectives: every word whose PoS list mentions 'Adjective'.
# .copy() makes this an independent frame, so the column added below is not
# written onto a filtered selection of `words`.
adjectivesAll = words[words['All_PoS_SUBTLEX'].str.contains('Adjective', na = False)].copy()

# Find frequency of each word as an adjective: locate 'Adjective' in the
# '.'-separated PoS list, then read the count at the same position in the
# frequency list
adjectiveFreq = [
    findNthVal(freqs, '.', findPos(pos, 'Adjective', '.'))
    for pos, freqs in zip(adjectivesAll['All_PoS_SUBTLEX'], adjectivesAll['All_freqs_SUBTLEX'])
]

# Sort by adjective frequency (highest first), ties alphabetically, rows renumbered from 0
adjectivesAll['Adjective_freq'] = adjectiveFreq
adjectivesAll = adjectivesAll.sort_values(by=['Adjective_freq', 'Word'], ascending=[False, True], ignore_index=True)

adjectivesAll

Unnamed: 0,Word,FREQcount,CDcount,FREQlow,Cdlow,SUBTLWF,Lg10WF,SUBTLCD,Lg10CD,Dom_PoS_SUBTLEX,Freq_dom_PoS_SUBTLEX,Percentage_dom_PoS,All_PoS_SUBTLEX,All_freqs_SUBTLEX,Adjective_freq
0,good,133117,8316,90673,8267,2610.14,5.1242,99.14,3.9200,Adjective,131274.0,0.99,Adjective.Adverb.Name.Noun,131274.1424.147.61,131274
1,sorry,59566,7923,43411,7595,1167.96,4.7750,94.46,3.8989,Adjective,59455.0,1.00,Adjective.Noun,59455.2,59455
2,little,73766,8072,69783,8051,1446.39,4.8679,96.23,3.9070,Adjective,46077.0,0.63,Adjective.Adverb.Determiner.Name.Noun,46077.23682.3524.394.1,46077
3,right,204428,8320,181467,8310,4008.39,5.3105,99.19,3.9202,Adverb,148137.0,0.73,Adverb.Adjective.Noun.Verb.Name,148137.41292.14475.200.76,41292
4,great,41864,7462,32689,7094,820.86,4.6219,88.96,3.8729,Adjective,41101.0,0.98,Adjective.Adverb.Name.Noun,41101.645.65.2,41101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15161,zing,60,37,33,25,1.18,1.7853,0.44,1.5798,Verb,40.0,0.67,Verb.Noun.Name.Adjective,40.12.7.1,1
15162,zippered,3,3,3,3,0.06,0.6021,0.04,0.6021,Verb,2.0,0.67,Verb.Adjective,2.1,1
15163,zombielike,1,1,1,1,0.02,0.3010,0.01,0.3010,Adjective,1.0,1.00,Adjective,1,1
15164,zonal,1,1,1,1,0.02,0.3010,0.01,0.3010,Adjective,1.0,1.00,Adjective,1,1


---
# Adverb Data:
---

In [8]:
# List of all adverbs: every word whose PoS list mentions 'Adverb'.
# .copy() makes this an independent frame, so the column added below is not
# written onto a filtered selection of `words`.
adverbsAll = words[words['All_PoS_SUBTLEX'].str.contains('Adverb', na = False)].copy()

# Find frequency of each word as an adverb: locate 'Adverb' in the
# '.'-separated PoS list, then read the count at the same position in the
# frequency list
adverbFreq = [
    findNthVal(freqs, '.', findPos(pos, 'Adverb', '.'))
    for pos, freqs in zip(adverbsAll['All_PoS_SUBTLEX'], adverbsAll['All_freqs_SUBTLEX'])
]

# Sort by adverb frequency (highest first), ties alphabetically, rows renumbered from 0
adverbsAll['Adverb_freq'] = adverbFreq
adverbsAll = adverbsAll.sort_values(by=['Adverb_freq', 'Word'], ascending=[False, True], ignore_index=True)

adverbsAll

Unnamed: 0,Word,FREQcount,CDcount,FREQlow,Cdlow,SUBTLWF,Lg10WF,SUBTLCD,Lg10CD,Dom_PoS_SUBTLEX,Freq_dom_PoS_SUBTLEX,Percentage_dom_PoS,All_PoS_SUBTLEX,All_freqs_SUBTLEX,Adverb_freq
0,just,242206,8338,193792,8309,4749.14,5.3842,99.40,3.9211,Adverb,232666.0,0.96,Adverb.Adjective.Noun.Name,232666.9038.14.2,232666
1,here,230788,8352,201195,8343,4525.25,5.3632,99.57,3.9218,Adverb,230319.0,1.00,Adverb.Noun,230319.16,230319
2,so,216452,8350,128430,8325,4244.16,5.3354,99.55,3.9217,Adverb,192197.0,0.89,Adverb.Conjunction.Adjective.Noun,192197.23905.23.9,192197
3,up,187170,8345,184579,8342,3670.00,5.2722,99.49,3.9215,Adverb,172938.0,0.93,Adverb.Preposition.Verb.Adjective.Noun,172938.12815.641.531.12,172938
4,now,163333,8350,103585,8303,3202.61,5.2131,99.55,3.9217,Adverb,159745.0,0.98,Adverb.Conjunction.Noun,159745.3231.14,159745
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2697,wordlessly,2,2,0,0,0.04,0.4771,0.02,0.4771,Name,1.0,0.50,Name.Adverb,1.1,1
2698,workwise,1,1,0,0,0.02,0.3010,0.01,0.3010,Adverb,1.0,1.00,Adverb,1,1
2699,worriedly,1,1,1,1,0.02,0.3010,0.01,0.3010,Adverb,1.0,1.00,Adverb,1,1
2700,yesternight,1,1,1,1,0.02,0.3010,0.01,0.3010,Adverb,1.0,1.00,Adverb,1,1


---
# Preposition Data:
---

In [9]:
# List of all prepositions: every word whose PoS list mentions 'Preposition'.
# .copy() makes this an independent frame, so the column added below is not
# written onto a filtered selection of `words`.
prepositionsAll = words[words['All_PoS_SUBTLEX'].str.contains('Preposition', na = False)].copy()

# Find frequency of each word as a preposition: locate 'Preposition' in the
# '.'-separated PoS list, then read the count at the same position in the
# frequency list
prepositionFreq = [
    findNthVal(freqs, '.', findPos(pos, 'Preposition', '.'))
    for pos, freqs in zip(prepositionsAll['All_PoS_SUBTLEX'], prepositionsAll['All_freqs_SUBTLEX'])
]

# Sort by preposition frequency (highest first), ties alphabetically, rows renumbered from 0
prepositionsAll['Preposition_freq'] = prepositionFreq
prepositionsAll = prepositionsAll.sort_values(by=['Preposition_freq', 'Word'], ascending=[False, True], ignore_index=True)

prepositionsAll

Unnamed: 0,Word,FREQcount,CDcount,FREQlow,Cdlow,SUBTLWF,Lg10WF,SUBTLCD,Lg10CD,Dom_PoS_SUBTLEX,Freq_dom_PoS_SUBTLEX,Percentage_dom_PoS,All_PoS_SUBTLEX,All_freqs_SUBTLEX,Preposition_freq
0,of,590439,8375,573021,8372,11577.24,5.7712,99.85,3.9230,Preposition,553506.0,0.94,Preposition.Adverb.Adjective.Conjunction.Noun....,553506.34686.1413.156.19.15,553506
1,in,498444,8372,473880,8369,9773.41,5.6976,99.81,3.9229,Preposition,443510.0,0.89,Preposition.Adverb.Conjunction.Noun.Verb.Adjec...,443510.50997.3154.67.49.39,443510
2,to,1156570,8383,1138435,8380,22677.84,6.0632,99.94,3.9235,To,787798.0,0.68,To.Preposition.Adverb.Noun.Adjective,787798.366834.931.79.68,366834
3,for,351650,8374,332686,8370,6895.10,5.5461,99.83,3.9230,Preposition,344212.0,0.98,Preposition.Adverb.Conjunction.Noun,344212.5287.1830.50,344212
4,with,257465,8351,247160,8349,5048.33,5.4107,99.56,3.9218,Preposition,257287.0,1.00,Preposition.Noun,257287.17,257287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,reference,432,336,406,332,8.47,2.6365,4.01,2.5276,Noun,409.0,0.95,Noun.Verb.Preposition.Name,409.18.2.2,2
199,afore,19,10,18,10,0.37,1.3010,0.12,1.0414,Conjunction,18.0,0.95,Conjunction.Preposition,18.1,1
200,apropos,17,16,12,11,0.33,1.2553,0.19,1.2304,Adverb,16.0,0.94,Adverb.Preposition,16.1,1
201,propos,1,1,1,1,0.02,0.3010,0.01,0.3010,Preposition,1.0,1.00,Preposition,1,1


---
---

In [10]:
# Export data: write every table to its own CSV file (row index left out),
# in the order listed here
exports = {
    'SUBTLEXfreqPoS-sorted.csv': wordsSorted,
    'SUBTLEXfreqPoS-nouns.csv': nounsAll,
    'nounsCommon.csv': nounsCommon,
    'nounsMissing.csv': nounsMissing,
    'SUBTLEXfreqPoS-verbs.csv': verbsAll,
    'SUBTLEXfreqPoS-adjectives.csv': adjectivesAll,
    'SUBTLEXfreqPoS-adverbs.csv': adverbsAll,
    'SUBTLEXfreqPoS-prepositions.csv': prepositionsAll,
}
for fileName, table in exports.items():
    table.to_csv(fileName, index=False)