In [2]:
import os
import wikipediaapi
wiki_wiki = wikipediaapi.Wikipedia('en')

from itertools import repeat

import pandas as pd

from bs4 import BeautifulSoup
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
stopwords = nltk.corpus.stopwords.words('english')

import re
import requests

import urllib.request, urllib.parse, urllib.error 
from bs4 import BeautifulSoup

# Simple program to scrap contents of N categories into segregated dataframes of
# sentences for further analysis.

CATEGORIES = [
['Comperative', 'https://en.wikipedia.org/wiki/Comparative_psychology'],
['Behavioral genetics', 'https://en.wikipedia.org/wiki/Behavioural_genetics' ],
['Cognitivism', 'https://en.wikipedia.org/wiki/Cognitivism_(psychology)' ],
['Personality', 'https://en.wikipedia.org/wiki/Personality_psychology' ],
['Biological', 'https://en.wikipedia.org/wiki/Behavioral_neuroscience' ],
['Cognitive', 'https://en.wikipedia.org/wiki/Cognitive_psychology' ],
['Cross-cultural', 'https://en.wikipedia.org/wiki/Cross-cultural_psychology' ],
['Cultural', 'https://en.wikipedia.org/wiki/Cultural_psychology' ],
['Differential', 'https://en.wikipedia.org/wiki/Differential_psychology' ],
['Developmental', 'https://en.wikipedia.org/wiki/Developmental_psychology' ],
['Evolutionary', 'https://en.wikipedia.org/wiki/Evolutionary_psychology' ],
['Experimental', 'https://en.wikipedia.org/wiki/Experimental_psychology' ],
['Mathematical', 'https://en.wikipedia.org/wiki/Mathematical_psychology' ],
['Neuropsychology', 'https://en.wikipedia.org/wiki/Neuropsychology' ],
['Big Five', 'https://en.wikipedia.org/wiki/Big_Five_personality_traits' ],
['Positive', 'https://en.wikipedia.org/wiki/Positive_psychology' ],
['Quantitative', 'https://en.wikipedia.org/wiki/Quantitative_psychology' ],
['Social', 'https://en.wikipedia.org/wiki/Social_psychology' ],
]


In [7]:
# Data cleaning fuctions (to be used dynamically during data collection).

stopwords = nltk.corpus.stopwords.words('english')
WNL = WordNetLemmatizer()

def remove_url(text):
    return re.sub('https?://[A-Za-z0-9./]+','',text)
def html_strip_praser(text):
    return BeautifulSoup(text, "html.parser").get_text()
def html_strip_lxml(text):
    return BeautifulSoup(text, 'lxml').get_text()
def remove_special_characters(text, preserve):
    return re.sub("[^a-zA-Z{}]".format(preserve), " ", text)
def lowercase_text(text):
    return text.lower()
def strip_inner_spaces(text):
    return ' '.join([w.strip() for w in text.split()])
def remove_stop_words(text):
    return ' '.join([w for w in text.split() if not w in set(stopwords)])
def lemmatize_words(text, WNL):
    return ' '.join([WNL.lemmatize(word, pos='v') for word in text.split()])

def word_counts_text_cleaner(text):
    text = remove_url(text)
    text = html_strip_lxml(text)
    text = remove_special_characters(text, preserve='-')
    text = lowercase_text(text)
    text = strip_inner_spaces(text)
    text = remove_stop_words(text)
    text = lemmatize_words(text, WNL)
    return text

In [15]:
# Create data folders [Data segregation].
def create_category_folder(name):
    if not os.path.exists('data'):
        os.makedirs('data')
    if not os.path.exists('data/' + name):
        os.makedirs('data/' + name)

# Url to text, title [Data collection].
def get_page_txt(url):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser') # "lxml")
    sentences = soup.find_all('p')
    sentences = [s.get_text() for s in sentences if s]
    full_text = ' '.join(sentences)
    title = str(soup.title).split('>')[1].split(' - Wiki')[0]
    return full_text, title

# Url to list of links inside [Data collection].
def get_page_links(url):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    prefix = 'https://en.wikipedia.org'
    urls = []
    for p in soup.find_all('p'):
        for url in p.find_all('a'):
            url = url.get("href", "")
            if 'wiki' in url and not 'www' in url and not 'http' in url and not ':' in url and not '//' in url and not 'Main_Page' in url and not 'Outline' in url and not 'List' in url and not 'Timeline' in url and not 'Index' in url:
                full_url = prefix + url 
                urls.append(full_url)
    return urls

def clean_sentences(df):
    df['sentence_c'] = df.sentence.apply(word_counts_text_cleaner)
    return df

# Save each page contentes as a separate dataframe.
def text_to_df(page_text, page_title, c, is_main=False):
    
    # Turn page text into a 3 col dataframe.
    sentences = sent_tokenize(page_text)
    title_data = repeat(page_title, len(sentences))
    category_data = repeat(c, len(sentences))
    
    data = zip(sentences, title_data, category_data)
    df = pd.DataFrame(data, columns=['sentence','title', 'source'])

    # Apply simple text cleaner.
    df = clean_sentences(df)
    
    # Prepare data folder names.
    if ' ' in c:
        df_source = '_'.join(c.split(' '))
    else:
        df_source = c
    create_category_folder(df_source)
    
    # Prepare csv file names.
    if ' ' in page_title:
        df_title = '_'.join(page_title.split(' '))
    else:
        df_title = page_title
    if is_main:
        csv_name = os.getcwd() + '/data/{}.csv'.format(df_source, df_title)
    else:
        csv_name = os.getcwd() + '/data/{}/{}.csv'.format(df_source, df_title)
    
    df.to_csv(csv_name, encoding='utf-8', index=False)
    return df
            
N_rows = 0
#df_main = pd.DataFrame(columns=['sentence','title', 'source']) # Might be comp./mem exhaustive.
for c, l in CATEGORIES:
    main_page_text, main_page_title = get_page_txt(l)
    main_page_links = get_page_links(l)
    
    try:
        df = text_to_df(main_page_text, main_page_title, c, is_main=False)
        #df_main = pd.concat([df_main, df], axis=0) # Might be comp./mem exhaustive.
        N_rows += len(df)
        print('Category {} contains {} links inside, collecting.\n------'.format(c, len(main_page_links)))
    except Exception as e:
        print(e)
        
    for i, ml in enumerate(main_page_links):
        try:
            page_text, page_title = get_page_txt(ml)
            df = text_to_df(page_text, page_title, c, is_main=False)
            
            #df_main = pd.concat([df_main, df], axis=0) # Might be comp/mem exhaustive.
            N_rows += len(df)
            print('{}. {} [{}] -- DONE [n rows: {}/{}].'.format(i, page_title, c, len(df), N_rows))
        except Exception as e:
            print(i, e)
    print()

Category Comperative contains 47 links inside, collecting.
------
0. Organism [Comperative] -- DONE [n rows: 120/220].
1. Proximate and ultimate causation [Comperative] -- DONE [n rows: 22/242].
2. Tinbergen's four questions [Comperative] -- DONE [n rows: 69/311].
3. Al-Jahiz [Comperative] -- DONE [n rows: 67/378].
4. Ibn al-Haytham [Comperative] -- DONE [n rows: 224/602].
5. Zoomusicology [Comperative] -- DONE [n rows: 74/676].
6. Arabic music [Comperative] -- DONE [n rows: 140/816].
7. Ethology [Comperative] -- DONE [n rows: 129/945].
8. Western world [Comperative] -- DONE [n rows: 230/1175].
9. Charles Darwin [Comperative] -- DONE [n rows: 396/1571].
10. George Romanes [Comperative] -- DONE [n rows: 79/1650].
11. Anthropomorphism [Comperative] -- DONE [n rows: 189/1839].
12. Douglas Spalding [Comperative] -- DONE [n rows: 13/1852].
13. Jacques Loeb [Comperative] -- DONE [n rows: 23/1875].
14. John Lubbock, 1st Baron Avebury [Comperative] -- DONE [n rows: 81/1956].
15. C. Lloyd Morga

77. Twin study [Behavioral genetics] -- DONE [n rows: 186/21463].
78. Adoption study [Behavioral genetics] -- DONE [n rows: 43/21506].
79. Quantitative genetics [Behavioral genetics] -- DONE [n rows: 830/22336].
80. Twin [Behavioral genetics] -- DONE [n rows: 223/22559].
81. Phenotype [Behavioral genetics] -- DONE [n rows: 43/22602].
82. Twin [Behavioral genetics] -- DONE [n rows: 223/22825].
83. Twin [Behavioral genetics] -- DONE [n rows: 223/23048].
84. Twin study [Behavioral genetics] -- DONE [n rows: 186/23234].
85. Gene–environment correlation [Behavioral genetics] -- DONE [n rows: 39/23273].
86. Variance [Behavioral genetics] -- DONE [n rows: 169/23442].
87. Gene–environment interaction [Behavioral genetics] -- DONE [n rows: 101/23543].
88. Additive genetic effects [Behavioral genetics] -- DONE [n rows: 2/23545].
89. Dominance (genetics) [Behavioral genetics] -- DONE [n rows: 203/23748].
90. Epistasis [Behavioral genetics] -- DONE [n rows: 143/23891].
91. Observational error [Beh

  ' Beautiful Soup.' % markup)


190. Race and genetics [Behavioral genetics] -- DONE [n rows: 230/37848].
191. Fringe science [Behavioral genetics] -- DONE [n rows: 42/37890].
192. Race (human categorization) [Behavioral genetics] -- DONE [n rows: 459/38349].
193. Genetic genealogy [Behavioral genetics] -- DONE [n rows: 90/38439].
194. Africa [Behavioral genetics] -- DONE [n rows: 319/38758].
195. Early expansions of hominins out of Africa [Behavioral genetics] -- DONE [n rows: 109/38867].
196. Qualitative research [Behavioral genetics] -- DONE [n rows: 148/39015].
197. Social norm [Behavioral genetics] -- DONE [n rows: 151/39166].
198. Scientific consensus [Behavioral genetics] -- DONE [n rows: 41/39207].
199. Scientific controversy [Behavioral genetics] -- DONE [n rows: 5/39212].
200. Biology and sexual orientation [Behavioral genetics] -- DONE [n rows: 228/39440].
201. Additive genetic effects [Behavioral genetics] -- DONE [n rows: 2/39442].
202. Genetic reductionism [Behavioral genetics] -- DONE [n rows: 6/39448]

53. Sigmund Freud [Personality] -- DONE [n rows: 547/58928].
54. Psychodynamics [Personality] -- DONE [n rows: 59/58987].
55. Id, ego and super-ego [Personality] -- DONE [n rows: 116/59103].
56. Alfred Adler [Personality] -- DONE [n rows: 217/59320].
57. Heinz Kohut [Personality] -- DONE [n rows: 65/59385].
58. Narcissism [Personality] -- DONE [n rows: 241/59626].
59. Karen Horney [Personality] -- DONE [n rows: 151/59777].
60. Behaviorism [Personality] -- DONE [n rows: 126/59903].
61. B. F. Skinner [Personality] -- DONE [n rows: 275/60178].
62. Operant conditioning [Personality] -- DONE [n rows: 239/60417].
63. Richard Herrnstein [Personality] -- DONE [n rows: 26/60443].
64. Ivan Pavlov [Personality] -- DONE [n rows: 134/60577].
65. Classical conditioning [Personality] -- DONE [n rows: 307/60884].
66. Albert Bandura [Personality] -- DONE [n rows: 84/60968].
67. Social learning theory [Personality] -- DONE [n rows: 117/61085].
68. Memory [Personality] -- DONE [n rows: 309/61394].
69. Em

26. Jean Piaget [Cognitive] -- DONE [n rows: 342/85012].
27. Ulric Neisser [Cognitive] -- DONE [n rows: 82/85094].
28. Hallucination [Cognitive] -- DONE [n rows: 150/85244].
29. Psychodynamics [Cognitive] -- DONE [n rows: 59/85303].
30. Cognition [Cognitive] -- DONE [n rows: 91/85394].
31. Attention [Cognitive] -- DONE [n rows: 297/85691].
32. Attention [Cognitive] -- DONE [n rows: 297/85988].
33. Cognition [Cognitive] -- DONE [n rows: 91/86079].
34. Auditory system [Cognitive] -- DONE [n rows: 137/86216].
35. Visual system [Cognitive] -- DONE [n rows: 194/86410].
36. Olfaction [Cognitive] -- DONE [n rows: 163/86573].
37. Taste [Cognitive] -- DONE [n rows: 349/86922].
38. Somatosensory system [Cognitive] -- DONE [n rows: 84/87006].
39. Orienting response [Cognitive] -- DONE [n rows: 39/87045].
40. Attention [Cognitive] -- DONE [n rows: 297/87342].
41. Attention [Cognitive] -- DONE [n rows: 297/87639].
42. Cocktail party effect [Cognitive] -- DONE [n rows: 122/87761].
43. Working memory

3. Electroencephalography [Differential] -- DONE [n rows: 337/106444].
4. Positron emission tomography [Differential] -- DONE [n rows: 175/106619].
5. Magnetic resonance imaging [Differential] -- DONE [n rows: 168/106787].
6. Functional magnetic resonance imaging [Differential] -- DONE [n rows: 441/107228].
7. Neurochemistry [Differential] -- DONE [n rows: 26/107254].
8. Neurotransmitter [Differential] -- DONE [n rows: 161/107415].
9. Temperament [Differential] -- DONE [n rows: 124/107539].
10. Mental chronometry [Differential] -- DONE [n rows: 107/107646].
11. Personality [Differential] -- DONE [n rows: 158/107804].
12. Temperament [Differential] -- DONE [n rows: 124/107928].
13. Intelligence [Differential] -- DONE [n rows: 60/107988].
14. Memory [Differential] -- DONE [n rows: 309/108297].
15. Stimulus (psychology) [Differential] -- DONE [n rows: 10/108307].
16. Yerkes–Dodson law [Differential] -- DONE [n rows: 17/108324].
17. Personalized medicine [Differential] -- DONE [n rows: 118

89. Cognitive model [Developmental] -- DONE [n rows: 110/129991].
90. Lev Vygotsky [Developmental] -- DONE [n rows: 159/130150].
91. K. Warner Schaie [Developmental] -- DONE [n rows: 21/130171].
92. Cognitive psychology [Developmental] -- DONE [n rows: 123/130294].
93. Differential psychology [Developmental] -- DONE [n rows: 22/130316].
94. Neo-Piagetian theories of cognitive development [Developmental] -- DONE [n rows: 249/130565].
95. Working memory [Developmental] -- DONE [n rows: 257/130822].
96. Domain specificity [Developmental] -- DONE [n rows: 19/130841].
97. Emotional self-regulation [Developmental] -- DONE [n rows: 180/131021].
98. Developmental coordination disorder [Developmental] -- DONE [n rows: 68/131089].
99. Internalizing disorder [Developmental] -- DONE [n rows: 10/131099].
100. Isolates [Developmental] -- DONE [n rows: 9/131108].
101. Fuzzy-trace theory [Developmental] -- DONE [n rows: 143/131251].
102. List of psychological research methods [Developmental] -- DONE [

22. Domain-general learning [Evolutionary] -- DONE [n rows: 81/148423].
23. Combinatorial explosion [Evolutionary] -- DONE [n rows: 50/148473].
24. Domain-specific learning [Evolutionary] -- DONE [n rows: 6/148479].
25. Human brain [Evolutionary] -- DONE [n rows: 522/149001].
26. Cognitive module [Evolutionary] -- DONE [n rows: 46/149047].
27. Language acquisition [Evolutionary] -- DONE [n rows: 248/149295].
28. Westermarck effect [Evolutionary] -- DONE [n rows: 22/149317].
29. Wason selection task [Evolutionary] -- DONE [n rows: 34/149351].
30. Cognitive psychology [Evolutionary] -- DONE [n rows: 123/149474].
31. Behavioral ecology [Evolutionary] -- DONE [n rows: 423/149897].
32. Artificial intelligence [Evolutionary] -- DONE [n rows: 552/150449].
33. Genetics [Evolutionary] -- DONE [n rows: 266/150715].
34. Ethology [Evolutionary] -- DONE [n rows: 129/150844].
35. Anthropology [Evolutionary] -- DONE [n rows: 283/151127].
36. Archaeology [Evolutionary] -- DONE [n rows: 323/151450].
37

144. Sexual conflict [Evolutionary] -- DONE [n rows: 315/164636].
145. Inclusive fitness [Evolutionary] -- DONE [n rows: 133/164769].
146. Bateman's principle [Evolutionary] -- DONE [n rows: 100/164869].
147. Mate choice [Evolutionary] -- DONE [n rows: 227/165096].
148. Courtship display [Evolutionary] -- DONE [n rows: 137/165233].
149. Gestation [Evolutionary] -- DONE [n rows: 32/165265].
150. Lactation [Evolutionary] -- DONE [n rows: 98/165363].
151. Life history theory [Evolutionary] -- DONE [n rows: 152/165515].
152. David Buss [Evolutionary] -- DONE [n rows: 87/165602].
153. David P. Schmitt [Evolutionary] -- DONE [n rows: 6/165608].
154. Sexual jealousy [Evolutionary] -- DONE [n rows: 130/165738].
155. Ovulation [Evolutionary] -- DONE [n rows: 66/165804].
156. Ovulatory shift hypothesis [Evolutionary] -- DONE [n rows: 146/165950].
157. Offspring [Evolutionary] -- DONE [n rows: 25/165975].
158. Sexual reproduction [Evolutionary] -- DONE [n rows: 119/166094].
159. W. D. Hamilton [E

51. Kenneth Craik [Experimental] -- DONE [n rows: 182/182496].
52. W. E. Hick [Experimental] -- DONE [n rows: 7/182503].
53. Donald Broadbent [Experimental] -- DONE [n rows: 20/182523].
54. Thought [Experimental] -- DONE [n rows: 82/182605].
55. Memory [Experimental] -- DONE [n rows: 309/182914].
56. Attention [Experimental] -- DONE [n rows: 297/183211].
57. Philosophy of science [Experimental] -- DONE [n rows: 298/183509].
58. Social psychology [Experimental] -- DONE [n rows: 299/183808].
59. Learned society [Experimental] -- DONE [n rows: 23/183831].
60. Scientific journal [Experimental] -- DONE [n rows: 98/183929].
61. University [Experimental] -- DONE [n rows: 200/184129].
62. Empiricism [Experimental] -- DONE [n rows: 185/184314].
63. Falsifiability [Experimental] -- DONE [n rows: 202/184516].
64. Determinism [Experimental] -- DONE [n rows: 189/184705].
65. Occam's razor [Experimental] -- DONE [n rows: 273/184978].
66. Edward C. Tolman [Experimental] -- DONE [n rows: 96/185074].
6

55. Mathematical logic [Mathematical] -- DONE [n rows: 262/201900].
56. Psychological Review [Mathematical] -- DONE [n rows: 13/201913].
57. Luce's choice axiom [Mathematical] -- DONE [n rows: 6/201919].
58. Detection theory [Mathematical] -- DONE [n rows: 62/201981].
59. Richard C. Atkinson [Mathematical] -- DONE [n rows: 79/202060].
60. William Kaye Estes [Mathematical] -- DONE [n rows: 27/202087].
61. R. Duncan Luce [Mathematical] -- DONE [n rows: 11/202098].
62. Patrick Suppes [Mathematical] -- DONE [n rows: 34/202132].
63. Journal of Mathematical Psychology [Mathematical] -- DONE [n rows: 5/202137].
64. Cognitive architecture [Mathematical] -- DONE [n rows: 38/202175].
65. Production system (computer science) [Mathematical] -- DONE [n rows: 48/202223].
66. ACT-R [Mathematical] -- DONE [n rows: 105/202328].
67. Connectionism [Mathematical] -- DONE [n rows: 78/202406].
68. Neural network [Mathematical] -- DONE [n rows: 110/202516].
69. Weber–Fechner law [Mathematical] -- DONE [n row

  ' Beautiful Soup.' % markup)


48. Functional neuroimaging [Neuropsychology] -- DONE [n rows: 38/210094].
49. Neuroimaging [Neuropsychology] -- DONE [n rows: 124/210218].
50. Functional magnetic resonance imaging [Neuropsychology] -- DONE [n rows: 441/210659].
51. Functional magnetic resonance imaging [Neuropsychology] -- DONE [n rows: 441/211100].

Category Big Five contains 134 links inside, collecting.
------
0. Personality [Big Five] -- DONE [n rows: 158/211644].
1. Factor analysis [Big Five] -- DONE [n rows: 237/211881].
2. Personality test [Big Five] -- DONE [n rows: 147/212028].
3. Conscientiousness [Big Five] -- DONE [n rows: 123/212151].
4. Neuropsychology [Big Five] -- DONE [n rows: 110/212261].
5. Personality [Big Five] -- DONE [n rows: 158/212419].
6. Psyche (psychology) [Big Five] -- DONE [n rows: 55/212474].
7. Academic achievement [Big Five] -- DONE [n rows: 59/212533].
8. Factor analysis [Big Five] -- DONE [n rows: 237/212770].
9. Lewis Goldberg [Big Five] -- DONE [n rows: 37/212807].
10. Lexical hyp

123. Egotism [Big Five] -- DONE [n rows: 26/227194].
124. Humour [Big Five] -- DONE [n rows: 191/227385].
125. Sensation seeking [Big Five] -- DONE [n rows: 48/227433].
126. Dan P. McAdams [Big Five] -- DONE [n rows: 8/227441].
127. Orthogonality [Big Five] -- DONE [n rows: 84/227525].
128. Factor analysis [Big Five] -- DONE [n rows: 237/227762].
129. Theory [Big Five] -- DONE [n rows: 122/227884].
130. Empirical evidence [Big Five] -- DONE [n rows: 27/227911].
131. Factor analysis [Big Five] -- DONE [n rows: 237/228148].
132. Jack Block [Big Five] -- DONE [n rows: 13/228161].
133. HEXACO model of personality structure [Big Five] -- DONE [n rows: 105/228266].

Category Positive contains 69 links inside, collecting.
------
0. Psychology [Positive] -- DONE [n rows: 561/228946].
1. Martin Seligman [Positive] -- DONE [n rows: 39/228985].
2. American Psychological Association [Positive] -- DONE [n rows: 113/229098].
3. Mihaly Csikszentmihalyi [Positive] -- DONE [n rows: 53/229151].
4. Chris

47. ACT (test) [Quantitative] -- DONE [n rows: 108/247501].
48. Asia [Quantitative] -- DONE [n rows: 260/247761].
49. South Korea [Quantitative] -- DONE [n rows: 621/248382].
50. China [Quantitative] -- DONE [n rows: 676/249058].
51. Arizona State University [Quantitative] -- DONE [n rows: 481/249539].
52. Item response theory [Quantitative] -- DONE [n rows: 178/249717].
53. Computerized adaptive testing [Quantitative] -- DONE [n rows: 117/249834].
54. Intelligence quotient [Quantitative] -- DONE [n rows: 358/250192].
55. Time series [Quantitative] -- DONE [n rows: 103/250295].
56. Functional magnetic resonance imaging [Quantitative] -- DONE [n rows: 441/250736].
57. Structural equation modeling [Quantitative] -- DONE [n rows: 113/250849].
58. Social network analysis [Quantitative] -- DONE [n rows: 109/250958].
59. Decision theory [Quantitative] -- DONE [n rows: 52/251010].
60. Statistical genetics [Quantitative] -- DONE [n rows: 4/251014].
61. Mathematical model [Quantitative] -- DONE

109. Stock market crash [Social] -- DONE [n rows: 147/271773].
110. Social group [Social] -- DONE [n rows: 158/271931].
111. Interpersonal relationship [Social] -- DONE [n rows: 161/272092].
112. Identity (social science) [Social] -- DONE [n rows: 142/272234].
113. Intergroup relations [Social] -- DONE [n rows: 123/272357].
114. Group polarization [Social] -- DONE [n rows: 146/272503].
115. Decision-making [Social] -- DONE [n rows: 124/272627].
116. Groupthink [Social] -- DONE [n rows: 165/272792].
117. Bay of Pigs Invasion [Social] -- DONE [n rows: 486/273278].
118. Productivity [Social] -- DONE [n rows: 141/273419].
119. Dominant response [Social] -- DONE [n rows: 7/273426].
120. Social loafing [Social] -- DONE [n rows: 197/273623].
121. Slacker [Social] -- DONE [n rows: 27/273650].
122. Crowd [Social] -- DONE [n rows: 40/273690].
123. Deindividuation [Social] -- DONE [n rows: 182/273872].
124. Self-awareness [Social] -- DONE [n rows: 339/274211].
125. Love [Social] -- DONE [n rows: 

In [3]:
# Load and concantate csv files by a category.

base_path = os.getcwd() + '/data/'

# Iterate over folders [categories].
folder_names = [f for f in os.listdir(base_path)]
topic_frames = []
for fn in folder_names:
    if not '.' in fn and not 'word_counts' in fn:
        
        # Iterate over csv files.
        file_names = [f for f in os.listdir(base_path+fn)]
        folder_frames = []
        for file in file_names:
            
            # Open each csv file.
            csv_file_path = base_path + fn + '/' + file
            df = pd.read_csv(csv_file_path, encoding='utf-8')
            folder_frames.append(df)
            
        # Concat csv files inside each of folders [categories].
        df_category = pd.concat(folder_frames, axis=0)
        topic_frames.append(df_category)
        print('Folder {} done, {} rows.'.format(fn, len(df_category)))

# Concat all csv fiels into one big dataframe of all categories.
df_all = pd.concat(topic_frames, axis=0)
print('All data df done, shaoe {}.'.format(df_all.shape))



Folder Cognitivism done, 5255 rows.
Folder Big_Five done, 13229 rows.
Folder Comperative done, 6748 rows.
Folder Biological done, 6901 rows.
Folder Evolutionary done, 25220 rows.
Folder Mathematical done, 9213 rows.
Folder Experimental done, 17777 rows.
Folder Social done, 27420 rows.
Folder Quantitative done, 12794 rows.
Folder Cross-cultural done, 3617 rows.
Folder Behavioral_genetics done, 23221 rows.
Folder Personality done, 19029 rows.
Folder Differential done, 6408 rows.
Folder Cognitive done, 14498 rows.
Folder Positive done, 7699 rows.
Folder Cultural done, 4460 rows.
Folder Neuropsychology done, 7239 rows.
Folder Developmental done, 24490 rows.
All data df done, shaoe (235218, 4).


In [9]:
# Turns any text into a dictionary containint unique words and their counts.
import collections
import spacy
nlp = spacy.load('en_core_web_sm')

# Inside-a-dict helper function.
def take_second(element):
    return element[1]

# Returns part of speach for given word.
def word_pos(word):
    return [w.pos_ for w in nlp(word)][0]

# Returns d={word:(word,count,pos),} ordered dict for given (cleaned) text.
def text_values_counts_dict(text):
    words = text.split()
    words = [w for w in text.split() if w not in stopwords]
    vc = {}
    for word in words:
        vc[word] = 0
    for word in words:
        vc[word] += 1
    vc_tuples = [(v, c) for v, c in vc.items()]
    vc_sorted = sorted(vc_tuples, key=take_second)[::-1]
    
    d = collections.OrderedDict()
    for tup in vc_sorted:
        k = tup[0]
        v = tup[1]
        d[k] = (k, v, word_pos(k))
    return d

# 1. Wcounts
category_wcnts = {}
for df in topic_frames:
    
    category_name = df.loc[0,'source'].to_list()[0] # Rewrite it.
    #print(category_name)
    #print('---')

    text = ' '.join([str(s) for s in df['sentence_c'].to_list()])
    text_clean = ' '.join([s for s in text.split() if len(s) > 2])
    wcnt = text_values_counts_dict(text_clean)
    category_wcnts[category_name] = wcnt
    
    create_category_folder('word_counts')
    df = pd.DataFrame(wcnt.values(), columns=['word', 'count', 'part_of_speach'], index=wcnt.keys())
    display(df.head(4))
    
    path = os.getcwd() + '/data/word_counts/{}.csv'.format(category_name)
    df.to_csv(path, encoding='utf-8', index=False)
    #print(path)
    #print(wcnt)

# 2. Wclouds
#for df in topic_frames:


knowledge familiarity awareness understand someone something facts information descriptions skills acquire experience education perceive discover learn knowledge refer theoretical practical understand subject implicit practical skill expertise explicit theoretical understand subject less formal systematic philosophy study knowledge call epistemology philosopher plato famously define knowledge justify true belief though definition think analytic philosophers citation need problematic gettier problems others defend platonic definition however several definitions knowledge theories explain exist knowledge acquisition involve complex cognitive process perception communication reason knowledge also say relate capacity acknowledgement human be eventual demarcation philosophy science make possible notion philosophy core theory knowledge theory distinct sciences foundation without idea theory knowledge hard imagine philosophy could age modern science definition knowledge matter ongoing debate 

In [19]:
# Load wcount csv file as a dataframe (one per category).

import os
import pandas as pd

print('Loading word counts CSV files into a dataframes:\n------------------------------------------------')

base_path = os.getcwd() + '/data/word_counts/'

# Iterate over csv_files [categories].
csv_files = [f for f in os.listdir(base_path)]

wcount_frames = []
for f in csv_files:
    
    # Open each csv file.
    csv_file_path = base_path + f
    df = pd.read_csv(csv_file_path, encoding='utf-8')
    wcount_frames.append(df)
    
    print('{} ({} words).'.format(f, len(df)))


Loading word counts CSV files into a dataframes:
------------------------------------------------
Positive.csv (12959 words).
Behavioral genetics.csv (25041 words).
Cultural.csv (8038 words).
Quantitative.csv (16947 words).
Developmental.csv (22574 words).
Mathematical.csv (12784 words).
Biological.csv (12350 words).
Cross-cultural.csv (8110 words).
Experimental.csv (19789 words).
Comperative.csv (13592 words).
Social.csv (23470 words).
Neuropsychology.csv (12204 words).
Cognitive.csv (16075 words).
Evolutionary.csv (24587 words).
Personality.csv (21456 words).
Big Five.csv (16041 words).
Cognitivism.csv (9796 words).
Differential.csv (9116 words).
