In [1]:
import os
import csv
import pandas as pd
import textstat
import re
import nltk
import numpy as np

from textblob import TextBlob
from datetime import datetime
from bs4 import BeautifulSoup
from word2number import w2n

# Dictionnary usage
#nltk.download('punkt')

fullpath= 'C://Users/tpensaert/OneDrive - Deloitte (O365D)/Documents/Thesis/JCP HTML/2008auh_shih_yoon@journal of consumer psychology~aligning benefits with payments - a test of the pattern alignment hypothesis.html'
parsed = BeautifulSoup(open(fullpath, 'rb'), 'html.parser')

testpath= r'C:\Users\tpensaert\OneDrive - Deloitte (O365D)\Documents\Thesis\JCP HTML\2010posavac_herzenstein_kardes_sundaram@journal of consumer psychology~profits and halos - the role of firm profitability information in consumer inference.html'
testpars= BeautifulSoup(open(testpath, 'rb'), 'html.parser')

In [5]:
w2n.word_to_num("In Study 1a, twenty three female undergraduates")

23

In [17]:
def main(soup):
    
    maindict = {
        
        #Basic info
        'Title': extract_metadata(soup, "title"),
        'Author': extract_metadata(soup, "author"),
        'Number of Authors': len(extract_metadata(soup, "author").split(',')),
        'Publication Date': extract_metadata(soup, "publication_date"),
        'Time Delta (months)': diff_month(extract_metadata(soup, "publication_date")),
        'Volume': extract_metadata(soup, "volume"),
        'Issue': extract_metadata(soup, "issue"),
        'First page': extract_metadata(soup, "firstpage"),
        'Last page': extract_metadata(soup, "lastpage"),
        'Page length': int(extract_metadata(soup, "lastpage")) - int(extract_metadata(soup, "firstpage")),
        'Reference Diversity': reference_diversity(soup),
        'Experimental Study': experimental(extract_body(soup)),
        
        #Readability
        'Avg sentence length': extract_readability(extract_body(soup))['average sentence length'],
        'Word count': extract_readability(extract_body(soup))['word count'],
        'Flesch Reading Ease': extract_readability(extract_body(soup))['flesch reading ease'],
        'Subjectivity': round(extract_readability(extract_body(soup))['subjectivity'], 3),
        'Polarity': round(extract_readability(extract_body(soup))['polarity'], 3),
        'In-text Citations': extract_readability(extract_body(soup))['citation'],
        
        # Linguistic Markers
        'Hedges': round(extract_markers(extract_body(soup))['Hedges'] / extract_readability(extract_body(soup))['word count'], 2),
        'Endophoric': round(extract_markers(extract_body(soup))['Endophoric'] / extract_readability(extract_body(soup))['word count'], 2),
        'Absolutes': round(extract_markers(extract_body(soup))['Absolutes'] / extract_readability(extract_body(soup))['word count'], 2),
        'Achievement': round(extract_markers(extract_body(soup))['Achievement'] / extract_readability(extract_body(soup))['word count'], 2),
        'Reward': round(extract_markers(extract_body(soup))['Reward'] / extract_readability(extract_body(soup))['word count'], 2),
        'Empathy': round(extract_markers(extract_body(soup))['Empathy'] / extract_readability(extract_body(soup))['word count'], 2),
        'Evidence': round(extract_markers(extract_body(soup))['Evidence'] / extract_readability(extract_body(soup))['word count'], 2),
        'Negate': round(extract_markers(extract_body(soup))['Negate'] / extract_readability(extract_body(soup))['word count'], 2),
        'Negative emotions': round(extract_markers(extract_body(soup))['Negative emotions'] / extract_readability(extract_body(soup))['word count'], 2),
        'First Person': round(extract_markers(extract_body(soup))['First Person'] / extract_readability(extract_body(soup))['word count'], 2),
        'Third Person': round(extract_markers(extract_body(soup))['Third Person'] / extract_readability(extract_body(soup))['word count'], 2),
        'Sensory': round(extract_markers(extract_body(soup))['Sensory'] / extract_readability(extract_body(soup))['word count'], 2),
        
        # Metrics:
        "P-Value": extract_metrics(extract_body(soup))[0],
        "Confidence Interval": extract_metrics(extract_body(soup))[1],
        "Sample size": extract_metrics(extract_body(soup))[2],
        "Sample size2": extract_metrics(extract_body(soup))[3],
        "Sample size3": extract_metrics(extract_body(soup))[4],
        "Confidence Interval 2": extract_metrics(extract_body(soup))[5]
        
    }
    
    return maindict

In [18]:
def extract_metadata(soup, query = "title"):
    
    results = soup.find_all("meta", attrs = {'name': 'citation_' + query})
    
    # When there are more than 1 author
    if len(results) > 1:
        output = ", ".join((item['content'] for item in results))
    # If there is only 1 author
    else:
        output = results[0]['content']
        
    return output

In [19]:
def extract_abstract(soup):
    
    """
    First select the paragraph including the abstract, afterwards strip all the breaks by splittings them in lines.
    Finally merging everything together again to 1 single string.
    """
    text = soup.find('section', attrs = {"class": "article-section article-section__abstract"}).p.get_text().splitlines()
    text = [part.strip() for part in text]

    return ' '.join(text).lstrip().rstrip()

In [20]:
def extract_method(soup):
    
    text = soup.find('section',attrs = {"class": "article-section__sub-title section2"}).get_text().splitlines()
    
    return text

In [22]:
def extract_body(soup):
    
    text = soup.find('section',attrs = {"class": "article-section article-section__full"}).get_text().splitlines()
    text = [part.strip() for part in text]
    ref_index = text.index('References')
    
    return ' '.join( text[:ref_index] ).lstrip().rstrip()

In [23]:
def reference_diversity(soup):
    
    # References division
    ref = soup.find('ul', attrs = {'class': 'rlist separator'})
    # List of all references
    total = ref.find_all('span', attrs = {'class': 'articleTitle'})
    # Unique referred journals
    unique = set([item.text.replace('\n', '') for item in ref.find_all('i')])  
        
    return round((len(unique) / len(total)), 2)

In [24]:
def main_text(soup):
    
    abstr_dict = {
        'Title': extract_metadata(soup, "title"),
        'Abstract': extract_abstract(soup)
    }
    
    body_dict = {
        'Title': extract_metadata(soup, "title"),
        'Body': extract_body(soup)
    }
    
    return abstr_dict, body_dict

In [25]:
def parse_csv(data, name = ""):
    
    table = pd.DataFrame(data)
    table.to_csv(F"{name}.csv", sep=";", index=False)

In [26]:
def parse_txt(body):
    
    title = body['Title'][:20]
    with open(F"{title}.txt", 'w', encoding='utf8') as file:
        file.write(body['Title'])
        file.write('\t')
        file.write(body['Body'])
        file.close()

In [27]:
def diff_month(datestring):
    
    d1 = datetime.strptime(datestring, '%Y/%m/%d').date()
    now = datetime.now()
    
    return (now.year - d1.year) * 12 + now.month - d1.month

In [28]:
def extract_readability(text):
    blob = TextBlob(text)
    
    pattern = "\, \d{4}\)"
    pattern2 = "\(\d{4}\)"
    
    in_citations = len(re.findall(pattern, text, re.IGNORECASE)) \
    + len(re.findall(pattern2, text, re.IGNORECASE))
    
    
    main = {
            "syllables": textstat.syllable_count(text),
            "word count": textstat.lexicon_count(text),
            "characters": textstat.char_count(text),
            #"polysyllables": textstat.polysyllabcount(text),
            "average letter per word": textstat.avg_letter_per_word(text),
            "average sentence length": textstat.avg_sentence_length(text),
            "average sentence per word": textstat.avg_sentence_per_word(text),
            "sentences": textstat.sentence_count(text),
            "flesch reading ease": textstat.flesch_reading_ease(text),
            #"smog index": textstat.smog_index(text),
            #"flesch kincaid grade": textstat.flesch_kincaid_grade(text),
            #"coleman liau index": textstat.coleman_liau_index(text),
            #"gunning fog": textstat.gunning_fog(text),
            "polarity": blob.sentiment.polarity, 
            "subjectivity": blob.sentiment.subjectivity,
            "citation": in_citations
    }

    return main

In [29]:
def extract_markers(text):
    
    # Uncertainty
    ## Hedges
    hedge = r"(?i)\b(((might|ought)(('|’|‘)?((ve)|(n('|’|‘)?t)))?)|about|almost|ambiguous|ambiguously|ambiguousness|apparent|apparently|appear|appeared|appearing|appears|approximate|approximately|around|assume|assumed|assumedly|assumes|assuming|assumption|assumptions|barely|blur|blurring|broadly|certain amount|certain extent|certain level|certain number|chance|depend|depended|depending|depends|doubt|doubtful|doubtfully|essentially|estimate|estimated|fairly|feel|feeling|feels|felt|from my perspective|from our perspective|from our perspectives|generally|guess|guessed|guesses|guessing|(half(-|–|—|―|‒)?((arsed?)|(ass(ed)?)))|hardly|hope|hoped|hopeful|hopefully|hopes|hoping|hypotheses|hypothesis|hypothesize|hypothesized|hypothesizes|hypothesizing|hypothetic|hypothetical|hypothetically|in my opinion|in my view|in our opinion|in our opinions|in our view|in our views|in that view|in this view|incomplete|incompletely|incompleteness|indecisive|indecisively|indecisiveness|indefinite|indefinitely|indefiniteness|indirect|indirectly|indirectness|it is likely|it must be|just|kind of|kinda|kindof|largely|likely|mainly|marginal|marginality|marginally|may|maybe|most|mostly|nearly|occasional|occasionally|often|oughta|partly|perhaps|plausible|plausibly|possible|possibly|potentional|potentionally|presumable|presumably|probable|probably|quite|relatively|roughly|seem|seemed|seeming|seems|some|sometimes|somewhat|sort of|sorta|spose|suggest|suggested|suggesting|suggests|suppose|supposed|supposes|supposing|tend to|tended to|tending to|tends to|they are likely|they must be|to my knowledge|to our knowledge|typical|typically|uncertain|uncertainly|unclear|unclearly|undecided|undecidedly|undecidedness|undetermined|unknowing|unknowingly|unknowingness|unknown|unknownness|unlikelihood|unlikeliness|unlikely|unresolvable|unresolved|unresolvedly|unresolvedness|unsettle|unsettled|unsettledness|unsettlement|unsettles|unsettlingly|unsure|unsurely|unsureness|usually|vague|vaguely|vagueness|varies|vary|varying|varyingly|wonder|wondered|wondering|wonders)\b"

    ## Endophoric markers
    endophoric = r"(?i)(((\b\s*(?!,|$)^I|^we|^We|[\W]I|[\W]we|[\W]We|the author|the writer|the authors|the writers|this paper|this study|current paper|current study|The author|The writer|The authors|The writers|This paper|This study|Current paper|Current study) (((am|are|is|was|were|has|have|had) )?(been )?(((believ|demonstrat|hypothesiz|propos|stat)((e(d|s)?)|ing))|((find|mention|point|say|show|suggest)(ed|ing|s)?)|found|said|shown)))|(according to|as the saying goes|((cit|quot)((e(d|s)?)|ing))|(there (is|was) a saying)))\b"

    # Certainty
    ## Absoluteness
    absolute = r"(?i)\b(must('|’|‘)nt|must('|’|‘)ve|mustn('|’|‘)t|absolute|absolutely|accura[\w]*|all|altogether|always|apparent|assur[\w]*|blatant[\w]*|certain[\w]*|clear|clearly|commit|commitment[\w]*|commits|committed|committing|complete|completed|completely|completes|confidence|confident|confidently|correct[\w]*|defined|definite|definitely|definitive[\w]*|directly|distinct[\w]*|entire[\w]*|especially|essential|ever|every|everybod[\w]*|everyday|everyone[\w]*|everything[\w]*|everytime|everywhere[\w]*|evident[\w]*|exact[\w]*|explicit[\w]*|extremely|fact|facts|factual[\w]*|forever|frankly|fundamental|fundamentalis[\w]*|fundamentally|fundamentals|guarant[\w]*|implicit[\w]*|indeed|inevitab[\w]*|infallib[\w]*|invariab[\w]*|irrefu[\w]*|must|mustnt|mustve|namely|necessari[\w]*|necessary|never|nothing|nowhere|obvious|obviously|particularly|perfect|perfected|perfecting|perfection|perfectly|perfects|positive|positively|positives|positivi[\w]*|precis[\w]*|promise[\w]*|proof|prove[\w]*|proving|pure|purely|pureness|purest|purity|specific|specifically|specifics|sure[\w]*|total|totally|truest|truly|truth[\w]*|unambigu[\w]*|undeniab[\w]*|undoubt[\w]*|unquestion[\w]*|visibly|wholly)\b"

    ## Achievement
    achieve = r"(?i)\b(abilit[\w]*|able|accomplish[\w]*|ace|achievable|achieve[\w]*|achievi[\w]*|acquir[\w]*|acquisition[\w]*|actualiz[\w]*|adequa[\w]*|advanc[\w]*|advantag[\w]*|ahead|ambition|ambitions|ambitious|ambitiously|ambitiousness|attain|attainable|attained|attaining|attainment|attains|authorit[\w]*|award[\w]*|beat|beaten|best|better|bonus[\w]*|burnout[\w]*|capab[\w]*|celebrat[\w]*|challeng[\w]*|champ[\w]*|cheat[\w]*|climb[\w]*|compet[\w]*|confidence|confident|confidently|conquer[\w]*|conscientious[\w]*|create|created|creates|creating|creation|creations|creative|creativity|defeat[\w]*|demot[\w]*|determina[\w]*|determined|diligen[\w]*|domina[\w]*|driven|dropout[\w]*|earn|earned|earning|earns|efficien[\w]*|effort[\w]*|elit[\w]*|emptier|emptiest|emptiness|empty|enabl[\w]*|endeav[\w]*|excel|excellent|excels|fail[\w]*|finaliz[\w]*|first|firsts|flunk[\w]*|founded|founder[\w]*|founding|fulfill[\w]*|gain[\w]*|glory|goal[\w]*|gpa|honor[\w]*|honour[\w]*|ideal[\w]*|importance|improve[\w]*|improving|inadequa[\w]*|incapab[\w]*|incentive[\w]*|incompeten[\w]*|ineffect[\w]*|initiat[\w]*|irresponsible[\w]*|lazier|laziest|lazy|lead|leader[\w]*|leading|leads|limit[\w]*|lose|loser[\w]*|loses|losing|loss[\w]*|lost|mastered|mastery|medal[\w]*|mediocr[\w]*|motiv[\w]*|obtain|obtainable|obtained|obtaining|obtains|opportun[\w]*|overcame|overcome|overcomes|overcoming|overconfiden[\w]*|overtak[\w]*|perfected|perfecting|perfection|perfectly|perfects|persever[\w]*|persist[\w]*|plan|planned|planning|plans|potential[\w]*|powerful|powerless[\w]*|practice|practiced|practices|practicing|prais[\w]*|pride|prize[\w]*|proficien[\w]*|progress|promot[\w]*|proud|prouder|proudest|proudly|purpose[\w]*|queen|quit|quitt[\w]*|rank|ranked|ranking|ranks|recover[\w]*|resolv[\w]*|resourceful[\w]*|reward[\w]*|skill[\w]*|solution[\w]*|solve|solved|solves|solving|strateg[\w]*|striv[\w]*|succeed[\w]*|success|successes|successful|successfully|super|superb[\w]*|surpass[\w]*|surviv[\w]*|team[\w]*|top|tried|tries|triumph[\w]*|try|trying|unable|unbeat[\w]*|unproduc[\w]*|unsuccessful[\w]*|victor[\w]*|win|winn[\w]*|wins|won|work|workabl[\w]*|worked|worker[\w]*|working|works)\b"

    ## Reward
    reward = r"(?i)\b(access[\w]*|accrue[\w]*|accumul[\w]*|achievable|achieve[\w]*|achievi[\w]*|acquir[\w]*|add|added|adding|adds|advanc[\w]*|advantag[\w]*|adventur[\w]*|amass[\w]*|approach|approached|approaches|approaching|award[\w]*|benefit|benefits|best|bet|bets|better|betting|bold|bonus[\w]*|confidence|confident|confidently|crave|craving|dare|dared|dares|daring|desir[\w]*|eager|eagerly|eagerness|earn|earned|earning|earnings|earns|enthus[\w]*|excite|excited|excitedly|excitement|exciting|fearless[\w]*|fulfill[\w]*|gain[\w]*|get|gets|getting|goal[\w]*|good|got|gotten|great|greed[\w]*|invigor[\w]*|jackpot[\w]*|luck|lucky|obtain|obtainable|obtained|obtaining|obtains|opportun[\w]*|optimal[\w]*|optimism|optimistic|perfect|perfected|perfecting|perfection|perfectly|plus|positive|positively|positives|positivi[\w]*|prize[\w]*|profit[\w]*|promot[\w]*|reward[\w]*|score[\w]*|scoring|seize[\w]*|snag[\w]*|steal[\w]*|stole|succeed[\w]*|success|successes|successful|successfully|surpass[\w]*|take|taken|takes|taking|took|triumph[\w]*|victor[\w]*|wager|wagered|wagering|wagers|willing|win|winn[\w]*|wins|won)\b"

    ## Empathics
    empathy = r"(?i)\b((it('|’|‘)s (better|clear|obvious))|best|better|big|bigger|biggest|certainly|clearly|definitely|demonstrate|enchanting|good|great|greater|greatest|he did|he does|I did|I do|in fact|incredible|incredibly|indeed|it did|it does|it is better|it is clear|it is obvious|its better|its clear|its obvious|latest|literally|much|obviously|of course|please|pure|purely|quite|really|she did|she does|they did|they do|undoubtedly|we did|we do|wonderfully|you did|you do)\b"

    ## Evidentials
    evidence = r"(?i)((((?<=^I )|(?<=^we )|(?<=[\W]I )|(?<=[\W]we )|(?<=the author )|(?<=the writer )|(?<=the authors )|(?<=the writers )|(?<=this (paper|study) )|(?<=current (paper|study) ))((am|are|is|was|were|has|have|had) )?(been )?(((believ|demonstrat|hypothesiz|propos|stat)((e(d|s)?)|ing))|((find|mention|point|say|show|suggest)(ed|ing|s)?)|found|said|shown))|(in ((th(e|is) current)|this) (paper|study)))\b"

    # Cues that related to deception
    ### Negations
    negate = r"(?i)\b(ain('|’|‘)t|aren('|’|‘)t|can('|’|‘)t|couldn('|’|‘)t|didn('|’|‘)t|doesn('|’|‘)t|don('|’|‘)t|hadn('|’|‘)t|hasn('|’|‘)t|haven('|’|‘)t|isn('|’|‘)t|must('|’|‘)nt|mustn('|’|‘)t|need('|’|‘)nt|needn('|’|‘)t|ought('|’|‘)nt|oughtn('|’|‘)t|shan('|’|‘)t|should('|’|‘)nt|shouldn('|’|‘)t|wasn('|’|‘)t|weren('|’|‘)t|won('|’|‘)t|wouldn('|’|‘)t|aint|arent|cannot|cant|couldnt|didnt|doesnt|dont|hadnt|hasnt|havent|idk|isnt|mustnt|nah[\w]*|neednt|negat[\w]*|neither|never|no|nobod[\w]*|noes|none|nope|nor|not|nothing|nowhere|np|oughtnt|shant|shouldnt|uh(-|–|—|―|‒)uh|wasnt|werent|without|wont|wouldnt)\b"

    ### Negative emotions
    negemo = r"(?i)(:\(|\):)|(\b(abandon[\w]*|abuse[\w]*|abusi[\w]*|ache[\w]*|aching[\w]*|advers[\w]*|afraid|aggravat[\w]*|aggress|aggressed|aggresses|aggressing|aggression[\w]*|aggressive|aggressively|aggressor[\w]*|agitat[\w]*|agoniz[\w]*|agony|alarm[\w]*|alone|anger[\w]*|angrier|angriest|angry|anguish[\w]*|annoy|annoyed|annoying|annoys|antagoni[\w]*|anxiety|anxious|anxiously|anxiousness|apath[\w]*|appall[\w]*|apprehens[\w]*|argh[\w]*|argu[\w]*|arrogan[\w]*|asham[\w]*|assault[\w]*|asshole[\w]*|attack[\w]*|aversi[\w]*|avoid[\w]*|awful|awkward|bad|badly|bashful[\w]*|bastard[\w]*|battl[\w]*|beaten|bereave[\w]*|bitch[\w]*|bitter|bitterly|bitterness|blam[\w]*|bore[\w]*|boring|bother[\w]*|broke|brutal[\w]*|burden[\w]*|careless[\w]*|cheat[\w]*|coldly|complain[\w]*|concerned|condemn[\w]*|confront[\w]*|confuse|confused|confusedly|confusing|contempt[\w]*|contradic[\w]*|crap|crappy|crazy|cried|cries|critical|critici[\w]*|crude|crudely|cruel|crueler|cruelest|cruelty|crushed|cry|crying|cunt[\w]*|curse|cut|cynic[\w]*|damag[\w]*|damn[\w]*|danger|dangerous|dangerously|dangers|daze[\w]*|decay[\w]*|deceiv[\w]*|deceptive|defeat[\w]*|defect[\w]*|defenc[\w]*|defend[\w]*|defense|defenseless|defensive|defensively|defensiveness|degrad[\w]*|demean[\w]*|demot[\w]*|denial|depress[\w]*|depriv[\w]*|despair[\w]*|desperat[\w]*|despis[\w]*|destroy[\w]*|destruct|destructed|destruction|destructive|destructiveness|devastat[\w]*|devil[\w]*|difficult|difficulties|difficulty|disadvantag[\w]*|disagree[\w]*|disappoint[\w]*|disaster[\w]*|discomfort[\w]*|discourag[\w]*|disgrac[\w]*|disgust[\w]*|dishearten[\w]*|dishonor[\w]*|disillusion[\w]*|dislike|disliked|dislikes|disliking|dismay[\w]*|disreput[\w]*|diss|dissatisf[\w]*|distraught|distress[\w]*|distrust[\w]*|disturb[\w]*|domina[\w]*|doom[\w]*|dork[\w]*|doubt[\w]*|dread[\w]*|dull|dumb|dumbass[\w]*|dumber|dumbest|dummy|dump[\w]*|dwell[\w]*|egotis[\w]*|embarrass[\w]*|emotional|emptier|emptiest|emptiness|empty|enemie[\w]*|enemy[\w]*|enrag[\w]*|envie[\w]*|envious|envy[\w]*|evil|excruciat[\w]*|exhaust[\w]*|fail[\w]*|fake|fatal[\w]*|fatigu[\w]*|fault[\w]*|fear|feared|fearful[\w]*|fearing|fears|feroc[\w]*|feud[\w]*|fiery|fight[\w]*|fired|flunk[\w]*|foe[\w]*|fool|fooled|fooling|foolish|foolishly|fools|forbade|forbid|forbidden|forbidding|forbids|fought|frantic[\w]*|freak[\w]*|fright[\w]*|frustrat[\w]*|fuck|fucked[\w]*|fucker[\w]*|fuckface[\w]*|fuckh[\w]*|fuckin[\w]*|fucks|fucktard|fucktwat[\w]*|fuckwad[\w]*|fume[\w]*|fuming|furious[\w]*|fury|geek[\w]*|gloom|gloomier|gloomiest|gloomily|gloominess|gloomy|goddam[\w]*|good(-|–|—|―|‒)for(-|–|—|―|‒)nothing|gossip[\w]*|grave[\w]*|greed[\w]*|grief|griev[\w]*|grim|grimac[\w]*|grimly|gross|grossed|grosser|grossest|grossing|grossly|grossness|grouch[\w]*|grr[\w]*|grudg[\w]*|guilt|guilt(-|–|—|―|‒)trip[\w]*|guiltier|guiltiest|guilty|hangover[\w]*|harass[\w]*|harm|harmed|harmful|harmfully|harmfulness|harming|harms|harsh|hate|hated|hateful[\w]*|hater[\w]*|hates|hating|hatred|haunted|hazard[\w]*|hazy|heartbreak[\w]*|heartbroke[\w]*|heartless[\w]*|hell|hellish|helpless[\w]*|hesita[\w]*|homesick[\w]*|hopeless[\w]*|horrible|horribly|horrid[\w]*|horror[\w]*|hostil[\w]*|humiliat[\w]*|hungover|hurt[\w]*|idiot[\w]*|ignorable|ignoramus|ignorant|ignore|ignored|ignores|ignoring|immoral[\w]*|impatien[\w]*|impersonal|impolite[\w]*|inadequa[\w]*|incompeten[\w]*|indecis[\w]*|ineffect[\w]*|inferior|inferiority|inhibit[\w]*|insecur[\w]*|insincer[\w]*|insult[\w]*|interrup[\w]*|intimidat[\w]*|irrational[\w]*|irrita[\w]*|isolat[\w]*|jaded|jealous|jealousies|jealously|jealousy|jerk|jerked|jerks|kill[\w]*|lame|lamely|lameness|lamer|lamest|lazier|laziest|lazy|liabilit[\w]*|liar[\w]*|lied|lies|lone|lonelier|loneliest|loneliness|lonely|loner[\w]*|longing[\w]*|lose|loser[\w]*|loses|losing|loss[\w]*|lost|lous[\w]*|loveless|low|lower|lowered|lowering|lowers|lowest|lowli[\w]*|lowly|luckless[\w]*|ludicrous[\w]*|lying|mad|maddening[\w]*|madder|maddest|maniac[\w]*|masochis[\w]*|meaner|meanest|melanchol[\w]*|mess|messier|messiest|messy|miser[\w]*|miss|missed|misses|missing|mistak[\w]*|mock|mocked|mocker[\w]*|mocking|mocks|molest[\w]*|mooch[\w]*|moodi[\w]*|moody|moron[\w]*|mourn[\w]*|murder[\w]*|nag[\w]*|nast[\w]*|needy|neglect[\w]*|nerd[\w]*|nervous|nervously|nervousness|neurotic[\w]*|nightmar[\w]*|numbed|numbing|numbness|numbs|obnoxious[\w]*|obsess[\w]*|offence[\w]*|offend[\w]*|offense|offenses|offensive|outrag[\w]*|overwhelm[\w]*|pain|pained|painf[\w]*|painl[\w]*|pains|panic[\w]*|paranoi[\w]*|pathetic|pathetically|peculiar[\w]*|perv|perver[\w]*|pervy|pessimis[\w]*|pest[\w]*|petrif[\w]*|pettier|pettiest|petty|phobi[\w]*|phony|piss[\w]*|pitiable|pitied|pities|pitiful|pitifully|pity[\w]*|poison[\w]*|poor|poorer|poorest|poorly|poorness[\w]*|powerless[\w]*|prejudic[\w]*|pressur[\w]*|prick[\w]*|problem[\w]*|protest|protested|protesting|protests|puk[\w]*|punish[\w]*|pushy|queas[\w]*|rage[\w]*|raging|rancid[\w]*|rape[\w]*|raping|rapist[\w]*|rebel[\w]*|reek[\w]*|regret[\w]*|reject[\w]*|reluctan[\w]*|remorse[\w]*|repress[\w]*|resent[\w]*|resign[\w]*|restless[\w]*|revenge[\w]*|ridicul[\w]*|rigid|rigidity|rigidly|risk[\w]*|rotten|rude|rudely|ruin[\w]*|sad|sadder|saddest|sadly|sadness|sarcas[\w]*|savage[\w]*|scare|scared|scares|scarier|scariest|scaring|scary|sceptic[\w]*|scream[\w]*|screw[\w]*|selfish[\w]*|serious|seriously|seriousness|severe[\w]*|shake[\w]*|shaki[\w]*|shaky|shame[\w]*|shit[\w]*|shock[\w]*|shook|shy|shyly|shyness|sick|sicken[\w]*|sicker|sickest|sickly|sigh|sighed|sighing|sighs|sin|sinister|sins|skeptic[\w]*|slut[\w]*|smh|smother[\w]*|smug[\w]*|snob[\w]*|sob|sobbed|sobbing|sobs|solemn[\w]*|sorrow[\w]*|sorry|spite[\w]*|stale|stammer[\w]*|stank[\w]*|startl[\w]*|steal[\w]*|stench[\w]*|stink|stinky|strain[\w]*|strange|strangest|stress[\w]*|struggl[\w]*|stubborn[\w]*|stunk|stupid|stupider|stupidest|stupidity|stupidly|stutter[\w]*|suck|sucked|sucker[\w]*|sucks|sucky|suffer|suffered|sufferer[\w]*|suffering|suffers|suspicio[\w]*|tantrum[\w]*|tears|teas[\w]*|tedious|temper|tempers|tense|tensely|tensing|tension[\w]*|terrible|terribly|terrified|terrifies|terrify|terrifying|terror[\w]*|thief|thiev[\w]*|threat[\w]*|timid[\w]*|tortur[\w]*|tough|traged[\w]*|tragic|tragically|trauma[\w]*|trembl[\w]*|trick|tricked|trickier|trickiest|tricks|tricky|trite|trivial|troubl[\w]*|turmoil|twitchy|ugh|uglier|ugliest|ugly|unaccept[\w]*|unattractive|uncertain[\w]*|uncomfortabl[\w]*|uncontrol[\w]*|undesir[\w]*|uneas[\w]*|unfair|unfortunate[\w]*|unfriendly|ungrateful[\w]*|unhapp[\w]*|unimportant|unimpress[\w]*|unkind|unlov[\w]*|unlucky|unpleasant|unprotected|unsafe|unsavory|unsettl[\w]*|unsuccessful[\w]*|unsure[\w]*|unwelcom[\w]*|upset|upsets|upsetting|uptight[\w]*|useless|uselessly|uselessness|vain|vanity|vicious|viciously|viciousness|victim[\w]*|vile|villain[\w]*|violat[\w]*|violence|violent|violently|vomit[\w]*|vulnerab[\w]*|war|warfare[\w]*|warn[\w]*|warred|warring|wars|weak|weaken|weakened|weakening|weakens|weaker|weakest|weakling|weakly|weapon[\w]*|weary|weep[\w]*|weird|weirded|weirder|weirdest|weirdly|weirdness|weirdo|weirdos|weirds|wept|whine[\w]*|whining|whore[\w]*|wicked|wickedly|wimp[\w]*|witch[\w]*|woe[\w]*|worried|worrier|worries|worry|worrying|worse|worsen|worsened|worsening|worsens|worst|worthless|wrong|wrongdoing|wronged|wrongful|wrongly|wrongness|wrongs|yearn[\w]*|yell|yelled|yelling|yells|yuck)\b)"

    ## Different patterns and usage of first- & third-person pronoun
    ### First-person pronoun
    firstpers = r"(?i)\b((I('|’|‘)(d(('|’|‘)ve)?|ll|m|ve))|(let('|’|‘)?s)|(the (author|writer)('|’|‘)?s?)|(we('|’|‘)(d|ll|re|ve))|I|id|idc|idgaf|idk|idve|ikr|ily|im|ima|imma|ive|me|methinks|mine|my|myself|our|ours|ourselves|us|we|weve)\b"

    ### Third-person pronoun
    thirdpers = r"(?i)\b(he('|’|‘)d|he('|’|‘)s|she('|’|‘)d|she('|’|‘)ll|she('|’|‘)s|he|her|hers|herself|hes|him|himself|his|hissel[\w]*|oneself|she|shes)|(it('|’|‘)d|it('|’|‘)ll|it('|’|‘)s|that('|’|‘)d|that('|’|‘)ll|that('|’|‘)s|what('|’|‘)d|what('|’|‘)ll|what('|’|‘)s|who('|’|‘)d|who('|’|‘)ll|who('|’|‘)s|another|anybod[\w]*|anymore|anyone[\w]*|anything|deez|everybod[\w]*|everyday|everyone[\w]*|everything[\w]*|it|itd|itll|its|itself|nobod[\w]*|other|others|somebod[\w]*|someone[\w]*|something[\w]*|somewhere|stuff|that|thatd|thatll|thats|these|thing[\w]*|this|those|what|whatd|whatever|whatll|whats|which|whichever|who|whod|whoever|wholl|whom|whomever|whos|whose|whosever|whoso[\w]*)\b"

    ## Sensory-perceptual words
    sensory = r"(?i)\b(skin('|’|‘)[\w]*|ache[\w]*|aching[\w]*|brush[\w]*|burn|burned|burning|burns|burnt|caress[\w]*|cold|colder|coldest|cool|cooler|coolest|cooling|dried|drier|driest|dry|dryness|feel|feelin|feeling|feelings|feels|felt|finger[\w]*|fire|flexib[\w]*|fragil[\w]*|frail[\w]*|freez[\w]*|froze[\w]*|fuzz[\w]*|goosebump[\w]*|grab[\w]*|grasp[\w]*|grip|gripp[\w]*|grips|hairless[\w]*|hairs|hairy|hand|handful[\w]*|hands|hard|harden|harder|hardest|harsh|heavie[\w]*|heavy|hot|hotter|hottest|hurt[\w]*|itch[\w]*|leather[\w]*|limp[\w]*|loose|loosed|loosely|loosen[\w]*|looser|looses|loosest|lump[\w]*|moist|pain|pained|painf[\w]*|pains|press|pressed|presses|pressing|rough|rougher|roughest|round|rounder|roundest|rub|rubbed|rubbing|rubs|sand|sands|sandy|scratch[\w]*|sensation|sensations|sharp[\w]*|silk[\w]*|skin|smooth[\w]*|soft|softer|softest|softly|squeez[\w]*|stroke[\w]*|stroki[\w]*|thick[\w]*|tight|tighter|tightest|tightly|tingl[\w]*|touch[\w]*|warm|warmed|warmer|warmest|warming|warmly|warms|warmth|weight|weighted|weightless[\w]*|weights|wet|wetter|wettest)\b"
    
    
    # Dummy function
    def clean(data):
        data[:] = (value2 for value in data for value2 in value if value2 != '')
        return data
    
    output = {
    "Hedges": len(clean(re.findall(hedge, text))),
    "Endophoric": len(clean(re.findall(endophoric, text,flags=re.MULTILINE))),
    "Absolutes": len(clean(re.findall(absolute, text))),
    "Achievement": len(clean(re.findall(achieve, text))),
    "Reward": len(clean(re.findall(reward, text))),
    "Empathy": len(clean(re.findall(empathy, text))),
    "Evidence": len(clean(re.findall(evidence, text))),
    "Negate": len(clean(re.findall(negate, text))),
    "Negative emotions": len(clean(re.findall(negemo, text))),
    "First Person": len(clean(re.findall(firstpers, text))),
    "Third Person": len(clean(re.findall(thirdpers, text))),
    "Sensory": len(clean(re.findall(sensory, text)))
    }
    
    return output

In [30]:
def extract_metrics(text):
    
    # The Main p-value
    pvalue_pattern = "(p \< \d*\.\d{1,4}|p \> \d*\.\d{1,4}|p \= \d*\.\d{1,4}|\*{1,4})"
    try:
        pvalue = re.findall(pvalue_pattern, text, re.IGNORECASE) # First hit
    except:
        pvalue = np.nan
    
    # Whether the study uses an Interval or not
    interval_pattern = "(\[\d+\;\d+\]|\[\d+\:\d+\]|\[ \d+\; \d+\]|\[ \d+ \; \d+ \])"
    try:
        interval = re.findall(interval_pattern, text, re.IGNORECASE)
    except:
        interval = 0
    
    # Size of the population in scope
    sample_pattern = "(n\= \d{1,7}|n \= \d{1,7})"
    
    try:
        sample = re.findall(sample_pattern, text, re.IGNORECASE)       
    except:
        sample = np.nan
    
    # Sample Size 2
    try:
        sample2 = samples(text)
    except:
        sample2 = np.nan
    
    # Sample size 3
    try:
        degree_pattern = "F\(\d{1,2}\, \d{1,4}\)"
        outcome = re.findall(degree_pattern, text, re.IGNORECASE)
        sample3 = sample3
    except:
        sample3 = np.nan
       
       
    # Confidence level - 2nd version
    keys = ["confidence interval", "confidence level", "cl:", "cl:", "cl(", "ci(", "cl[", "ci[", "ci{", "cl{"]
    confidence = 0
    
    for key in keys:
        if key in text.lower():
            confidence += 1
            break
    
    return pvalue, interval, sample, sample2, sample3, confidence

In [31]:
def samples(text):
    
    sample = ['respondents','participants', 'undergraduates', 'students', 'under-graduate', 'mturkers', 
              'amazon mturkers', 'amazon mechanical turkers', 'workers', 'members', 'subjects', 'panel',
              'prolific workers', 'prolific partificants', 'crowdflower', 'Qualtrics', 'SurveyMonkey', 'CheckMarket']
    verbs = ["recruited", "applied", "involved", "assigned", "participate", 'participated', 'responded', "employed",
             "asked", "informed", "given", "told", "instructed", "exposed", "shown", "invited"]
    results = []
    
    for item in sample:
        index = text.lower().find(item)
        selection = text[index-30:index+30]
        
        for verb in verbs:
            if verb in selection:
                results.append([int(s) for s in selection.split() if s.isdigit()])
    
    # Final bruteforce approach, using word numbers
    words = ['hundred', 'ten', 'twenty', 'thirty', 'fourty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
    
    for number in words:
        ind = text.lower().find(number)
        select = w2n.word_to_num(text[ind-30:ind+30])
        
        if select is not None:
            results.append(select)
    
    return results

In [32]:
def experimental(text):
    
    expUp = ['ANOVA', 'MANOVA', 'ANCOVA', 'MANCOVA', 'ANALYSIS OF VARIANCE', 'EXPERIMENTAL DESIGN']
    expNorm = ['PROCESS']
    
    experimental = "No"
    
    for item in expUp:
        if item in text.upper():
            experimental = "Yes"
            break
            
    for item in expNorm:
        if item in text:
            experimental = "Yes"
            break
    
    return experimental

In [33]:
main(parsed)

{'Title': 'Aligning benefits with payments: A test of the pattern alignment hypothesis',
 'Author': 'Seigyoung Auh, Eric Shih, Yeosun Yoon',
 'Number of Authors': 3,
 'Publication Date': '2008/10/01',
 'Time Delta (months)': 141,
 'Volume': '18',
 'Issue': '4',
 'First page': '292',
 'Last page': '303',
 'Page length': 11,
 'Reference Diversity': 0.61,
 'Experimental Study': 'No',
 'Avg sentence length': 31.3,
 'Word count': 8254,
 'Flesch Reading Ease': 31.25,
 'Subjectivity': 0.389,
 'Polarity': 0.063,
 'In-text Citations': 36,
 'Hedges': 0.02,
 'Endophoric': 0.01,
 'Absolutes': 0.01,
 'Achievement': 0.07,
 'Reward': 0.12,
 'Empathy': 0.0,
 'Evidence': 0.01,
 'Negate': 0.01,
 'Negative emotions': 0.02,
 'First Person': 0.01,
 'Third Person': 0.05,
 'Sensory': 0.0,
 'P-Value': ['p > .201',
  'p > .122',
  'p < .001',
  'p < .01',
  'p < .01',
  'p < .01',
  'p < .001',
  'p < .05',
  'p < .01',
  '*',
  '*',
  '*',
  '*',
  '*',
  '*',
  '*',
  '*',
  '*',
  '*',
  '*',
  '*',
  '*',


In [16]:
extract_abstract(parsed)

"This article examines consumer perception of transactions whose benefits of consumption and cost of purchase unfold over time. Specifically, the article employs the notion of narrow framing to suggest that, when consumers confront a series of decisions, they tend to make evaluations one at a time, rather than take into consideration the entire portfolio. Consistent with this argument, the authors test the pattern alignment hypothesis, which states that consumers prefer payment schemes that match the pattern of benefits and payments in each period, rather than a scheme that encompasses an entire financing period. In two experiments, the authors find general support for the pattern alignment hypothesis and for the underlying process by which this hypothesis occurs. Specifically, Experiment 2 highlights the mediating role of consumers' perceived fairness in determining the effectiveness of a financing program. The paper concludes with a discussion of the theoretical and practical implica

In [17]:
body = parsed.find('section', attrs = {"class": "article-section article-section__full"})

In [79]:
import os

directory = os.listdir(r'C:\Users\tpensaert\Documents\Thesis\JCP HTML')

count = 0
scraped, abstracts, body = [], [], []
failed = []

for file in directory:
    
    try:
        infile = os.path.join(r'C:\Users\tpensaert\Documents\Thesis\JCP HTML', os.fsdecode(file))
        raw = BeautifulSoup(open(infile, 'rb'), 'html.parser')
        
        # Metadata extraction
        main_dict = main(raw)
        scraped.append(main_dict)
        
        # Abstract & Body extraction
        abstract, body = main_text(raw)
        abstracts.append(abstract)
        
        parse_txt(body)
        
        
    except Exception as e:
        print(file, e)
        failed.append(file)
        pass
    
parse_csv(scraped, "meta_data")
parse_csv(abstracts, "abstracts")

2008childers_jiang@journal of consumer psychology~neurobiological perspectives on the nature of visual and verbal processes.html division by zero
2008hofmann_strack_deutsch@journal of consumer psychology~free to buy - explaining self-control and impulse in consumer behavior.html [Errno 22] Invalid argument: 'Free to buy? Explain.txt'
2009aaker_akutsu@journal of consumer psychology~why do people give - the role of identity in giving.html [Errno 22] Invalid argument: 'Why do people give? .txt'
2009posavac@journal of consumer psychology~on values and phenomenology.html 'NoneType' object has no attribute 'find_all'
2010bublitz_peracchio_block@journal of consumer psychology~why did i eat that - perspectives on food decision making and dietary restraint.html [Errno 22] Invalid argument: 'Why did I eat that? .txt'
2010priester@journal of consumer psychology~the use of structural equation models in consumer psychology - a methodological dialogue on its contributions, cautions, and concerns.htm

In [46]:
def update_meta(log=False):
    
    directory = os.listdir(r'C:\Users\tpensaert\Documents\Thesis\JCP HTML')

    count = 0
    scraped, abstracts, body = [], [], []
    failed = []

    for file in directory:
        
        
        
        try:
            infile = os.path.join(r'C:\Users\tpensaert\Documents\Thesis\JCP HTML', os.fsdecode(file))
            raw = BeautifulSoup(open(infile, 'rb'), 'html.parser')

            # Metadata extraction
            main_dict = main(raw)
            scraped.append(main_dict)

            if log:
                # Abstract & Body extraction
                abstract, body = main_text(raw)
                abstracts.append(abstract)

                parse_txt(body)

        except Exception as e:
            print(file, e)
            failed.append(file)
            pass
         
        count += 1       
        
        
    parse_csv(scraped, "meta_data_long")
    
    if log:
        parse_csv(abstracts, "abstracts")


In [47]:
# Run update on the data generation
update_meta()

2009posavac@journal of consumer psychology~on values and phenomenology.html 'NoneType' object has no attribute 'find_all'
2014motyka_grewal_puccinelli_roggeveen_avnet_daryanto_ruyter_wetzels@journal of consumer psychology~regulatory fit - a meta-analytic synthesis.html 'References' is not in list
2017mikeska_harrison_carlson@journal of consumer psychology~a meta-analysis of parental style and consumer socialization of children.html 'References' is not in list
2019fournier_alvarez@journal of consumer psychology~how brands acquire cultural meaning.html 'References' is not in list
2019krishna@journal of consumer psychology~how brands acquire cultural meaning - introduction.html 'References' is not in list
2019price_coulter@journal of consumer psychology~crossing bridges - assembling culture into brands and brands into consumers_ global local cultural lives.html 'References' is not in list
2020lamberton@journal of consumer psychology~reflective self-control in self-control scholarship - a 

In [62]:
scraped

[{'Title': 'Aligning benefits with payments: A test of the pattern alignment hypothesis',
  'Author': 'Seigyoung Auh, Eric Shih, Yeosun Yoon',
  'Publication Date': '2008/10/01',
  'Volume': '18',
  'Issue': '4',
  'Issn': '1532-7663',
  'First page': '292',
  'Last page': '303',
  'Page length': 11,
  'Reference Diversity': 0.47},
 {'Title': 'Some insights on visual and verbal processing strategies',
  'Author': 'Richard P. Bagozzi',
  'Publication Date': '2008/10/01',
  'Volume': '18',
  'Issue': '4',
  'Issn': '1532-7663',
  'First page': '258',
  'Last page': '263',
  'Page length': 5,
  'Reference Diversity': 1.17},
 {'Title': 'Social reality and the hole in determinism',
  'Author': 'Roy F. Baumeister',
  'Publication Date': '2008/01/01',
  'Volume': '18',
  'Issue': '1',
  'Issn': '1532-7663',
  'First page': '34',
  'Last page': '38',
  'Page length': 4,
  'Reference Diversity': 0.2},
 {'Title': 'Free will in consumer behavior: Self‐control, ego depletion, and choice',
  'Autho

In [238]:
len(scraped)

NameError: name 'scraped' is not defined

In [80]:
failed

['2008childers_jiang@journal of consumer psychology~neurobiological perspectives on the nature of visual and verbal processes.html',
 '2008hofmann_strack_deutsch@journal of consumer psychology~free to buy - explaining self-control and impulse in consumer behavior.html',
 '2009aaker_akutsu@journal of consumer psychology~why do people give - the role of identity in giving.html',
 '2009posavac@journal of consumer psychology~on values and phenomenology.html',
 '2010bublitz_peracchio_block@journal of consumer psychology~why did i eat that - perspectives on food decision making and dietary restraint.html',
 '2010priester@journal of consumer psychology~the use of structural equation models in consumer psychology - a methodological dialogue on its contributions, cautions, and concerns.html',
 '2013goodman_broniarczyk_griffin_mcalister@journal of consumer psychology~help or hinder - when recommendation signage expands consideration sets and heightens decision difficulty.html',
 '2014motyka_grew

In [239]:
len(failed)

NameError: name 'failed' is not defined

In [12]:
directory

['2008auh_shih_yoon@journal of consumer psychology~aligning benefits with payments - a test of the pattern alignment hypothesis.html',
 '2008bagozzi@journal of consumer psychology~some insights on visual and verbal processing strategies.html',
 '2008baumeister@journal of consumer psychology~social reality and the hole in determinism.html',
 '2008baumeister_sparks_stillman_vohs@journal of consumer psychology~free will in consumer behavior - self-control, ego depletion, and choice.html',
 '2008bettman_luce_payne@journal of consumer psychology~preference construction and preference stability - putting the pillow to rest.html',
 '2008childers_jiang@journal of consumer psychology~neurobiological perspectives on the nature of visual and verbal processes.html',
 '2008cho_schwarz@journal of consumer psychology~of great art and untalented artists - effort information and the flexible construction of judgmental heuristics.html',
 '2008cohen_belyavsky_silk@journal of consumer psychology~using vis

In [240]:
extract_abstract(parsed)

"This article examines consumer perception of transactions whose benefits of consumption and cost of purchase unfold over time. Specifically, the article employs the notion of narrow framing to suggest that, when consumers confront a series of decisions, they tend to make evaluations one at a time, rather than take into consideration the entire portfolio. Consistent with this argument, the authors test the pattern alignment hypothesis, which states that consumers prefer payment schemes that match the pattern of benefits and payments in each period, rather than a scheme that encompasses an entire financing period. In two experiments, the authors find general support for the pattern alignment hypothesis and for the underlying process by which this hypothesis occurs. Specifically, Experiment 2 highlights the mediating role of consumers' perceived fairness in determining the effectiveness of a financing program. The paper concludes with a discussion of the theoretical and practical implica

In [63]:
scraped

[{'Title': 'Aligning benefits with payments: A test of the pattern alignment hypothesis',
  'Author': 'Seigyoung Auh, Eric Shih, Yeosun Yoon',
  'Number of Authors': 3,
  'Publication Date': '2008/10/01',
  'Time Delta (months)': 137,
  'Volume': '18',
  'Issue': '4',
  'First page': '292',
  'Last page': '303',
  'Page length': 11,
  'Reference Diversity': 0.61,
  'Avg sentence length': 31.3,
  'Flesch Reading Ease': 31.25,
  'Subjectivity': 0.38860013698848,
  'Polarity': 0.06294921019991245},
 {'Title': 'Some insights on visual and verbal processing strategies',
  'Author': 'Richard P. Bagozzi',
  'Number of Authors': 1,
  'Publication Date': '2008/10/01',
  'Time Delta (months)': 137,
  'Volume': '18',
  'Issue': '4',
  'First page': '258',
  'Last page': '263',
  'Page length': 5,
  'Reference Diversity': 1.17,
  'Avg sentence length': 32.6,
  'Flesch Reading Ease': 29.93,
  'Subjectivity': 0.3720696559538024,
  'Polarity': 0.08583587043038267},
 {'Title': 'Social reality and the 

In [20]:
test_text = extract_body(parsed)

In [24]:
test_text

"Introduction Essentially, consumption of products and services is governed by mental tradeoffs between costs and benefits. Such tradeoffs may occur simultaneously with immediate gratification, as when a consumer completely pays for a product or service at the same time that it is consumed. An example of this would be the purchase of a frankfurter at a ballpark; in such a setting, the mental calculation is relatively straightforward, because the consumer simply evaluates the costs and benefits at the present point in time. However, for durable products or extended service encounters, the benefits can extend over multiple periods, and they are often paid for with credit cards or other financing schemes where payments are spread over time. Making cost–benefit tradeoffs over time has recently attracted the attention of researchers because it provides particular challenges for consumers (Kamleitner & Hölzl, 2006). Consumers are increasingly using credit cards and other financing schemes to

In [129]:
extract_metrics(test_text)

('p < .001', 0, 'n = 1')

In [153]:
test_text

"Introduction Essentially, consumption of products and services is governed by mental tradeoffs between costs and benefits. Such tradeoffs may occur simultaneously with immediate gratification, as when a consumer completely pays for a product or service at the same time that it is consumed. An example of this would be the purchase of a frankfurter at a ballpark; in such a setting, the mental calculation is relatively straightforward, because the consumer simply evaluates the costs and benefits at the present point in time. However, for durable products or extended service encounters, the benefits can extend over multiple periods, and they are often paid for with credit cards or other financing schemes where payments are spread over time. Making cost–benefit tradeoffs over time has recently attracted the attention of researchers because it provides particular challenges for consumers (Kamleitner & Hölzl, 2006). Consumers are increasingly using credit cards and other financing schemes to

In [161]:
test_text.find("p < .001")

29712

In [246]:
newtext = extract_body(testpars)

In [268]:
outcome = re.findall(patroontje, newtext, re.IGNORECASE)[0]

In [273]:
int(outcome.split()[-1].split(')')[0]) + 2

126

In [278]:
degrees(newtext)

126

In [27]:
experimental(test_text)

'No'

In [41]:
# Reading in the data
updated = pd.read_excel('C://Users/tpensaert/Documents/Thesis/extra_titles.xlsx', index=False)
updated

Unnamed: 0,Title
0,With suspicious (but happy) minds: Mood's abil...
1,How successful would a phoneâ€pillow be: Usin...
2,The broad embrace of luxury: Hedonic potential...
3,When thinking is beneficial and when it is not...
4,Proposing and testing the contextual gender in...
5,How much was your shopping basket? Working mem...
6,Consumer responses to brand elimination: An at...
7,Regulatory fit from attributeâ€based versus a...
8,Indulgence as selfâ€reward for prior shopping...
9,Affective forecasting and selfâ€control: Why ...


In [49]:
def text2int(textnum, numwords={}):
        units = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen",
        ]

        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

        scales = ["hundred", "thousand", "million", "billion", "trillion"]

        numwords["and"] = (1, 0)
        for idx, word in enumerate(units):    numwords[word] = (1, idx)
        for idx, word in enumerate(tens):     numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales):   numwords[word] = (10 ** (idx * 3 or 2), 0)

        current = result = 0
        for word in textnum.split():
            if word not in numwords:
                raise Exception("Illegal word: " + word)

            scale, increment = numwords[word]
            current = current * scale + increment
            if scale > 100:
                result += current
                current = 0

        return result + current

In [51]:
directory = os.listdir(r'C:\Users\tpensaert\Documents\Thesis\JCP HTML')
overview = {}
for file in directory:

    for title in titles:
        if title.lower() in file:
            
            infile = os.path.join(r'C:\Users\tpensaert\Documents\Thesis\JCP HTML', os.fsdecode(file))
            raw = BeautifulSoup(open(infile, 'rb'), 'html.parser')

            body = extract_body(raw)
            body2 = body[body.find("Study 1"):]

            sample = ['respondents','participants', 'undergraduates', 'under-graduates', 'students', 'under-graduate', 'mturkers', 
              'amazon mturkers', 'mturks', 'mturkers', 'amazon mechanical turkers', 'workers', 'members', 'subjects', 'panel',
              'prolific workers', 'prolific partificants', 'crowdflower', 'Qualtrics', 'SurveyMonkey', 'CheckMarket']
            verbs = ["recruited", "applied", "involved", "assigned", "participate", 'participated', 'responded', "employed",
             "asked", "informed", "given", "told", "instructed", "exposed", "shown", "invited"]
        
            for item in sample:
                index = body.lower().find(item)
                selection = body[index-100:index+100]

                overview[title] = [int(s) for s in selection.split() if s.isdigit()]
            
overview
        

Exception: Illegal word: Study