### LIWC implementation
- https://pypi.org/project/liwc-analysis/
- https://github.com/dfederschmidt/pyliwc
- https://radimrehurek.com/gensim/

In [76]:
import sklearn
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import gensim
import numpy as np

import re
from collections import Counter
from pyliwc.core import LIWC

import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TK\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# !pip install -e git+https://github.com/dfederschmidt/pyliwc#egg=pyliwc

In [4]:
liwc = LIWC("LIWC2015_English_Flat.dic")

### Pre-example to have a try

In [5]:
# fetch the dataset and put it in a dataframe
news_data = fetch_20newsgroups(shuffle=True, random_state=42)
df_news = pd.DataFrame(news_data["data"], columns=["text"])
print(df_news.shape)

(11314, 1)


In [6]:
df_news.head(5)

Unnamed: 0,text
0,From: lerxst@wam.umd.edu (where's my thing)\nS...
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...


In [7]:
%%time
# run without multiprocessing
scores = liwc.process_df(df_news, "text")

Wall time: 1min 9s


In [8]:
scores.head(5)

Unnamed: 0,achiev,adj,adverb,affect,affiliation,anger,anx,article,assent,auxverb,...,social,space,swear,tentat,they,time,verb,we,work,you
0,,2.5,2.5,1.666667,0.833333,,,7.5,,10.0,...,7.5,9.166667,,5.0,,4.166667,17.5,,3.333333,2.5
1,1.639344,4.098361,1.639344,4.918033,1.639344,,,4.918033,,5.737705,...,12.295082,2.459016,,0.819672,1.639344,10.655738,8.196721,,2.459016,1.639344
2,0.615385,5.538462,6.769231,4.923077,0.307692,0.615385,,10.461538,0.307692,9.230769,...,6.153846,5.846154,,7.076923,,4.923077,16.615385,,2.153846,0.307692
3,,5.714286,4.761905,3.809524,,,0.952381,4.761905,,3.809524,...,2.857143,3.809524,,2.857143,,0.952381,11.428571,,6.666667,0.952381
4,,5.590062,4.347826,4.968944,1.242236,0.621118,,7.453416,1.242236,9.31677,...,5.590062,3.10559,0.621118,4.968944,1.242236,4.347826,14.285714,0.621118,1.863354,0.621118


### Now set up our own LIWC categories set 

In [9]:
liwcPath = 'LIWC2015_English_Flat.dic'
LIWC_file = open(liwcPath, 'r') # LIWC dictionary

# set up the category index-word dictionary
catNames = {}
LIWC_file.readline() #skips first '%' line
line = LIWC_file.readline()
lookup = []
while '%' not in line:
    keyval = line.split('\t')
    key = keyval[1].strip()
    value = keyval[0]
    catNames[key] = value
    line = LIWC_file.readline()

In [10]:
def word_collection(category_):
    flexicon = open('LIWC2015_English_Flat.dic', encoding='utf-8')
    # read all LIWC words from file
    wordlines = [line.strip() for line in flexicon]
    # each line has a word or a stem followed by * and numbers of the word classes it is in
    # word class 126 is positive emotion and 127 is negative emotion
    result = []
    category_index = catNames[category_]
    for line in wordlines:
        if not line == '':
            items = line.split()
            word = items[0]
            classes = items[1:]
            for c in classes:
                if c == category_index:
                    if '(' not in word and ')' not in word:
                        result.append(re.compile(word))
    return result
# test sample
print(word_collection('posemo')[:30])

[re.compile('accept'), re.compile('accepta*'), re.compile('accepted'), re.compile('accepting'), re.compile('accepts'), re.compile('active'), re.compile('actively'), re.compile('admir*'), re.compile('ador*'), re.compile('advantag*'), re.compile('adventur*'), re.compile('affection*'), re.compile('agree'), re.compile('agreeable'), re.compile('agreeableness'), re.compile('agreeably'), re.compile('agreed'), re.compile('agreeing'), re.compile('agreement*'), re.compile('agrees'), re.compile('alright*'), re.compile('amaze*'), re.compile('amazing'), re.compile('amazingly'), re.compile('amor*'), re.compile('amus*'), re.compile('aok'), re.compile('appreciat*'), re.compile('approv*'), re.compile('assur*')]


In [11]:
# check wether a given word would be consistent with the cluster
p = re.compile('abstain*')
print(p.match('abstainy') == None)

False


### Categories to consider
- Positive emotion, Negative emotion, Female references, Male references
- Insight, Tentative, Certainty, Differentiation, Risk, Future focus

In [12]:
# give topics list
topics = ['posemo', 'negemo', 'female', 'male', 'insight', 
          'tentat', 'certain', 'differ', 'risk', 'focusfuture']
# set up a sub-dictionary for these topics
sub_dic = {}
for topic in topics:
    sub_dic[topic] = word_collection(topic)
# test case
print(sub_dic['differ'][:20])

[re.compile('actually'), re.compile('adjust*'), re.compile('against'), re.compile("ain't"), re.compile('aint'), re.compile('alternativ*'), re.compile('although'), re.compile('apart'), re.compile("aren't"), re.compile('arent'), re.compile('but'), re.compile("can't"), re.compile('cannot'), re.compile('cant'), re.compile('despite'), re.compile("didn't"), re.compile('didnt'), re.compile('differ'), re.compile('differed'), re.compile('difference*')]


### Go back to our previous comments collection and check 

In [13]:
boxOffice_Allfilled = pd.read_csv("boxOffice_Allfilled.csv")
Name_list = boxOffice_Allfilled.columns.tolist()
Name_list = [item for item in Name_list if 'Unnamed' not in item]
# kick out Unnamed columns for simplicity
boxOffice_Allfilled = boxOffice_Allfilled[Name_list]
boxOffice_Allfilled.head(5)

Unnamed: 0,year,bomrank,remove,bomtitle,imdblink,bomlink,studio,totalusgross$,totaltheater,openingusgross$,openingtheaters,opendate,closedate,critic ratings_avg,critic rating_var,user_avg,user_var,critic_ratings_comments,critic_ratings_list
0,2004.0,1.0,,Shrek 2,http://www.imdb.com/title/tt0298148/,http://www.boxofficemojo.com/movies/?id=shrek2...,DW,441226247.0,4223.0,108037878.0,4163.0,5/19,11/25,75.0,94.4,7.2,2.3796,"Lightning strikes twice, but not as brilliant...",90.0||88.0||80.0||80.0||80.0||75.0||75.0||70.0...
1,2004.0,2.0,,Spider-Man 2,http://www.imdb.com/title/tt0316654/,http://www.boxofficemojo.com/movies/?id=spider...,Sony,373585825.0,4166.0,88156227.0,4152.0,6/30,12/19,83.0,115.2,7.3,2.86249,The pleasure is doubled in Spider-Man 2. Crac...,100.0||100.0||100.0||91.0||90.0||90.0||88.0||8...
2,2004.0,3.0,,The Passion of the Christ,http://www.imdb.com/title/tt0335345/,http://www.boxofficemojo.com/movies/?id=passio...,NM,370274604.0,3408.0,83848082.0,3043.0,2/25,7/29,47.0,663.9,7.2,6.3492,"This is not a sermon or a homily, but a visua...",100.0||80.0||80.0||75.0||63.0||63.0||50.0||50....
3,2004.0,4.0,,Meet the Fockers,http://www.imdb.com/title/tt0290002/,http://www.boxofficemojo.com/movies/?id=meetth...,Uni.,279261160.0,3554.0,46120980.0,3518.0,12/22,6/16,41.0,298.8,6.3,2.7872,One of those relatively rare comedies that's ...,70.0||63.0||60.0||60.0||50.0||50.0||50.0||40.0...
4,2004.0,5.0,,The Incredibles,http://www.imdb.com/title/tt0317705/,http://www.boxofficemojo.com/movies/?id=incred...,BV,261441092.0,3933.0,70467623.0,3933.0,11/5,4/14,90.0,100.0,8.0,2.395,Pixar again hitches top-notch storytelling to...,100.0||100.0||100.0||100.0||100.0||90.0||90.0|...


In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TK\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [73]:
# split the comment column to re-visit comment one by one
test_case = boxOffice_Allfilled['critic_ratings_comments'][0].split("||")[1]
# remove stop words example
tokenized = test_case.replace(".", "").split()
tokenized = [item for item in tokenized if item not in stop_words]
print(test_case)
print(tokenized)

# check the words frequency of this comment in above 10 topics
def count_topic_frequency(text_list, topic_):
    # make use of sub_dic here
    # make use of the tokenized comments
    frequency = 0
    for word in text_list:
        # check frequency in certain topic
        for p in sub_dic[topic_]:
            if p.match(word) !=None:
                # print(word)
                frequency += 1
                break
    return frequency

# one test case for positive emotion
print(count_topic_frequency(tokenized, topics[0]))

# count the frequency for all the topic
# and return frequency list
def count_frequency(text_, topic_list):
    frequency_list = [0]*len(topic_list)
    for i in range(len(topic_list)):
        frequency = count_topic_frequency(text_, topic_list[i])
        frequency_list[i] = str(frequency)
    return frequency_list

# one text case for the first comment
print(count_frequency(tokenized, topics))

 So gorgeously animated and so thoroughly entertaining for all ages that only an ogre would complain it's not quite as fresh as the original. 
['So', 'gorgeously', 'animated', 'thoroughly', 'entertaining', 'ages', 'ogre', 'would', 'complain', 'quite', 'fresh', 'original']
4
['4', '2', '1', '0', '0', '2', '0', '1', '0', '0']


In [74]:
# Now go through all the comments for the first movie to test
def count_frequency_AllComments(comment_para, topic_list):
    # make use of boxOffice_Allfilled comments column
    # to fill comment_para
    split = comment_para.split("||")
    # initialize for all the splitted comments 
    result = [np.nan]*len(split)
    for i in range(len(split)):
        # need to tokenize comment string first
        tokenized = split[i].replace(".", "").split()
        tokenized = [item for item in tokenized if item not in stop_words]
        result[i] = count_frequency(tokenized, topics)
    return result

# one test case
print(count_frequency_AllComments(boxOffice_Allfilled['critic_ratings_comments'][0], topics))

[['7', '2', '3', '3', '0', '1', '0', '1', '0', '0'], ['4', '2', '1', '0', '0', '2', '0', '1', '0', '0'], ['5', '1', '1', '0', '0', '0', '1', '0', '0', '0'], ['1', '2', '0', '0', '1', '1', '0', '0', '0', '0'], ['5', '2', '2', '1', '0', '0', '0', '0', '1', '0'], ['2', '0', '0', '0', '0', '0', '0', '0', '0', '1'], ['4', '1', '3', '2', '2', '1', '2', '0', '0', '0'], ['6', '2', '1', '2', '1', '1', '1', '0', '1', '0'], ['1', '1', '0', '0', '2', '1', '1', '0', '0', '0'], ['2', '3', '0', '0', '0', '0', '0', '0', '0', '0']]


In [None]:
# Now update the table by the frequency nested list
boxOffice_Allfilled['critic_comment_topic_frequency'] = np.nan

# over 2400 rows in total
for i in range(len(boxOffice_Allfilled)):
    # need to check available comments para first
    if str(boxOffice_Allfilled['critic_ratings_comments'][i]) != "nan":
        result = count_frequency_AllComments(boxOffice_Allfilled['critic_ratings_comments'][i], topics)
        # flatten = [str(item) for sublist in list for item in sublist]
        boxOffice_Allfilled['critic_comment_topic_frequency'][i] = result
    # process
    if i%200 == 0:
        print(i)

In [89]:
boxOffice_Allfilled.head(5)

Unnamed: 0,year,bomrank,remove,bomtitle,imdblink,bomlink,studio,totalusgross$,totaltheater,openingusgross$,openingtheaters,opendate,closedate,critic ratings_avg,critic rating_var,user_avg,user_var,critic_ratings_comments,critic_ratings_list,critic_comment_topic_frequency
0,2004.0,1.0,,Shrek 2,http://www.imdb.com/title/tt0298148/,http://www.boxofficemojo.com/movies/?id=shrek2...,DW,441226247.0,4223.0,108037878.0,4163.0,5/19,11/25,75.0,94.4,7.2,2.3796,"Lightning strikes twice, but not as brilliant...",90.0||88.0||80.0||80.0||80.0||75.0||75.0||70.0...,"[[7, 2, 3, 3, 0, 1, 0, 1, 0, 0], [4, 2, 1, 0, ..."
1,2004.0,2.0,,Spider-Man 2,http://www.imdb.com/title/tt0316654/,http://www.boxofficemojo.com/movies/?id=spider...,Sony,373585825.0,4166.0,88156227.0,4152.0,6/30,12/19,83.0,115.2,7.3,2.86249,The pleasure is doubled in Spider-Man 2. Crac...,100.0||100.0||100.0||91.0||90.0||90.0||88.0||8...,"[[8, 6, 2, 0, 0, 2, 2, 0, 2, 0], [3, 1, 0, 0, ..."
2,2004.0,3.0,,The Passion of the Christ,http://www.imdb.com/title/tt0335345/,http://www.boxofficemojo.com/movies/?id=passio...,NM,370274604.0,3408.0,83848082.0,3043.0,2/25,7/29,47.0,663.9,7.2,6.3492,"This is not a sermon or a homily, but a visua...",100.0||80.0||80.0||75.0||63.0||63.0||50.0||50....,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 2, 0, 0, ..."
3,2004.0,4.0,,Meet the Fockers,http://www.imdb.com/title/tt0290002/,http://www.boxofficemojo.com/movies/?id=meetth...,Uni.,279261160.0,3554.0,46120980.0,3518.0,12/22,6/16,41.0,298.8,6.3,2.7872,One of those relatively rare comedies that's ...,70.0||63.0||60.0||60.0||50.0||50.0||50.0||40.0...,"[[4, 1, 0, 0, 1, 0, 0, 0, 0, 0], [2, 0, 1, 0, ..."
4,2004.0,5.0,,The Incredibles,http://www.imdb.com/title/tt0317705/,http://www.boxofficemojo.com/movies/?id=incred...,BV,261441092.0,3933.0,70467623.0,3933.0,11/5,4/14,90.0,100.0,8.0,2.395,Pixar again hitches top-notch storytelling to...,100.0||100.0||100.0||100.0||100.0||90.0||90.0|...,"[[2, 0, 0, 0, 0, 0, 0, 0, 0, 0], [3, 1, 0, 1, ..."


In [90]:
boxOffice_Allfilled.to_excel("comments_frequency.xlsx")