In [114]:
# Imports
import pandas as pd
import math
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kushagraseth/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [78]:
# Reading txt Files into Pandas DataFrame
phrases_df = pd.read_csv('Data/stanfordSentimentTreebank/dictionary.txt', 
                         header = None, 
                         sep = '|')
phrases_df.columns = ['phrases', 'phrase ids']
print(phrases_df.head())
print(phrases_df.shape)

labels_df = pd.read_csv('Data/stanfordSentimentTreebank/sentiment_labels.txt', 
                        sep = '|')
print(labels_df.head())
print(labels_df.shape)

       phrases  phrase ids
0            !           0
1          ! '       22935
2         ! ''       18235
3       ! Alas      179257
4  ! Brilliant       22936
(239232, 2)
   phrase ids  sentiment values
0           0           0.50000
1           1           0.50000
2           2           0.44444
3           3           0.50000
4           4           0.42708
(239232, 2)


In [79]:
# Merged DataFrame
df = pd.merge(phrases_df, 
              labels_df, 
              on = ['phrase ids'])
df.sort_values(by='phrase ids', 
               inplace = True)
df.set_index('phrase ids', 
             inplace = True)
print(df.head())
print(df.shape)

                        phrases  sentiment values
phrase ids                                       
0                             !           0.50000
1                             '           0.50000
2                           ' (           0.44444
3             ' ( the cockettes           0.50000
4           ' ( the cockettes )           0.42708
(239232, 2)


In [121]:
# Raw Scores for phrases
df['raw scores'] = df['sentiment values'].apply(lambda row: math.ceil(row * 25))
print(df.head())

                        phrases  sentiment values  raw scores
phrase ids                                                   
0                             !           0.50000          13
1                             '           0.50000          13
2                           ' (           0.44444          12
3             ' ( the cockettes           0.50000          13
4           ' ( the cockettes )           0.42708          11


In [144]:
# Print Phrases with score 1 to 20
ott_list = []
for idx, row in df.iterrows():
    if(row['raw scores'] >= 10 and row['raw scores'] <= 20):
        ott_list.append((idx, row['phrases']))
for i in range(50):
    print(ott_list[i])

(0, '!')
(1, "'")
(2, "' (")
(3, "' ( the cockettes")
(4, "' ( the cockettes )")
(5, "' ( the cockettes ) provides a window into a subculture hell-bent on expressing itself in every way imaginable")
(6, "' ( the cockettes ) provides a window into a subculture hell-bent on expressing itself in every way imaginable .")
(7, "' ( the cockettes ) provides a window into a subculture hell-bent on expressing itself in every way imaginable . '")
(9, "' a nightmare on elm street '")
(10, "' a nightmare on elm street ' or")
(11, "' a nightmare on elm street ' or `")
(16, "' a perfect family film , ' because it 's about family")
(17, "''")
(18, "'ll")
(19, "'ll like it")
(20, "'s")
(21, "'s a certain style and wit to the dialogue")
(24, "'s a visual delight and a decent popcorn adventure")
(25, "'s a visual delight and a decent popcorn adventure ,")
(26, "'s a visual delight and a decent popcorn adventure , as long as you do n't try to look too deep into the story")
(27, "'s about family")
(28, "'

In [145]:
# Print Phrases with score 1 to 5 and num of tokens atleast 10
otf_list = []
for idx, row in df.iterrows():
    if((row['raw scores'] > 1 and row['raw scores'] < 5)):
        phrase_list = row['phrases'].split()
        LEN_PHRASE = len(phrase_list)
        if (LEN_PHRASE >= 10):
            otf_list.append((idx, row['phrases'], LEN_PHRASE))
for i in range(50):
    print(otf_list[i])

(342, "adults will certainly want to spend their time in the theater thinking up grocery lists and ways to tell their kids how not to act like pinocchio . as for children , they wo n't enjoy the movie at all .", 41)
(550, 'be a movie that ends up slapping its target audience in the face by shooting itself in the foot', 19)
(655, 'but its abrupt drop in iq points as it races to the finish line proves simply too discouraging to let slide', 21)
(1012, "for all its technical virtuosity , the film is so mired in juvenile and near-xenophobic pedagogy that it 's enough to make one pine for the day when godard can no longer handle the rigors of filmmaking .", 38)
(1388, 'is nothing funny in this every-joke-has - been-told-a - thousand-times - before movie', 13)
(1431, "it 's all pretty tame . the most offensive thing about the movie is that hollywood expects people to pay to see it", 23)
(1435, "it 's enough to make one pine for the day when godard can no longer handle the rigors of filmmaking

In [146]:
# Print Phrases that contain any of the following words: “love”, “like”, “hate”. Exclude phrases where the word “like” is not a verb.
llh_list = []
for idx, row in df.iterrows():
    phrase_list = row['phrases'].split()
    pos_tags_list = nltk.pos_tag(phrase_list)
    
    for i in range(len(pos_tags_list)):
        if(pos_tags_list[i][0] == 'love' or pos_tags_list[i][0] == 'hate' or (pos_tags_list[i][0] == 'like' and pos_tags_list[i][1].startswith('VB'))):
            llh_list.append((idx, row['phrases']))
for i in range(50):
    print(llh_list[i])

(19, "'ll like it")
(196, 'a little too smugly superior to like')
(199, 'a moody , multi-dimensional love story and sci-fi mystery')
(200, 'a moody , multi-dimensional love story and sci-fi mystery ,')
(201, 'a moody , multi-dimensional love story and sci-fi mystery , solaris')
(202, 'a moody , multi-dimensional love story and sci-fi mystery , solaris is a thought-provoking , haunting film that allows the seeds of the imagination to germinate')
(203, 'a moody , multi-dimensional love story and sci-fi mystery , solaris is a thought-provoking , haunting film that allows the seeds of the imagination to germinate .')
(651, 'but a little too smugly superior to like')
(1260, 'if you love him')
(1261, 'if you love him ,')
(1262, 'if you love him , you')
(1263, "if you love him , you 'll like it")
(1263, "if you love him , you 'll like it")
(1265, 'ignore but a little too smugly superior to like')
(1399, "is so de palma . if you love him , you 'll like it .")
(1399, "is so de palma . if you lo