In [120]:
import nltk

# movie review sentences
from nltk.corpus import sentence_polarity
import random
import json
import csv
import nltk
from nltk.book import *
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.collocations import *
from nltk.collocations import BigramCollocationFinder
import re
from nltk.corpus import treebank

In [161]:
# get the 1500 most frequently appearing keywords in the corpus
word_items = all_words.most_common(1500)
word_features = [word for (word,count) in word_items]

In [122]:
## Pre-processing: repeat the setup of the movie review sentences for classification
# for each sentence(document), get its words and category (positive/negative)
documents = [(sent, cat) for cat in sentence_polarity.categories()
    for sent in sentence_polarity.sents(categories=cat)]


random.shuffle(documents)

In [123]:
# get all words from all movie_reviews and put into a frequency distribution
#   note lowercase, but no stemming or stopwords
all_words_list = [word for (sent,cat) in documents for word in sent]
all_words = nltk.FreqDist(all_words_list)


In [124]:
# define features (keywords) of a document for a BOW/unigram baseline
# each feature is 'contains(keyword)' and is true or false depending
# on whether that keyword is in the document
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    return features

In [125]:
# get features sets for a document, including keyword features and category feature
featuresets = [(document_features(d, word_features), c) for (d, c) in documents]

In [164]:
# training using naive Baysian classifier, testing set is 10% of data
train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [165]:
# evaluate the accuracy of the classifier
nltk.classify.accuracy(classifier, test_set)
# the accuracy result may vary since we randomized the documents

0.732

---

In [185]:
# set up for using bigrams
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [186]:
# create the bigram finder on all the words in sequence
print(all_words_list[:50])
finder = BigramCollocationFinder.from_words(all_words_list)

['an', 'undistinguished', 'attempt', 'to', 'make', 'a', 'classic', 'theater', 'piece', 'cinematic', '.', 'effective', 'in', 'all', 'its', 'aspects', ',', 'margarita', 'happy', 'hour', 'represents', 'an', 'auspicious', 'feature', 'debut', 'for', 'chaiken', '.', 'this', 'strenuously', 'unfunny', 'showtime', 'deserves', 'the', 'hook', '.', 'despite', 'some', 'strong', 'performances', ',', 'never', 'rises', 'above', 'the', 'level', 'of', 'a', 'telanovela', '.']


In [187]:
# define the top 500 bigrams using the chi squared measure
bigram_features = finder.nbest(bigram_measures.chi_sq, 500)
print(bigram_features[:50])

[("''independent", "film''"), ("'60s-homage", 'pokepie'), ("'[the", 'cockettes]'), ("'ace", "ventura'"), ("'alternate", "reality'"), ("'aunque", 'recurre'), ("'black", "culture'"), ("'blue", "crush'"), ("'chan", "moment'"), ("'chick", "flicks'"), ("'date", "movie'"), ("'ethnic", 'cleansing'), ("'face", "value'"), ("'fully", "experienced'"), ("'jason", "x'"), ("'juvenile", "delinquent'"), ("'laugh", "therapy'"), ("'masterpiece", "theatre'"), ("'nicholas", "nickleby'"), ("'old", "neighborhood'"), ("'opening", "up'"), ("'rare", "birds'"), ("'sacre", 'bleu'), ("'science", "fiction'"), ("'shindler's", "list'"), ("'snow", "dogs'"), ("'some", "body'"), ("'special", "effects'"), ("'terrible", "filmmaking'"), ("'time", "waster'"), ("'true", "story'"), ("'unfaithful'", 'cheats'), ("'very", "sneaky'"), ("'we're", '-doing-it-for'), ("'who's", "who'"), ('-after', 'spangle'), ('-as-it-', 'thinks-it-is'), ('-as-nasty', '-as-it-'), ('-doing-it-for', "-the-cash'"), ('10-course', 'banquet'), ('10-year',

In [188]:
# examples to demonstrate the bigram feature function definition
sent = ['Arthur','carefully','rode','the','brown','horse','around','the','castle']
sentbigrams = list(nltk.bigrams(sent))
print(sentbigrams)

[('Arthur', 'carefully'), ('carefully', 'rode'), ('rode', 'the'), ('the', 'brown'), ('brown', 'horse'), ('horse', 'around'), ('around', 'the'), ('the', 'castle')]


In [189]:
# for a single bigram, test if it's in the sentence bigrams and format the feature name
bigram = ('brown','horse')
print(bigram in sentbigrams)
print('B_{}_{}'.format(bigram[0], bigram[1]))

True
B_brown_horse


In [190]:
# define features that include words as before
# add the most frequent significant bigrams
# this function takes the list of words in a document as an argument and returns a feature dictionary
# it depends on the variables word_features and bigram_features
def bigram_document_features(document, word_features, bigram_features):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)
    return features

In [191]:
# use this function to create feature sets for all sentences
bigram_featuresets = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in documents]

In [192]:
# number of features for document 0
print(len(bigram_featuresets[0][0].keys()))

2000


In [193]:
# features in document 0
print(bigram_featuresets[0][0])

{'V_.': True, 'V_the': False, 'V_,': False, 'V_a': True, 'V_and': False, 'V_of': False, 'V_to': True, 'V_is': False, 'V_in': False, 'V_that': False, 'V_it': False, 'V_as': False, 'V_but': False, 'V_with': False, 'V_film': False, 'V_this': False, 'V_for': False, 'V_its': False, 'V_an': True, 'V_movie': False, "V_it's": False, 'V_be': False, 'V_on': False, 'V_you': False, 'V_not': False, 'V_by': False, 'V_about': False, 'V_one': False, 'V_more': False, 'V_like': False, 'V_has': False, 'V_are': False, 'V_at': False, 'V_from': False, 'V_than': False, 'V_"': False, 'V_all': False, 'V_--': False, 'V_his': False, 'V_have': False, 'V_so': False, 'V_if': False, 'V_or': False, 'V_story': False, 'V_i': False, 'V_too': False, 'V_just': False, 'V_who': False, 'V_into': False, 'V_what': False, 'V_most': False, 'V_out': False, 'V_no': False, 'V_much': False, 'V_even': False, 'V_good': False, 'V_up': False, 'V_will': False, 'V_comedy': False, 'V_time': False, 'V_can': False, 'V_some': False, 'V_charac

In [194]:
# train a classifier and report accuracy
train_set, test_set = bigram_featuresets[1000:], bigram_featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.735

---

In [178]:
###  Task 2: POS tag counts
# using the default pos tagger in NLTK (the Stanford tagger)
print(sent)
print(nltk.pos_tag(sent))

['Arthur', 'carefully', 'rode', 'the', 'brown', 'horse', 'around', 'the', 'castle']
[('Arthur', 'NNP'), ('carefully', 'RB'), ('rode', 'VBD'), ('the', 'DT'), ('brown', 'JJ'), ('horse', 'NN'), ('around', 'IN'), ('the', 'DT'), ('castle', 'NN')]


In [179]:
# this function takes a document list of words and returns a feature dictionary
# it runs the default pos tagger (the Stanford tagger) on the document
#   and counts 4 types of pos tags to use as features
def POS_features(document, word_features):
    document_words = set(document)
    tagged_words = nltk.pos_tag(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    numNoun = 0
    numVerb = 0
    numAdj = 0
    numAdverb = 0
    for (word, tag) in tagged_words:
        if tag.startswith('N'): numNoun += 1
        if tag.startswith('V'): numVerb += 1
        if tag.startswith('J'): numAdj += 1
        if tag.startswith('R'): numAdverb += 1
    features['nouns'] = numNoun
    features['verbs'] = numVerb
    features['adjectives'] = numAdj
    features['adverbs'] = numAdverb
    return features

In [180]:
# define feature sets using this function
POS_featuresets = [(POS_features(d, word_features), c) for (d, c) in documents]
# number of features for document 0
print(len(POS_featuresets[0][0].keys()))

1504


In [181]:
# the first sentence
print(documents[0])
# the pos tag features for this sentence
print('num nouns', POS_featuresets[0][0]['nouns'])
print('num verbs', POS_featuresets[0][0]['verbs'])
print('num adjectives', POS_featuresets[0][0]['adjectives'])
print('num adverbs', POS_featuresets[0][0]['adverbs'])

(['an', 'undistinguished', 'attempt', 'to', 'make', 'a', 'classic', 'theater', 'piece', 'cinematic', '.'], 'neg')
num nouns 4
num verbs 1
num adjectives 2
num adverbs 0


In [182]:
# train and test the classifier
train_set, test_set = POS_featuresets[1000:], POS_featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.727

---

In [195]:
## Task 3: cross-validation ##
# this function takes the number of folds, the feature sets
# it iterates over the folds, using different sections for training and testing in turn
#   it prints the accuracy for each fold and the average accuracy at the end
def cross_validation_accuracy(num_folds, featuresets):
    subset_size = int(len(featuresets)/num_folds)
    print('Each fold size:', subset_size)
    accuracy_list = []
    # iterate over the folds
    for i in range(num_folds):
        test_this_round = featuresets[(i*subset_size):][:subset_size]
        train_this_round = featuresets[:(i*subset_size)] + featuresets[((i+1)*subset_size):]
        # train using train_this_round
        classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        # evaluate against test_this_round and save accuracy
        accuracy_this_round = nltk.classify.accuracy(classifier, test_this_round)
        print (i, accuracy_this_round)
        accuracy_list.append(accuracy_this_round)
    # find mean accuracy over all rounds
    print ('mean accuracy', sum(accuracy_list) / num_folds)

In [197]:
# perform the cross-validation on the featuresets with word features and generate accuracy
num_folds = 5
cross_validation_accuracy(num_folds, bigram_featuresets)

Each fold size: 2132
0 0.7293621013133208
1 0.7387429643527205
2 0.7120075046904315
3 0.75
4 0.7439024390243902
mean accuracy 0.7348030018761726


In [198]:
num_folds = 5
cross_validation_accuracy(num_folds, featuresets)

Each fold size: 2132
0 0.7279549718574109
1 0.7401500938086304
2 0.7115384615384616
3 0.7485928705440901
4 0.7457786116322702
mean accuracy 0.7348030018761726


In [199]:
num_folds = 5
cross_validation_accuracy(num_folds, POS_featuresets)


Each fold size: 2132
0 0.7349906191369606
1 0.7373358348968105
2 0.7110694183864915
3 0.7514071294559099
4 0.7443714821763602
mean accuracy 0.7358348968105066


In [145]:
## Task 4: other evaluation measures:  confusion matrix, precision, recall, F1 ##

goldlist = []
predictedlist = []
for (features, label) in test_set:
    	goldlist.append(label)
    	predictedlist.append(classifier.classify(features))

In [146]:
# look at the first 30 examples
print(goldlist[:30])
print(predictedlist[:30])

['neg', 'pos', 'neg', 'neg', 'neg', 'pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'neg', 'pos', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'neg', 'pos', 'neg']
['pos', 'pos', 'neg', 'pos', 'neg', 'pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'pos', 'neg', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'neg', 'neg', 'pos', 'pos']


In [147]:
cm = nltk.ConfusionMatrix(goldlist, predictedlist)
print(cm.pretty_format(sort_by_count=True, truncate=9))

    |   n   p |
    |   e   o |
    |   g   s |
----+---------+
neg |<368>136 |
pos | 137<359>|
----+---------+
(row = reference; col = test)



In [148]:
# or show the results as percentages
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

    |      n      p |
    |      e      o |
    |      g      s |
----+---------------+
neg | <36.8%> 13.6% |
pos |  13.7% <35.9%>|
----+---------------+
(row = reference; col = test)



In [149]:
# Function to compute precision, recall and F1 for each label
#  and for any number of labels
# Input: list of gold labels, list of predicted labels (in same order)
# Output:  prints precision, recall and F1 for each label
def eval_measures(gold, predicted):
    # get a list of labels
    labels = list(set(gold))
    # these lists have values for each label
    recall_list = []
    precision_list = []
    F1_list = []
    for lab in labels:
        # for each label, compare gold and predicted lists and compute values
        TP = FP = FN = TN = 0
        for i, val in enumerate(gold):
            if val == lab and predicted[i] == lab:  TP += 1
            if val == lab and predicted[i] != lab:  FN += 1
            if val != lab and predicted[i] == lab:  FP += 1
            if val != lab and predicted[i] != lab:  TN += 1
        # use these to compute recall, precision, F1
        recall = TP / (TP + FN)
        precision = TP / (TP + FP)
        recall_list.append(recall)
        precision_list.append(precision)
        F1_list.append( 2 * (recall * precision) / (recall + precision))

    # the evaluation measures in a table with one row per label
    print('\tPrecision\tRecall\t\tF1')
    # print measures for each label
    for i, lab in enumerate(labels):
        print(lab, '\t', "{:10.3f}".format(precision_list[i]), \
          "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))

In [150]:
# call the function with our data
eval_measures(goldlist, predictedlist)

	Precision	Recall		F1
neg 	      0.729      0.730      0.729
pos 	      0.725      0.724      0.725


In [225]:
list_text=[]
# This list is to help to do with the text part
i=0
j=0
kk=0
#counters
counter=0
pos_list=[]
neg_list=[]
# a counter


with open('16119_webhose_2020_01_db21c91a1ab47385bb13773ed8238c31_0000002.json', 'r', encoding='utf-8') as file:
#open the file


        for line in file:
        # file is too big to just use load(), to save my laptop, load in by line

                data=json.loads(line)

                if data["text"] is not None:
                    text=data["text"]


                    sentences = nltk.sent_tokenize(text)



                    for p in sentences:


                        text_low=p.lower()
                        #lower text

                        counter=counter+1
                        #number of sentence


                    list_text=[]


                    list_text.extend([w for w in word_tokenize(text_low) if re.search('^[a-zA-Z]',w)])


                    if classifier.classify(document_features(list_text, word_features)) == 'pos':
                            pos_list.extend(list_text)
                            i=i+1

                    if classifier.classify(document_features(list_text, word_features)) == 'neg':
                            neg_list.extend(list_text)
                            j=j+1







                          #save the clean data text in list_text

        ## end for loop

file.close()
## all data loaded

In [207]:
print(i,j)

3342 7614


In [226]:
list_pos_p=[]
list_pos_n=[]
list_pos_p =nltk.pos_tag(pos_list)
list_pos_n =nltk.pos_tag(neg_list)

In [227]:
grammar = "RBADJ: {<RB.*><JJ.*>}"
#define grammar on find adjective phrase

cp = nltk.RegexpParser(grammar)
tree_p=cp.parse(list_pos_p)
tree_n=cp.parse(list_pos_n)

###Explain:#################################################################
#there only [RB+JJ] or [JJ+NN] to make an adjective phrase
#the reason not include [JJ+NN] because it does not give the useful thing

#For example, nltk will define ["white" as "JJ"] and ["house" as "NN"]
#but "white house" is actually a word, it also happens on example "next week"
#will talk more in the report

In [228]:
rbadj_p=[]
rbadj_n=[]

for subtree in tree_p.subtrees():
    if subtree.label() == 'RBADJ':
        #to find the find adjective phrase
            rbadj_p.append(subtree)
            #save the correct adjective phrase in subtree

for subtree in tree_n.subtrees():
    if subtree.label() == 'RBADJ':
        #to find the find adjective phrase
            rbadj_n.append(subtree)
            #save the correct adjective phrase in subtree


In [238]:
rb_adj_p = []
rb_adj_n = []


rbadj_stop_word = ["n't political ", 'south lake ', 'hong kong ', 'only ibd ', 'south shore ', 'threshold type ',
                   'so i ','ago january ', 'more ap ']

for phrase in rbadj_p:
    i = ''
    for word, pos in phrase:
        i = i + word + ' '
        #to make tree type to a readable list
        #WTS: model 'RB JJ' as output
    if i not in rbadj_stop_word:
        rb_adj_p.append(i)

for phrase in rbadj_n:
    i = ''
    for word, pos in phrase:
        i = i + word + ' '
        #to make tree type to a readable list
        #WTS: model 'RB JJ' as output
    if i not in rbadj_stop_word:
        rb_adj_n.append(i)

In [239]:
nltk.FreqDist(rb_adj_p).most_common(10)
#frequency 50 top adjective phrase

[('most popular ', 37),
 ('best preventative ', 33),
 ('most read ', 21),
 ('more confirmed ', 6),
 ('most recent ', 6),
 ('very unusual ', 5),
 ('more related ', 5),
 ('most viewed ', 4),
 ('flipboard whatsapp ', 3),
 ('so many ', 3)]

In [240]:
nltk.FreqDist(rb_adj_n).most_common(10)
#frequency 50 top adjective phrase

[('broadcast rewritten ', 17),
 ('solely responsible ', 11),
 ('more likely ', 8),
 ('only temporary ', 6),
 ('relatively mild ', 5),
 ('still possible ', 5),
 ('collide due ', 5),
 ('more contact ', 5),
 ('currently undergoing ', 4),
 ('singapore bangkok ', 4)]

In [241]:
adj_tags = {'JJ', 'JJS', 'JJR'}
#label of adjective words

In [247]:
adj_pos_p=[]
adj_pos_n=[]
adj_stop_words=[]

for w in ['january','u.s.','new','south','china','own','chinese','trump','other','white','united','carson','first','last','human','next','wuhan','holiday','nevada','thursday','friday']:
    adj_stop_words.append(w)

for word,pos in list_pos_p:
    if len(word)>2:
        #by looking the data, most len(word)<2 are useless in this case

        if (pos in adj_tags):
            if word not in adj_stop_words:
                adj_pos_p.append(word)

for word,pos in list_pos_n:
    if len(word)>2:
        #by looking the data, most len(word)<2 are useless in this case

        if (pos in adj_tags):
            if word not in adj_stop_words:
                adj_pos_n.append(word)

In [248]:
FreqDist(adj_pos_p).most_common(50)

[('local', 222),
 ('anywhere', 206),
 ('canadian', 197),
 ('global', 162),
 ('more', 147),
 ('associated', 132),
 ('latest', 114),
 ('international', 113),
 ('public', 97),
 ('related', 96),
 ('raw', 80),
 ('fotografo', 73),
 ('top', 69),
 ('novel', 68),
 ('relevant', 59),
 ('coronavirus', 50),
 ('such', 46),
 ('outbreak', 46),
 ('popular', 45),
 ('same', 45),
 ('medical', 44),
 ('additional', 42),
 ('february', 42),
 ('american', 42),
 ('armenian', 42),
 ('full', 40),
 ('recent', 39),
 ('national', 38),
 ('economic', 38),
 ('daily', 37),
 ('common', 36),
 ('david', 36),
 ('most', 36),
 ('read', 35),
 ('australian', 35),
 ('live', 33),
 ('india', 33),
 ('preventative', 33),
 ('typical', 33),
 ('foreign', 33),
 ('best', 32),
 ('original', 31),
 ('human-to-human', 31),
 ('due', 31),
 ('good', 31),
 ('inbox', 30),
 ('official', 30),
 ('open', 29),
 ('prime', 29),
 ('important', 28)]

In [249]:
FreqDist(adj_pos_n).most_common(50)

[('more', 428),
 ('latest', 300),
 ('global', 143),
 ('associated', 124),
 ('top', 118),
 ('related', 115),
 ('canadian', 107),
 ('under-19', 94),
 ('local', 91),
 ('additional', 90),
 ('australian', 83),
 ('medical', 83),
 ('potential', 82),
 ('icc', 81),
 ('public', 75),
 ('international', 74),
 ('daily', 66),
 ('due', 66),
 ('london', 63),
 ('free', 63),
 ('confirmed', 60),
 ('national', 59),
 ('coronavirus', 58),
 ('anywhere', 54),
 ('least', 53),
 ('outbreak', 52),
 ('good', 50),
 ('economic', 50),
 ('responsible', 47),
 ('similar', 45),
 ('such', 44),
 ('likely', 42),
 ('british', 42),
 ('hong', 41),
 ('https', 41),
 ('total', 41),
 ('novel', 40),
 ('sure', 40),
 ('low', 40),
 ('nick', 40),
 ('close', 38),
 ('full', 37),
 ('original', 36),
 ('direct', 35),
 ('official', 35),
 ('financial', 35),
 ('possible', 35),
 ('same', 35),
 ('deadly', 35),
 ('foreign', 34)]