In [1]:
import nltk
import os
import pandas as pd
import numpy as np
import csv
import random
import re

## Preprocessing 

In [2]:
# Preprocess the 3 inaugural addresses in different times 
# (from the following years: 2001, 2009, 2017) for manual annotation work later.

print(os.getcwd(), "\n") # make use of the default path

filelist = [fileid for fileid in os.listdir(
    "./American-Inaugural-Address-Corpus") 
            if fileid[:4] in [ "2001", "2009", "2017"]]
print(filelist) 

/Users/genecosmo/Desktop/Computer Applications in Linguistics 

['2009-01-20-Barack-Obama.txt', '2017-01-20-Donald-J-Trump.txt', '2001-01-20-George-W-Bush.txt']


In [3]:
from nltk import word_tokenize
from nltk.tag.stanford import StanfordPOSTagger
# enter the paths of Stanford POS Tagger .jar file as well as the model to be used
jar = "/Users/Shared/stanford-postagger-full-2018-10-16/stanford-postagger-3.9.2.jar"
model = "/Users/Shared/stanford-postagger-full-2018-10-16/models/english-left3words-distsim.tagger"
# Instantiate an English pos-tagger using the jar and model defined above 
pos_tagger_en = StanfordPOSTagger(model, jar, encoding = "utf-8")

In [94]:
# Use Stanford POS-tagger defined above to Tag all source texts (English)

for fileid in filelist:
    if fileid.endswith(".txt"):                           # go through all English texts to apply tagger
        print(fileid)            
        with open ("American-Inaugural-Address-Corpus/" + fileid, encoding = "utf-8") as f:   
            raw = f.read()
            tokenized_text = word_tokenize(raw)              # tokenizing
            tagged_text = pos_tagger_en.tag(tokenized_text)  # pos-tagging

# Write the tagged text into new .txt files in specific format respectively

        with open ("American-Inaugural-Address-Corpus/tagged_" + fileid, "w", encoding = "utf-8") as tag_f:
            write_text = ""
            for (a, b) in tagged_text:
                write_text += a + "_" + b +" "   # combine word and tag together in the format: "My_PRP$ "
            
            # add newline character after the character which marks the end of a sentence 
            result = write_text.replace("_. ", "_.\n").replace(";_:", ";_:\n").replace(
            ":_:", ":_:\n")   # "_." include "!", "?" these sentence closers
            
                
            tag_f.write(result)       # write all the results into corresponding .txt file     
            

for fileid in filelist:
    with open ("American-Inaugural-Address-Corpus/tagged_"+ fileid) as f:
        sentences = []
        for row in f:
            sentences.append(row)  
        
# Get corresponding .xlsx files which contain every sentence of the speech in the first column.
        
        df_sent = pd.DataFrame({"sentence" : sentences[:len(sentences)]})
        df_sent.to_excel("American-Inaugural-Address-Corpus/Tagged/"+ 
                    fileid + "sent.xlsx", encoding = "utf-8", index = False)
        

2009-01-20-Barack-Obama.txt
1985-01-20-Ronald-Reagan.txt
1977-01-20-Jimmy-Carter.txt
2017-01-20-Donald-J-Trump.txt
2001-01-20-George-W-Bush.txt


In [4]:
# Get indexes of all coordinators
def get_index_of_coor(sent):
    split_sent = sent.split()
    indexes_of_coor = [-1]  # this ancillary element is used to compute the index of the first coordinator
    for w in split_sent:
        if w.startswith(",") or w.endswith("_CC") or w.lower() == "while_in":
            # Use the formula to get the right index 
            # indexes_of_coor[-1] means the last element(index) of the index list
            indexes_of_coor.append(split_sent.index(w) + indexes_of_coor[-1] +1 ) 
           
            # chop the previous part of the sentence off (by +1) in order to find the next coordinator
            split_sent = split_sent[(split_sent.index(w) +1) :]  
            
    # Exclude the conjunction at the beginning of the sentence       
    if 0 in indexes_of_coor:   
        indexes_of_coor = indexes_of_coor[2:]
    else:
        indexes_of_coor = indexes_of_coor[1:] # First ancillary index should be excluded
    return indexes_of_coor  


In [5]:
test_7 = "While_IN the_DT Congress_NNP determines_VBZ the_DT objects_NNS and_CC the_DT sum_NN of_IN appropriations_NNS ,_, the_DT officials_NNS of_IN the_DT executive_NN departments_NNS are_VBP responsible_JJ for_IN honest_JJ and_CC faithful_JJ disbursement_NN ,_, and_CC it_PRP should_MD be_VB their_PRP$ constant_JJ care_NN to_TO avoid_VB waste_NN and_CC extravagance_NN ._."
get_index_of_coor(test_7)

[6, 11, 22, 25, 26, 36]

In [6]:
# used for comparing 2 phrases as string
from difflib import SequenceMatcher

def compare(str1, str2):
    return SequenceMatcher(None, str1, str2).ratio() 
# returns the similarity score (float in [0,1]) between input strings, the higher the score is, more similar 2 Strings are. 



In [7]:
def get_pos_seq (word_tag_sequence):
    pos_sequence = ''.join([w[w.index("_"):w.index("_")+3] for w in word_tag_sequence])
    return pos_sequence

In [8]:
def get_pos_seq_raw (word_tag_sequence):
    pos_sequence = ''.join([w[w.index("_"):] for w in word_tag_sequence])
    return pos_sequence

In [9]:
def similar(prev, after):
    similar_pair = 0
    if len(prev) > 2 and len(after) > 2:
        
        if len(prev) == len(after):
            if compare(get_pos_seq(prev), get_pos_seq(after)) > 0.79 or compare(
            get_pos_seq(prev[1:]), get_pos_seq(after)) > 0.79 or compare(
            get_pos_seq(prev[2:]), get_pos_seq(after)) > 0.79 or compare(
            get_pos_seq(prev), get_pos_seq(after[: -1])) > 0.79 or compare(
            get_pos_seq(prev), get_pos_seq(after[: -2])) > 0.79:
                #print("1")
                similar_pair += 1
        elif len(prev) - len(after) == 1:   # previous subsentence is longer, so latter subsentence as length-base
            if compare(get_pos_seq(prev[-len(after) :]), get_pos_seq(after)) > 0.79 or compare(
            get_pos_seq(prev[-len(after)-1:]), get_pos_seq(after)) > 0.79:
                #print("2")
                similar_pair += 1
        elif len(prev) - len(after) >= 2:
            if compare(get_pos_seq(prev[-len(after) :]), get_pos_seq(after)) > 0.79 or compare(
            get_pos_seq(prev[-len(after)-1:]), get_pos_seq(after)) > 0.79 or compare(
            get_pos_seq(prev[-len(after)-2:]), get_pos_seq(after)) > 0.79:
                #print("3")
                similar_pair += 1
        elif len(prev) - len(after) == -1:
            if compare(get_pos_seq(prev), get_pos_seq(after[:len(prev)])) > 0.79 or compare(
            get_pos_seq(prev), get_pos_seq(after[:len(prev)+1])) > 0.79:
                #print("4")
                similar_pair += 1
        elif len(prev) - len(after) <= -2:
            if compare(get_pos_seq(prev), get_pos_seq(after[:len(prev)])) > 0.79 or compare(
            get_pos_seq(prev), get_pos_seq(after[:len(prev)+1])) > 0.79 or compare(
            get_pos_seq(prev), get_pos_seq(after[:len(prev)+2])) > 0.79:
                #print("5")
                similar_pair += 1
   
   # If one of the subsentence is less than 3 words, no normalization of PoS-tags will be performed                 
    elif len(prev) == 2 or len(after) == 2: 
        if len(prev) >= len(after):
            if compare(get_pos_seq_raw(prev[-len(after):]), get_pos_seq_raw(after)) == 1:
                similar_pair += 1
        if len(prev) < len(after):
            if compare(get_pos_seq_raw(prev), get_pos_seq_raw(after[:len(prev)])) == 1:
                similar_pair += 1
                
    return similar_pair
                
        

In [10]:
# for reversed parallelism
def reversed_simi(prev, after):
    similar_phrase_pair = 0
    if len(prev) > 1 and len(after) > 1:
        pos = after[0][after[0].index("_"):]
        for w in prev[1:]:   # avoid "index_out_of_range" problem
            if w.endswith(pos):
                reverse_start = w
                index_reverse_start = prev.index(reverse_start)
                w_before = prev[index_reverse_start - 1]
                if w_before[w_before.index("_"):] == after[1][after[1].index("_"):]:
                    similar_phrase_pair += 1
    return similar_phrase_pair

In [162]:
# Only extract word-recurrence in particular positions
def recurrence_w (prev, after):
    recur = 0
    if len(prev) >= len(after):
        length = len(after)
        for index in range(length):
            if prev[-length : ][index].lower() == after[index].lower():
                recur += 1
    if len(prev) < len(after):
        length = len(prev)
        for index in range(length):
            if prev[index].lower() == after [: length][index].lower():
                recur += 1
    return recur

In [163]:
def num_of_CC(sent):
    sp_sent = sent.split()
    counter = 0
    for w_t in sp_sent:
        if w_t.endswith("_CC"):
            counter += 1
    return counter

In [164]:
def trigram_repetition(sent):
    num_tri = 0
    if len(sent.split()) > 2:
        
        trigrams = list(nltk.trigrams([w[w.index("_"):] for w in sent.split()]))
        if len(trigrams) == 5:
            if trigrams[0] == trigrams[4]:
                num_tri += 1
        if len(trigrams) == 6:
            if trigrams[0] == trigrams[4] or trigrams[0] == trigrams[5] or trigrams[1] == trigrams[5]:
                num_tri += 1
        if len(trigrams) > 6:
            for i in range(len(trigrams)-4):
                if trigrams[i] == trigrams[i+4]:
                    num_tri += 1
            for i in range(len(trigrams)-5):
                if trigrams[i] == trigrams[i+5]:
                    num_tri += 1
    return num_tri

In [165]:
trigram_repetition("We_PRP treat_VBP it_PRP like_IN a_DT nice-to-have_JJ instead_RB of_IN a_DT must-have_JJ ._.")

1

In [166]:
def get_features_new2(sent):
    features = {}
    recurrence = 0
    similar_phrase_pair = 0
    reversed_simi_pair = 0
    
    norm_sent = re.sub("...?_CC (also|not)_RB", ",_,", sent)  # nomalization: change "but_CC also_RB/not_RB" this kind of pattern into ",_,"
    norm_sent = re.sub("are_VBP not_RB", ",_,", norm_sent)
    norm_sent = re.sub(",_,.?,_,", ",_,", norm_sent)         # last step could result with ",_, ,_,"
    norm_sent = re.sub(",_, ...?_CC", ",_,", norm_sent)      # nomalization: change ",_, and_CC/or_CC/but_CC" this kind of pattern into ",_,"
    coor_indexes = get_index_of_coor(norm_sent)
    process_sent = norm_sent.split()[: -1]                   # chop the "sentence closer off"
           
    if len(coor_indexes) == 1:   # only 2 sub-sentences
        prev, after = process_sent[: coor_indexes[0]], process_sent[coor_indexes[0] + 1 :]
        similar_phrase_pair += similar(prev, after)
        recurrence += recurrence_w(prev, after)
        if "nor" in sent:
            similar_phrase_pair += reversed_simi(prev, after)
        
    elif len(coor_indexes) > 1:
        prev = process_sent[: coor_indexes[0]]             # First sentence
        for ii in range(len(coor_indexes) - 1):
            after = process_sent[coor_indexes[ii] + 1 : coor_indexes[ii + 1]]
            similar_phrase_pair += similar(prev, after)
            recurrence += recurrence_w(prev, after)
            if "nor" in sent:
                similar_phrase_pair += reversed_simi(prev, after)
            prev = after
            
        after = process_sent[coor_indexes[-1] + 1 :]        # Last sentence
        similar_phrase_pair += similar(prev, after)
        recurrence += recurrence_w(prev, after)
        if "nor" in sent:
            similar_phrase_pair += reversed_simi(prev, after)
            
# If by now no similar structures are found, try to split the sentence by <comma>, because conjunction words
# like "and" might be part of the parallel elements
    s = sent[:-4]     # chop off the sentence-closer
    ssplit = s.split(",_,")
    if len(ssplit) > 1:
        for i in range(len(ssplit)-1):
            similar_phrase_pair += similar(ssplit[i].split(), ssplit[i+1].split())

                
# If by now no similar structures are found, try to identify [A, B cc C], [A cc B cc C] or [A, B, C] types (structure consist of only one word) 
    
    if len(sent.split()) > 4:      # for these types, sentence should contain at least 5 words.
        tag_sequence = [w[w.index("_") :] for w in sent.split()]  # use full PoS-Tag
        for index in range(len(tag_sequence) - 4):
            t1 = tag_sequence[index]
            t2 = tag_sequence[index + 2]
            t3 = tag_sequence[index + 4]
            if t1 == t2 == t3 and ((tag_sequence[index+1] == "_CC" and tag_sequence[index+3] == "_CC") or (
            tag_sequence[index+1] == "_," and (tag_sequence[index+3] == "_," or tag_sequence[index+3] == "_CC"))):
                #print(tag_sequence)
                similar_phrase_pair += 1
                
# If still no similar structures are found, try to identify [A, B cc C] types (structure consist of only one word)                             
   
                    
    features["simi"] = similar_phrase_pair > 0
    features["recur"] = recurrence
    features["num_CC"] = num_of_CC(sent)
    features["num_tri"] = trigram_repetition(sent) > 0
    return features


    
test_sent1 = "In_IN the_DT year_NN of_IN America_NNP 's_POS birth_NN ,_, in_IN the_DT coldest_JJS of_IN months_NNS ,_, a_DT small_JJ band_NN of_IN patriots_NNS huddled_VBN by_IN dying_VBG campfires_NNS on_IN the_DT shores_NNS of_IN an_DT icy_NN river_NN ._."
test_sent2 = "And_CC each_DT day_NN brings_VBZ further_JJ evidence_NN that_IN the_DT ways_NNS we_PRP use_VBP energy_NN strengthen_VB our_PRP$ adversaries_NNS and_CC threaten_VB our_PRP$ planet_NN ._."
test_sent3 = "America_NNP has_VBZ never_RB been_VBN united_VBN by_IN blood_NN or_CC birth_NN or_CC soil_NN ._."  
t4 = "I_PRP thank_VBP President_NNP Bush_NNP for_IN his_PRP$ service_NN to_TO our_PRP$ Nation_NNP ,_, as_RB well_RB as_IN the_DT generosity_NN and_CC cooperation_NN he_PRP has_VBZ shown_VBN throughout_IN this_DT transition_NN ._."
t5 = " an_DT education_NN system_NN ,_, flush_NN with_IN cash_NN ,_, but_CC which_WDT leaves_VBZ our_PRP$ young_JJ and_CC beautiful_JJ students_NNS deprived_VBN of_IN all_DT knowledge_NN ;_:"
t6 = "Our_PRP$ Nation_NN is_VBZ at_IN war_NN against_IN a_DT far-reaching_JJ network_NN of_IN violence_NN and_CC hatred_NN ._."
t7 = "The_DT peaceful_JJ transfer_NN of_IN authority_NN is_VBZ rare_JJ in_IN history_NN ,_, yet_RB common_JJ in_IN our_PRP$ country_NN ._."
t8 = "We_PRP 've_VBP defended_VBN other_JJ nations_NNS '_POS borders_NNS while_IN refusing_VBG to_TO defend_VB our_PRP$ own_JJ and_CC spent_VBD trillions_NNS and_CC trillions_NNS of_IN dollars_NNS overseas_RB while_IN America_NNP 's_POS infrastructure_NN has_VBZ fallen_VBN into_IN disrepair_NN and_CC decay_NN ._."
t9 = "Permission_NN ,_, community_NN ,_, curiosity_NN :_:"
t11 = "It_PRP is_VBZ the_DT firefighter_NN 's_POS courage_NN to_TO storm_VB a_DT stairway_NN filled_VBN with_IN smoke_NN ,_, but_CC also_RB a_DT parent_NN 's_POS willingness_NN to_TO nurture_VB a_DT child_NN ,_, that_WDT finally_RB decides_VBZ our_PRP$ fate_NN ._."
t12 = "Our_PRP$ spirits_NNS dampened_VBD ,_, we_PRP showed_VBD up_RP again_RB ,_, August_NNP 2018_CD ,_, year_NN 10_CD ._."
t13 = "We_PRP are_VBP shaped_VBN by_IN every_DT language_NN and_CC culture_NN ,_, drawn_VBN from_IN every_DT end_NN of_IN this_DT Earth_NNP ._."
print("t1: ", get_features_new2(test_sent1))
print("t2: ", get_features_new2(test_sent2))
print("t3: ", get_features_new2(test_sent3))
print("t4: ", get_features_new2(t4))
print("t5: ", get_features_new2(t5))
print("t6: ", get_features_new2(t6))
print("t7: ", get_features_new2(t7))
print("t8: ", get_features_new2(t8))
print("t9: ", get_features_new2(t9))
print("t10: ", get_features_new2(t10))
print("t11: ", get_features_new2(t11))
print("t12: ", get_features_new2(t12))
print("t13: ", get_features_new2(t13))

t1:  {'simi': True, 'recur': 1, 'num_CC': 0, 'num_tri': False}
t2:  {'simi': True, 'recur': 1, 'num_CC': 2, 'num_tri': False}
t3:  {'simi': True, 'recur': 0, 'num_CC': 2, 'num_tri': False}
t4:  {'simi': False, 'recur': 0, 'num_CC': 1, 'num_tri': False}
t5:  {'simi': True, 'recur': 0, 'num_CC': 2, 'num_tri': False}
t6:  {'simi': False, 'recur': 0, 'num_CC': 1, 'num_tri': False}
t7:  {'simi': False, 'recur': 0, 'num_CC': 0, 'num_tri': False}
t8:  {'simi': False, 'recur': 0, 'num_CC': 3, 'num_tri': False}
t9:  {'simi': True, 'recur': 0, 'num_CC': 0, 'num_tri': False}
t10:  {'simi': True, 'recur': 0, 'num_CC': 1, 'num_tri': False}
t11:  {'simi': False, 'recur': 0, 'num_CC': 1, 'num_tri': False}
t12:  {'simi': False, 'recur': 0, 'num_CC': 0, 'num_tri': False}
t13:  {'simi': False, 'recur': 0, 'num_CC': 1, 'num_tri': False}


In [167]:
# define this function for evaluation
def compute_PRF(gold, predicted, class_label):
    TP = sum(int(g == class_label and p == class_label) for (g, p) in zip (gold, predicted))
    FP = sum(int(p == class_label and g != class_label) for (g, p) in zip (gold, predicted)) 
    FN = sum(int(p != class_label and g == class_label) for (g, p) in zip (gold, predicted))
    if TP + FP > 0:
        precision = TP/(TP + FP)
    else:
        precision = 0
    if TP + FN > 0:
        recall = TP/(TP + FN)
    else:
        recall = 0
    if precision > 0 and recall > 0:
        f_measure = 2 * precision * recall / (precision + recall)
    else:
        f_measure = 0
    #print('Precision=%.2f Recall=%.2f F_Measure=%.2f'  %  (precision, recall, f_measure))
    return (precision, recall, f_measure)

## Try Classifiers from nltk

In [168]:
data_set_raw1 = []
for file in os.listdir("American-Inaugural-Address-Corpus/Tagged"):
    if file.endswith("csv"):
        with open("American-Inaugural-Address-Corpus/Tagged/" + file, encoding = "utf-8") as f:
            reader = csv.reader(f, delimiter = ",")
            rows = [row for row in reader]
            data_set_raw1.extend(rows[1:])   # the first row is the header ["sentence", "Tag"], so not needed
print(len(data_set_raw1))
data_set = [(sent, tag) for [sent, tag] in data_set_raw1]
print(data_set[:5])

380
[('Inaugural_JJ Address_NNP January_NNP 20_CD ,_, 2009_CD Public_NNP Papers_NNP of_IN the_DT Presidents_NNS Barack_NNP Obama_NNP <_JJR br_NN >_JJR 2009_CD :_:\n', 'f'), (' Book_VB I_PRP Barack_NNP Obama_NNP 2009_CD :_:\n', 'f'), (' Book_VB I_PRP Location_NNP :_:\n', 'f'), (' District_NNP of_IN Columbia_NNP Washington_NNP The_NNP American_NNP Presidency_NNP Project_NNP\n', 'f'), ('My_PRP$ fellow_JJ citizens_NNS ,_, I_PRP stand_VBP here_RB today_NN humbled_VBN by_IN the_DT task_NN before_IN us_PRP ,_, grateful_JJ for_IN the_DT trust_NN you_PRP have_VBP bestowed_VBN ,_, mindful_JJ of_IN the_DT sacrifices_NNS borne_VBN by_IN our_PRP$ ancestors_NNS ._.', 't')]


In [169]:
# cross validation
k = 5
sum_precision = 0
sum_recall = 0
sum_f_measure = 0
random.Random(6).shuffle(data_set) 
featuresets = [(get_features_new2(sent), tag) for (sent, tag) in data_set ]
size = len(featuresets)
for fold in range(k):
    train_set = featuresets[: int(size/k*fold)] + featuresets[int(size/k*(fold+1)):]
    devtest_set = featuresets[int(size/k*fold) : int(size/k*(fold+1))]
    nb = nltk.NaiveBayesClassifier.train(train_set)
    gold = [tag for (sent_feature, tag) in devtest_set]
    pred1 = [nb.classify(sent_feature) for (sent_feature, tag) in devtest_set]
    sum_precision += compute_PRF(gold, pred1, "t")[0]
    sum_recall += compute_PRF(gold, pred1, "t")[1]
    sum_f_measure += compute_PRF(gold, pred1, "t")[2]

print("\ncross validated precision for 't' is: ", sum_precision/5)
print("\ncross validated recall for 't' is: ", sum_recall/5)
print("\ncross validated f-measure for 't' is: ", sum_f_measure/5)


cross validated precision for 't' is:  0.8505120885555668

cross validated recall for 't' is:  0.8435003450655625

cross validated f-measure for 't' is:  0.8407891156462585


In [170]:
nb.show_most_informative_features(5)

Most Informative Features
                    simi = True                t : f      =     14.7 : 1.0
                   recur = 2                   t : f      =     14.4 : 1.0
                   recur = 1                   t : f      =     11.2 : 1.0
                 num_tri = True                t : f      =      9.5 : 1.0
                  num_CC = 0                   f : t      =      6.7 : 1.0


In [171]:
nb.classify(get_features_new2(" that_IN we_PRP did_VBD not_RB turn_VB back_RB ,_, nor_CC did_VBD we_PRP falter_VBP ._."))

't'

### Error Analysis 

In [172]:
for (sent, tag) in data_set:
    if nb.classify(get_features_new2(sent)) == 't' and tag == 'f':
        print(sent)

Guided_VBN by_IN these_DT principles_NNS once_RB more_RBR ,_, we_PRP can_MD meet_VB those_DT new_JJ threats_NNS that_WDT demand_VBP even_RB greater_JJR effort_NN ,_, even_RB greater_JJR cooperation_NN and_CC understanding_NN between_IN nations_NNS ._.

For_IN we_PRP know_VBP that_IN our_PRP$ patchwork_NN heritage_NN is_VBZ a_DT strength_NN ,_, not_RB a_DT weakness_NN ._.

We_PRP have_VBP a_DT place_NN ,_, all_DT of_IN us_PRP ,_, in_IN a_DT long_JJ story_NN ,_, a_DT story_NN we_PRP continue_VBP but_CC whose_WP$ end_NN we_PRP will_MD not_RB see_VB ._.

For_IN many_JJ decades_NNS ,_, we_PRP 've_VBP enriched_VBN foreign_JJ industry_NN at_IN the_DT expense_NN of_IN American_JJ industry_NN ,_, subsidized_VBD the_DT armies_NNS of_IN other_JJ countries_NNS while_IN allowing_VBG for_IN the_DT very_RB sad_JJ depletion_NN of_IN our_PRP$ military_NN ._.

that_WDT in_IN the_DT depth_NN of_IN winter_NN ,_, when_WRB nothing_NN but_CC hope_NN and_CC virtue_NN could_MD survive_VB ._.

Yet_CC ,_, compas

In [173]:
for (sent, tag) in data_set:
    if nb.classify(get_features_new2(sent)) == 'f' and tag == 't':
        print(sent)

We_PRP are_VBP shaped_VBN by_IN every_DT language_NN and_CC culture_NN ,_, drawn_VBN from_IN every_DT end_NN of_IN this_DT Earth_NNP ._.

The_DT ambitions_NNS of_IN some_DT Americans_NNS are_VBP limited_VBN by_IN failing_VBG schools_NNS and_CC hidden_VBN prejudice_NN and_CC the_DT circumstances_NNS of_IN their_PRP$ birth_NN ._.

To_TO those_DT leaders_NNS around_IN the_DT globe_NN who_WP seek_VBP to_TO sow_VB conflict_NN or_CC blame_VB their_PRP$ society_NN 's_POS ills_NNS on_IN the_DT West_NNP ,_, know_VBP that_IN your_PRP$ people_NNS will_MD judge_VB you_PRP on_IN what_WP you_PRP can_MD build_VB ,_, not_RB what_WP you_PRP destroy_VBP ._.

And_CC for_IN those_DT who_WP seek_VBP to_TO advance_VB their_PRP$ aims_NNS by_IN inducing_VBG terror_NN and_CC slaughtering_VBG innocents_NNS ,_, we_PRP say_VBP to_TO you_PRP now_RB that_IN our_PRP$ spirit_NN is_VBZ stronger_JJR and_CC can_MD not_RB be_VB broken_VBN ._.

Duties_NNS that_IN we_PRP do_VBP not_RB grudgingly_RB accept_VB but_CC ,_, rat

In [174]:
nb.classify(get_features_new2("While_IN the_DT Congress_NNP determines_VBZ the_DT objects_NNS and_CC the_DT sum_NN of_IN appropriations_NNS ,_, the_DT officials_NNS of_IN the_DT executive_NN departments_NNS are_VBP responsible_JJ for_IN honest_JJ and_CC faithful_JJ disbursement_NN ,_, and_CC it_PRP should_MD be_VB their_PRP$ constant_JJ care_NN to_TO avoid_VB waste_NN and_CC extravagance_NN ._."))

't'

## Try Classifiers from scikit-learn

In [175]:
data_set_raw = []
for file in os.listdir("American-Inaugural-Address-Corpus/Tagged"):
    if file.endswith("csv"):
        with open("American-Inaugural-Address-Corpus/Tagged/" + file, encoding = "utf-8") as f:
            reader = csv.reader(f, delimiter = ",")
            rows = [row for row in reader]
            data_set_raw.extend(rows[1:])
print(data_set_raw[:3])

[['Inaugural_JJ Address_NNP January_NNP 20_CD ,_, 2009_CD Public_NNP Papers_NNP of_IN the_DT Presidents_NNS Barack_NNP Obama_NNP <_JJR br_NN >_JJR 2009_CD :_:\n', 'f'], [' Book_VB I_PRP Barack_NNP Obama_NNP 2009_CD :_:\n', 'f'], [' Book_VB I_PRP Location_NNP :_:\n', 'f']]


In [176]:
transformed_data = []
for [sent, tag] in data_set_raw:
    features_tag = []
    simi = get_features_new2(sent)["simi"]
    recur = get_features_new2(sent)['recur']
    num_CC = get_features_new2(sent)["num_CC"]
    num_tri = get_features_new2(sent)["num_tri"]
    features_tag.extend([sent,simi,recur,num_CC, num_tri,tag])
    transformed_data.append(features_tag)  # use append to maintain list form
len(transformed_data)

380

In [177]:
df_data = pd.DataFrame(transformed_data, columns = ['sent','simi','recur',"num_CC",'num_tri','tag'])
df_data.head(10)


Unnamed: 0,sent,simi,recur,num_CC,num_tri,tag
0,"Inaugural_JJ Address_NNP January_NNP 20_CD ,_,...",False,0,0,False,f
1,Book_VB I_PRP Barack_NNP Obama_NNP 2009_CD :_:\n,False,0,0,False,f
2,Book_VB I_PRP Location_NNP :_:\n,False,0,0,False,f
3,District_NNP of_IN Columbia_NNP Washington_NN...,False,0,0,False,f
4,"My_PRP$ fellow_JJ citizens_NNS ,_, I_PRP stand...",False,1,0,False,t
5,I_PRP thank_VBP President_NNP Bush_NNP for_IN ...,False,0,1,False,f
6,Forty-four_CD Americans_NNPS have_VBP now_RB t...,False,0,0,False,f
7,The_DT words_NNS have_VBP been_VBN spoken_VBN ...,True,1,1,False,t
8,"Yet_CC every_DT so_RB often_RB ,_, the_DT oath...",True,0,2,False,t
9,"At_IN these_DT moments_NNS ,_, America_NNP has...",False,0,3,False,t


In [178]:
df_data["tag_num"] = df_data.tag.map({'f' : 0, 't' : 1})
df_data.to_excel("American-Inaugural-Address-Corpus/Tagged/transformed.xlsx", 
                 encoding = "utf-8", index = False)

In [179]:
feature_cols = ['simi','recur','num_CC','num_tri']
# select all 10 features (X)
X = df_data[feature_cols]
# select numerical tag as responses/targets (y)
y = df_data.tag_num 

In [180]:
# use cross validation to evaluate the results:
from sklearn.cross_validation import train_test_split, cross_val_score, cross_val_predict
from sklearn import metrics

In [181]:
# Try different algorithms
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC 


In [182]:
# initialize classifiers
knn = KNeighborsClassifier()
lr = LogisticRegression()
svc = LinearSVC()

In [183]:
from sklearn.cross_validation import KFold
kf_5 = KFold(n = 380, n_folds = 5)

In [184]:
def tryClassifier(clf):
    precisions = 0
    recalls = 0
    f_measures = 0

    for (train_index, test_index) in kf_5:
        X_train, X_test = np.asarray(X)[train_index], np.asarray(X)[test_index]  # convert dataFrame(X,y) into array!
        y_train, y_test = np.asarray(y)[train_index], np.asarray(y)[test_index]  # Because index works with array!
        
        clf.fit(X_train, y_train)
        tag_pred = clf.predict(X_test)

        precisions += metrics.precision_score(y_test, tag_pred, 1) # cross-validated precision for label "t"
        recalls += metrics.recall_score(y_test, tag_pred, 1)
        f_measures += metrics.f1_score(y_test, tag_pred, 1)
    print(type(clf))
    print("CV_Precion for label 't':   ", precisions / 5)
    print("CV_Recall for label 't':    ", recalls / 5)
    print("CV_F_measure for label 't': ", f_measures / 5)

    f1_scores = cross_val_score(clf, X, y, cv=5, scoring='f1')
    
    print("f1_scores_all_classes: ", f1_scores)
    print("Average_f1_all_classed: ", sum([s for s in f1_scores])/5, "\n")
        

In [185]:
tryClassifier(knn)
tryClassifier(lr)
tryClassifier(svc)

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
CV_Precion for label 't':    0.8997902097902098
CV_Recall for label 't':     0.7875979737141261
CV_F_measure for label 't':  0.8327123202001252
f1_scores_all_classes:  [0.77272727 0.68085106 0.90196078 0.93333333 0.88888889]
Average_f1_all_classed:  0.8355522686186017 

<class 'sklearn.linear_model.logistic.LogisticRegression'>
CV_Precion for label 't':    0.8996935817805383
CV_Recall for label 't':     0.8182644941265631
CV_F_measure for label 't':  0.8511416177501481
f1_scores_all_classes:  [0.77272727 0.72       0.94117647 0.93333333 0.93333333]
Average_f1_all_classed:  0.860114081996435 

<class 'sklearn.svm.classes.LinearSVC'>
CV_Precion for label 't':    0.8986910755148741
CV_Recall for label 't':     0.8177718832891246
CV_F_measure for label 't':  0.8490266346086119
f1_scores_all_classes:  [0.77272727 0.74509804 0.94117647 0.93333333 0.93333333]
Average_f1_all_classed:  0.8651336898395723 

