In [1]:
import nltk
import os
import pandas as pd
import csv
import random
import re

## Preprocessing 

In [2]:
# Preprocess the 3 inaugural addresses in different times 
# (from the following years: 2001, 2009, 2017) for manual annotation work later.

print(os.getcwd(), "\n") # make use of the default path

filelist = [fileid for fileid in os.listdir(
    "./American-Inaugural-Address-Corpus") 
            if fileid[:4] in [ "2001", "2009", "2017"]]
print(filelist) 

/Users/genecosmo/Desktop/Computer Applications in Linguistics 

['2009-01-20-Barack-Obama.txt', '2017-01-20-Donald-J-Trump.txt', '2001-01-20-George-W-Bush.txt']


In [3]:
from nltk import word_tokenize
from nltk.tag.stanford import StanfordPOSTagger
# enter the paths of Stanford POS Tagger .jar file as well as the model to be used
jar = "/Users/Shared/stanford-postagger-full-2018-10-16/stanford-postagger-3.9.2.jar"
model = "/Users/Shared/stanford-postagger-full-2018-10-16/models/english-left3words-distsim.tagger"
# Instantiate an English pos-tagger using the jar and model defined above 
pos_tagger_en = StanfordPOSTagger(model, jar, encoding = "utf-8")

In [4]:
# Use Stanford POS-tagger defined above to Tag all source texts (English)

for fileid in filelist:
    if fileid.endswith(".txt"):                           # go through all English texts to apply tagger
        print(fileid)            
        with open ("American-Inaugural-Address-Corpus/" + fileid, encoding = "utf-8") as f:   
            raw = f.read()
            tokenized_text = word_tokenize(raw)              # tokenizing
            tagged_text = pos_tagger_en.tag(tokenized_text)  # pos-tagging

# Write the tagged text into new .txt files in specific format respectively

        with open ("American-Inaugural-Address-Corpus/tagged_" + fileid, "w", encoding = "utf-8") as tag_f:
            write_text = ""
            for (a, b) in tagged_text:
                write_text += a + "_" + b +" "   # combine word and tag together in the format: "My_PRP$ "
            
            # add newline character after the character which marks the end of a sentence 
            result = write_text.replace("_. ", "_.\n").replace(";_:", ";_:\n").replace(
            ":_:", ":_:\n")   # "_." include "!", "?" these sentence closers
            
                
            tag_f.write(result)       # write all the results into corresponding .txt file     
            

for fileid in filelist:
    with open ("American-Inaugural-Address-Corpus/tagged_"+ fileid) as f:
        sentences = []
        for row in f:
            sentences.append(row)  
        
# Get corresponding .xlsx files which contain every sentence of the speech in the first column.
        
        df_sent = pd.DataFrame({"sentence" : sentences[:len(sentences)]})
        df_sent.to_excel("American-Inaugural-Address-Corpus/Tagged_2/"+ 
                    fileid + "sent.xlsx", encoding = "utf-8", index = False)
        

2009-01-20-Barack-Obama.txt
2017-01-20-Donald-J-Trump.txt
2001-01-20-George-W-Bush.txt


In [4]:
# Get indexes of all coordinators
def get_index_of_coor(sent):
    split_sent = sent.split()
    indexes_of_coor = [-1]  # this ancillary element is used to compute the index of the first coordinator
    for w in split_sent:
        if w.startswith(",") or w.endswith("_CC"):
            # Use the formula to get the right index 
            # indexes_of_coor[-1] means the last element(index) of the index list
            indexes_of_coor.append(split_sent.index(w) + indexes_of_coor[-1] +1 ) 
           
            # chop the previous part of the sentence off (by +1) in order to find the next coordinator
            split_sent = split_sent[(split_sent.index(w) +1) :]  
            
    # Exclude the conjunction at the beginning of the sentence, 
    # in order to avoid problem that could occur in "similarity" function        
    if 0 in indexes_of_coor:   
        indexes_of_coor = indexes_of_coor[2:]
    else:
        indexes_of_coor = indexes_of_coor[1:] # First ancillary index should be excluded
    return indexes_of_coor  


In [5]:
test_7 = "But_CC know_VB this_DT ,_, America_NNP :_:"
get_index_of_coor(test_7)

[3]

In [6]:
# used for comparing 2 phrases as string
from difflib import SequenceMatcher

def compare(str1, str2):
    return SequenceMatcher(None, str1, str2).ratio() 
# returns the similarity score (float in [0,1]) between input strings, the higher the score is, more similar 2 Strings are. 


def simi_structure(prev_str, after_str):
    similar_phrase_pair = 0
    if len(prev_str) > 6 and len(after_str) > 6:   # both sub-sentence contains more than two words
            
        if len(prev_str) == len(after_str):
            if compare(prev_str[-len(after_str):], after_str) > 0.7:
                similar_phrase_pair += 1
        elif len(prev_str) - len(after_str) == 3:    # difference: 1 tag and prev_str is longer
            if compare(prev_str[-len(after_str):], after_str) > 0.7 or compare(
                prev_str[-len(after_str)-3 :], after_str) > 0.8:
                similar_phrase_pair += 1
        elif len(prev_str) - len(after_str) >= 6: # difference: at least 2 tags and prev_str is longer
            if compare(prev_str[-len(after_str):], after_str) > 0.7 or compare(
                prev_str[-len(after_str)-3 :], after_str) > 0.8 or compare(
                prev_str[-len(after_str)-6 :], after_str) > 0.8:
                similar_phrase_pair += 1
        elif len(prev_str) - len(after_str) == -3:  # difference: 1 tag and after_str is longer
            if compare(prev_str, after_str[: len(prev_str)]) > 0.7 or compare(
                prev_str, after_str[: len(prev_str) + 3]) > 0.8:
                similar_phrase_pair += 1
        elif len(prev_str) - len(after_str) <= -6:   # difference: at least 2 tags and after_str is longer
            if compare(prev_str, after_str[: len(prev_str)]) > 0.7 or compare(
                prev_str, after_str[: len(prev_str) + 3]) > 0.8 or compare(
                prev_str, after_str[: len(prev_str) + 6]) > 0.8:
                similar_phrase_pair += 1
    
    elif len(prev_str) == 6 or len(after_str) == 6:    # at least one of the sub-sentence contains only two words

        if len(prev_str) >= len(after_str):
            if compare(prev_str[-len(after_str) :], after_str) == 1:     # both sequences within the same span should be identical to be considered as parallel
                similar_phrase_pair += 1
            
        if len(prev_str) < len(after_str):
            if compare(prev_str, after_str[: len(prev_str)]) == 1:
                similar_phrase_pair += 1
                
        
        
    return similar_phrase_pair

In [9]:

def similarity(sent):
    similar_phrase_pair = 0
    norm_sent = re.sub("...?_CC (also|not)_RB", ",_,", sent)  # nomalization: change "but_CC also_RB/not_RB" this kind of pattern into ",_,"
    norm_sent = re.sub(",_,.?,_,", ",_,", norm_sent)     # last step could result with ",_, ,_,"
    norm_sent = re.sub(",_, ...?_CC", ",_,", norm_sent)  # nomalization: change ",_, and_CC/or_CC/but_CC" this kind of pattern into ",_,"
    coor_indexes = get_index_of_coor(norm_sent)
    process_sent = norm_sent.split()[: -1]         # chop the "sentence closer off"
           
    if len(coor_indexes) == 1:   # only 2 sub-sentences
        prev, after = process_sent[: coor_indexes[0]], process_sent[coor_indexes[0] + 1 :]
        prev_str = "".join([w[w.index("_") : w.index("_") + 3] for w in prev])
        after_str = "".join([w[w.index("_") : w.index("_") + 3] for w in after])
        similar_phrase_pair += simi_structure(prev_str, after_str)
        
    elif len(coor_indexes) > 1:
        prev = process_sent[: coor_indexes[0]]
        for ii in range(len(coor_indexes) - 1):
            after = process_sent[coor_indexes[ii] + 1 : coor_indexes[ii + 1]]
            prev_str = "".join([w[w.index("_") : w.index("_") + 3] for w in prev])
            after_str = "".join([w[w.index("_") : w.index("_") + 3] for w in after])
            #print("pre", prev_str, "af", after_str)
            similar_phrase_pair += simi_structure(prev_str, after_str)
            prev = after
        after = process_sent[coor_indexes[-1] + 1 :]
        prev_str = "".join([w[w.index("_") : w.index("_") + 3] for w in prev])
        after_str = "".join([w[w.index("_") : w.index("_") + 3] for w in after])
        #print("pre", prev_str, "af", after_str)
        similar_phrase_pair += simi_structure(prev_str, after_str)
        
    if similar_phrase_pair == 0:
        if len(process_sent) > 4:   # at least 4 words
            tag_sequence = [w[w.index("_") : w.index("_") + 3] for w in process_sent]
            #print(tag_sequence)
            tag_bigram_list = [bi for bi in nltk.bigrams(tag_sequence)]
            for index_w in range(len(tag_sequence) - 4):
                if tag_sequence[index_w] == tag_sequence[index_w + 2] == tag_sequence[index_w + 4]:
                    for index in range(len(tag_bigram_list) - 2):
                        if tag_bigram_list[index] == tag_bigram_list[index + 2] and (tag_bigram_list[index][0] == "_CC" or tag_bigram_list[index][0] == "_,"):
                            similar_phrase_pair += 1
            
    
    return similar_phrase_pair


    
test_sent1 = "In_IN the_DT year_NN of_IN America_NNP 's_POS birth_NN ,_, in_IN the_DT coldest_JJS of_IN months_NNS ,_, a_DT small_JJ band_NN of_IN patriots_NNS huddled_VBN by_IN dying_VBG campfires_NNS on_IN the_DT shores_NNS of_IN an_DT icy_NN river_NN ._."
test_sent2 = "And_CC each_DT day_NN brings_VBZ further_JJ evidence_NN that_IN the_DT ways_NNS we_PRP use_VBP energy_NN strengthen_VB our_PRP$ adversaries_NNS and_CC threaten_VB our_PRP$ planet_NN ._."
test_sent3 = "America_NNP has_VBZ never_RB been_VBN united_VBN by_IN blood_NN or_CC birth_NN or_CC soil_NN ._."  
t4 = "I_PRP thank_VBP President_NNP Bush_NNP for_IN his_PRP$ service_NN to_TO our_PRP$ Nation_NNP ,_, as_RB well_RB as_IN the_DT generosity_NN and_CC cooperation_NN he_PRP has_VBZ shown_VBN throughout_IN this_DT transition_NN ._."
t5 = " an_DT education_NN system_NN ,_, flush_NN with_IN cash_NN ,_, but_CC which_WDT leaves_VBZ our_PRP$ young_JJ and_CC beautiful_JJ students_NNS deprived_VBN of_IN all_DT knowledge_NN ;_:"
t6 = "Our_PRP$ Nation_NN is_VBZ at_IN war_NN against_IN a_DT far-reaching_JJ network_NN of_IN violence_NN and_CC hatred_NN ._."
t7 = "The_DT peaceful_JJ transfer_NN of_IN authority_NN is_VBZ rare_JJ in_IN history_NN ,_, yet_RB common_JJ in_IN our_PRP$ country_NN ._."
t8 = "We_PRP will_MD build_VB new_JJ roads_NNS and_CC highways_NNS and_CC bridges_NNS and_CC airports_NNS and_CC tunnels_NNS and_CC railways_NNS all_DT across_IN our_PRP$ wonderful_JJ Nation_NN ._."
t9 = "So_RB when_WRB I_PRP was_VBD a_DT little_JJ girl_NN ,_, a_DT book_NN sat_VBD on_IN the_DT coffee_NN table_NN in_IN our_PRP$ living_NN room_NN ,_, just_RB steps_NNS from_IN our_PRP$ front_JJ door_NN ._."
t10 = "And_CC we_PRP will_MD reduce_VB taxes_NNS to_TO recover_VB the_DT momentum_NN of_IN our_PRP$ economy_NN and_CC reward_VB the_DT effort_NN and_CC enterprise_NN of_IN working_VBG Americans_NNPS ._."
t11 = "It_PRP is_VBZ the_DT firefighter_NN 's_POS courage_NN to_TO storm_VB a_DT stairway_NN filled_VBN with_IN smoke_NN ,_, but_CC also_RB a_DT parent_NN 's_POS willingness_NN to_TO nurture_VB a_DT child_NN ,_, that_WDT finally_RB decides_VBZ our_PRP$ fate_NN ._."
t12 = " a_DT book_NN sat_VBD on_IN the_DT coffee_NN table_NN in_IN our_PRP$ living_NN room_NN ,_, just_RB steps_NNS from_IN our_PRP$ front_JJ door_NN"
print("t1: ", similarity(test_sent1))
print("t2: ", similarity(test_sent2))
print("t3: ", similarity(test_sent3))
print("t4: ", similarity(t4))
print("t5: ", similarity(t5))
print("t6: ", similarity(t6))
print("t7: ", similarity(t7))
print("t8: ", similarity(t8))
print("t9: ", similarity(t9))
print("t10: ", similarity(t10))
print("t11: ", similarity(t11))
print("t12: ", similarity(t12))

t1:  1
t2:  1
t3:  1
t4:  0
t5:  0
t6:  0
t7:  1
t8:  28
t9:  1
t10:  0
t11:  0
t12:  0


In [10]:
similarity("Online_NN by_IN Gerhard_NNP Peters_NNP and_CC John_NNP T._NNP Woolley_NNP ,_, The_NNP American_NNP Presidency_NNP Project_NNP ._.")

2

In [32]:
def get_features(sent):
    feature = {}
    feature["simi"] = similarity(sent) > 0
    return feature

In [33]:
# define this function for evaluation
def compute_PRF(gold, predicted, class_label):
    TP = sum(int(g == class_label and p == class_label) for (g, p) in zip (gold, predicted))
    FP = sum(int(p == class_label and g != class_label) for (g, p) in zip (gold, predicted)) 
    FN = sum(int(p != class_label and g == class_label) for (g, p) in zip (gold, predicted))
    if TP + FP > 0:
        precision = TP/(TP + FP)
    else:
        precision = 0
    if TP + FN > 0:
        recall = TP/(TP + FN)
    else:
        recall = 0
    if precision > 0 and recall > 0:
        f_measure = 2 * precision * recall / (precision + recall)
    else:
        f_measure = 0
    
    return 'Precision=%.2f Recall=%.2f F_Measure=%.2f'  %  (precision, recall, f_measure)

In [34]:
data_set_raw1 = []
for file in os.listdir("American-Inaugural-Address-Corpus/Tagged"):
    if file.endswith("csv"):
        with open("American-Inaugural-Address-Corpus/Tagged/" + file, encoding = "utf-8") as f:
            reader = csv.reader(f, delimiter = ",")
            rows = [row for row in reader]
            data_set_raw1.extend(rows[1:])   # the first row is the header ["sentence", "Tag"], so not needed
print(len(data_set_raw1))
data_set = [(sent, tag) for [sent, tag] in data_set_raw1]
print(data_set[:5])

380
[('Inaugural_JJ Address_NNP January_NNP 20_CD ,_, 2009_CD Public_NNP Papers_NNP of_IN the_DT Presidents_NNS Barack_NNP Obama_NNP <_JJR br_NN >_JJR 2009_CD :_:\n', 'f'), (' Book_VB I_PRP Barack_NNP Obama_NNP 2009_CD :_:\n', 'f'), (' Book_VB I_PRP Location_NNP :_:\n', 'f'), (' District_NNP of_IN Columbia_NNP Washington_NNP The_NNP American_NNP Presidency_NNP Project_NNP\n', 'f'), ('My_PRP$ fellow_JJ citizens_NNS ,_, I_PRP stand_VBP here_RB today_NN humbled_VBN by_IN the_DT task_NN before_IN us_PRP ,_, grateful_JJ for_IN the_DT trust_NN you_PRP have_VBP bestowed_VBN ,_, mindful_JJ of_IN the_DT sacrifices_NNS borne_VBN by_IN our_PRP$ ancestors_NNS ._.', 't')]


In [145]:
random.Random(2).shuffle(data_set) 
featuresets = [(get_features(sent), tag) for (sent, tag) in data_set ]
size = len(featuresets)
train_set, devtest_set = featuresets[:size*6//10], featuresets[size*6//10:]
print(len(devtest_set))

152


In [146]:
classifier1 = nltk.NaiveBayesClassifier.train(train_set)

In [147]:
nltk.classify.accuracy(classifier1, devtest_set)

0.9276315789473685

In [148]:
classifier1.show_most_informative_features()

Most Informative Features
                    simi = True                t : f      =     10.8 : 1.0
                    simi = False               f : t      =      4.9 : 1.0


In [149]:
gold = [tag for (sent_feature, tag) in devtest_set]
pred = [classifier1.classify(sent_feature) for (sent_feature, tag) in devtest_set]
compute_PRF(gold, pred, "t")

'Precision=0.84 Recall=0.93 F_Measure=0.89'

In [86]:
classifier1.classify(get_features("We_PRP treat_VBP it_PRP like_IN a_DT nice-to-have_JJ instead_RB of_IN a_DT must-have_JJ ._."))

'f'

In [150]:
data_set_raw = []
for file in os.listdir("American-Inaugural-Address-Corpus/Tagged"):
    if file.endswith("csv"):
        with open("American-Inaugural-Address-Corpus/Tagged/" + file, encoding = "utf-8") as f:
            reader = csv.reader(f, delimiter = ",")
            rows = [row for row in reader]
            data_set_raw.extend(rows[1:])
print(data_set_raw[:3])

[['Inaugural_JJ Address_NNP January_NNP 20_CD ,_, 2009_CD Public_NNP Papers_NNP of_IN the_DT Presidents_NNS Barack_NNP Obama_NNP <_JJR br_NN >_JJR 2009_CD :_:\n', 'f'], [' Book_VB I_PRP Barack_NNP Obama_NNP 2009_CD :_:\n', 'f'], [' Book_VB I_PRP Location_NNP :_:\n', 'f']]


In [151]:
transformed_data = []
for [sent, tag] in data_set_raw:
    features_tag = []
    simi = similarity(sent) 
    features_tag.extend([sent,simi,tag])
    transformed_data.append(features_tag)  # use append to maintain list form
len(transformed_data)

380

In [152]:
df_data = pd.DataFrame(transformed_data, columns = ['sent','simi','tag'])
df_data.head(20)


Unnamed: 0,sent,simi,tag
0,"Inaugural_JJ Address_NNP January_NNP 20_CD ,_,...",0,f
1,Book_VB I_PRP Barack_NNP Obama_NNP 2009_CD :_:\n,0,f
2,Book_VB I_PRP Location_NNP :_:\n,0,f
3,District_NNP of_IN Columbia_NNP Washington_NN...,0,f
4,"My_PRP$ fellow_JJ citizens_NNS ,_, I_PRP stand...",1,t
5,I_PRP thank_VBP President_NNP Bush_NNP for_IN ...,0,f
6,Forty-four_CD Americans_NNPS have_VBP now_RB t...,0,f
7,The_DT words_NNS have_VBP been_VBN spoken_VBN ...,1,t
8,"Yet_CC every_DT so_RB often_RB ,_, the_DT oath...",1,t
9,"At_IN these_DT moments_NNS ,_, America_NNP has...",0,t


In [153]:
df_data["tag_num"] = df_data.tag.map({'f' : 0, 't' : 1})
df_data.to_excel("American-Inaugural-Address-Corpus/Tagged/transformed.xlsx", 
                 encoding = "utf-8", index = False)

In [154]:
feature_cols = ['simi']
# select all 10 features (X)
X = df_data[feature_cols]
# select numerical tag as response (y)
y = df_data.tag_num 

In [155]:
# use cross validation to evaluate the results:
from sklearn.cross_validation import train_test_split
from sklearn import metrics

In [156]:
sent_train, sent_dev, tag_train, tag_dev = train_test_split(X, y, test_size = 0.25, random_state = 1)
print(sent_train.shape)
print(sent_dev.shape)
print(tag_train.shape)
print(tag_dev.shape)

(285, 1)
(95, 1)
(285,)
(95,)


In [157]:
# Try different algorithms
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC 

In [158]:
# initialize classifiers
knn = KNeighborsClassifier(n_neighbors = 1)
lr = LogisticRegression()
svc = LinearSVC()

In [159]:
def tryClassifier(clf):
    print(type(clf))
    clf.fit(sent_train, tag_train)
    tag_pred = clf.predict(sent_dev)
    print('Acc: ', metrics.accuracy_score(tag_dev, tag_pred))
    print(compute_PRF(tag_dev, tag_pred, 1))

In [160]:
tryClassifier(knn)
tryClassifier(lr)
tryClassifier(svc)

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
Acc:  0.9263157894736842
Precision=0.82 Recall=0.92 F_Measure=0.87
<class 'sklearn.linear_model.logistic.LogisticRegression'>
Acc:  0.9263157894736842
Precision=0.82 Recall=0.92 F_Measure=0.87
<class 'sklearn.svm.classes.LinearSVC'>
Acc:  0.9263157894736842
Precision=0.82 Recall=0.92 F_Measure=0.87


In [81]:
def process_file_sent(path):   # path of the file as string
    with open (path, encoding = "utf-8") as f:   
        raw = f.read()
        tokenized_text = word_tokenize(raw)              # tokenizing
        tagged_text = pos_tagger_en.tag(tokenized_text)  # pos-tagging
        
    with open (path + "processed", "w", encoding = "utf-8") as processed_f:
        write_text = ""
        for (a, b) in tagged_text:
            write_text += a + "_" + b +" "   # combine word and tag together in the format: "My_PRP$ "
            
            # add newline character after the character which marks the end of a sentence 
        result = write_text.replace("_. ", "_.\n").replace(";_:", ";_:\n").replace(
            ":_:", ":_:\n")   # "_." include "!", "?" these sentence closers
                
        processed_f.write(result)       # write all the results into corresponding .txt file         
            
    with open (path + "processed") as f:
        sentences = []
        for row in f:
            sentences.append(row)  

# Get corresponding .xlsx files which contain every sentence of the speech in the first column.
        
        df_sent = pd.DataFrame({"sentence" : sentences[:len(sentences)]})
        df_sent.to_csv (path + "processed_sent.csv", encoding = "utf-8", index = False)
        
            

In [549]:
def transform (path): # path of the file as string
    with open (path + "processed_sent.csv") as f:
        reader = csv.reader(f, delimiter = ",")
        rows = [row for row in reader]
        transformed_data = []
        for [sent] in rows[1:]:   # "sentence" in the first row?
            features = []
            simi = similarity(sent) > 0
            features.extend([sent, simi])
            transformed_data.append(features)  # use append to maintain list form
        
        df_data = pd.DataFrame(transformed_data, columns = ['sent','simi'])
        
    return df_data

In [557]:
def tag_file_sklearn (path, clf): # path of the file as string
    df_data = transform(path)
    X_features_test = df_data[feature_cols]
    y_tag_pred = clf.predict(X_features_test)
    df_data["tag"] = y_tag_pred
    df_data["tag"] = df_data.tag.map({ 0 : 'FALSE' , 1 : 'TRUE'})
    df_data.to_csv(path + "tagged.csv", encoding = "utf-8", index = False)

In [558]:
def tag_file_nltk (path, clf): # path of the file as string
    with open (path, encoding = "utf-8") as f:   
        raw = f.read()
        tokenized_text = word_tokenize(raw)              # tokenizing
        tagged_text = pos_tagger_en.tag(tokenized_text)  # pos-tagging
        
    with open (path + "processed", "w", encoding = "utf-8") as processed_f:
        write_text = ""
        for (a, b) in tagged_text:
            write_text += a + "_" + b +" "   # combine word and tag together in the format: "My_PRP$ "
            
            # add newline character after the character which marks the end of a sentence 
        result = write_text.replace("_. ", "_.\n").replace(";_:", ";_:\n").replace(
            ":_:", ":_:\n")   # "_." include "!", "?" these sentence closers
                
        processed_f.write(result)       # write all the results into corresponding .txt file         
            
    with open (path + "processed") as f:
        sentences = []
        for row in f:
            sentences.append(row)
        df_data = pd.DataFrame({"sentence" : sentences[:len(sentences)]})    
        tag_pred = [clf.classify(get_features(sent)) for sent in sentences] 
        df_data["tag"] = tag_pred
        df_data["tag"] = df_data.tag.map({ "f" : 'FALSE' , 't' : 'TRUE'})
        df_data.to_csv(path + "tagged.csv", encoding = "utf-8", index = False)

In [559]:
process_file_sent("/Users/genecosmo/Desktop/Computer Applications in Linguistics/test_classifier/phase2_svc/testted.txt")
transform("/Users/genecosmo/Desktop/Computer Applications in Linguistics/test_classifier/phase2_svc/testted.txt")
tag_file_sklearn("/Users/genecosmo/Desktop/Computer Applications in Linguistics/test_classifier/phase2_svc/testted.txt", svc)

In [560]:
process_file_sent("/Users/genecosmo/Desktop/Computer Applications in Linguistics/test_classifier/phase2_nb/testted.txt")
transform("/Users/genecosmo/Desktop/Computer Applications in Linguistics/test_classifier/phase2_nb/testted.txt")
tag_file_nltk("/Users/genecosmo/Desktop/Computer Applications in Linguistics/test_classifier/phase2_nb/testted.txt", classifier1)