In [25]:
import codecs
# Per-language dataset paths (EN, CN, SG, FR), relative to the working directory.
# Suffix meaning, as used by the calls further down in this notebook:
#   *_train    - labelled training file read by the parameter estimators
#   *_test     - dev input handed to the taggers
#   *_output   - output path passed to sentiment_analysis
#   *_gold     - gold dev file (not referenced by any code shown in this notebook)
#   *_modified - rewritten training file produced by modified_training_set
#   *_viterbi  - output path passed to viterbi_algo
#   *_maxmin   - output path passed to maxmin_topk (only defined for EN and FR)
EN_train = "./EN/train"
EN_test = "./EN/dev.in"
EN_output = "./EN/dev.p2.out"
EN_gold = "./EN/dev.out"
EN_viterbi = "./EN/dev.p3.out"
EN_modified = "./EN/modified_train"
EN_maxmin = "./EN/dev.p4.out"

CN_train = "./CN/train"
CN_test = "./CN/dev.in"
CN_output = "./CN/dev.p2.out"
CN_gold = "./CN/dev.out"
CN_modified = "./CN/modified_train"
CN_viterbi = "./CN/dev.p3.out"

SG_train = "./SG/train"
SG_modified = "./SG/modified_train"
SG_test = "./SG/dev.in"
SG_output = "./SG/dev.p2.out"
SG_gold = "./SG/dev.out"
SG_viterbi = "./SG/dev.p3.out"

FR_train = "./FR/train"
FR_modified = "./FR/modified_train"
FR_test = "./FR/dev.in"
FR_output = "./FR/dev.p2.out"
FR_gold = "./FR/dev.out"
FR_viterbi = "./FR/dev.p3.out"
FR_maxmin = "./FR/dev.p4.out"


def modified_training_set(train_file, modified_train_set, k=3):
    """Write a copy of ``train_file`` in which rare words become ``#UNK#``.

    Each non-blank line of ``train_file`` is expected to hold a word followed
    by its label, separated by whitespace. Words seen fewer than ``k`` times
    are replaced by the ``#UNK#`` token, which is the same token the taggers
    in this notebook look up for unseen words.

    Args:
        train_file: path of the labelled training file (UTF-8).
        modified_train_set: path of the file to write.
        k: minimum number of occurrences for a word to be kept (default 3).

    Notes:
        * The output is plain UTF-8 without a byte-order mark. A BOM would be
          glued to the first word when the file is read back with
          ``encoding='utf-8'``.
        * Blank lines (sentence boundaries) are kept, so the output has the
          same sentence structure as the input.
    """
    unk_token = "#UNK#"

    # First pass: count how often every word occurs.
    word_count = {}
    with open(train_file, encoding='utf-8') as file:
        for line in file:
            pair = line.split()
            if pair:
                word_count[pair[0]] = word_count.get(pair[0], 0) + 1

    # A set gives constant-time membership tests in the second pass.
    rare_words = {word for word, count in word_count.items() if count < k}

    # Second pass: rewrite the file, substituting the rare words.
    with open(train_file, encoding='utf-8') as file2, \
            open(modified_train_set, 'w', encoding='utf-8') as fout:
        for line in file2:
            pair2 = line.split()
            if not pair2:
                fout.write("\n")
                continue
            word = pair2[0]
            sentiment = pair2[1]
            if word in rare_words:
                word = unk_token
            fout.write(word + " " + sentiment + "\n")


def emission_params(train_file):
    """Estimate emission scores from a labelled training file.

    Every non-blank line must hold a word followed by its label. The score
    stored for (word, label) is count(word, label) / (count(label) + 1). An
    extra entry under the key '#UNK#' holds 1 / (count(label) + 1) for every
    label and replaces any '#UNK#' entry gathered from the file itself.

    Returns:
        (scores, label_totals): ``scores[word][label]`` is the emission score
        and ``label_totals[label]`` is the raw number of lines with that label.
    """
    per_word = {}
    per_label = {}
    with open(train_file, encoding = 'utf-8') as handle:
        for raw in handle:
            tokens = raw.split()
            if not tokens:
                continue
            word, sentiment = tokens[0], tokens[1]
            # count of (word, label)
            by_label = per_word.setdefault(word, {})
            by_label[sentiment] = by_label.get(sentiment, 0) + 1
            # count of label
            per_label[sentiment] = per_label.get(sentiment, 0) + 1

    # Turn the raw counts into scores in place.
    for by_label in per_word.values():
        for sentiment in by_label:
            by_label[sentiment] /= (per_label[sentiment] + 1)

    # Fallback entry used for words that never occurred in training.
    per_word['#UNK#'] = {
        sentiment: 1 / (total + 1) for sentiment, total in per_label.items()
    }
    return (per_word, per_label)
                          

def sentiment_analysis(test_file,output_file,emission_params, label_count):
    """Label each word with its highest-scoring emission label.

    Reads ``test_file`` line by line. For a non-blank line the first token is
    written to ``output_file`` followed by the label that has the largest
    score in ``emission_params[word]``; a word that is not a key of
    ``emission_params`` uses the '#UNK#' entry instead. Blank input lines are
    copied as blank lines. ``label_count`` is accepted but not used.
    The output is opened with the 'utf-8-sig' codec.
    """
    with open(test_file, encoding ='utf-8') as source, codecs.open(output_file, 'w', 'utf-8-sig') as sink:
        for raw in source:
            tokens = raw.split()
            if not tokens:
                sink.write('\n')
                continue
            word = tokens[0]
            # Unseen words share the '#UNK#' score table.
            lookup = word if word in emission_params else '#UNK#'
            scores = emission_params[lookup]
            sink.write(word+" "+max(scores, key=scores.get)+'\n')

                
# For each language: rewrite the training file via modified_training_set,
# estimate emission scores from the rewritten file, then tag the dev input
# with sentiment_analysis. The *_EN/_FR/_CN/_SG results are module-level
# names that the later cells reuse.
modified_training_set(EN_train, EN_modified)                
emission_params_EN, label_count_EN = emission_params(EN_modified)
sentiment_analysis(EN_test,EN_output,emission_params_EN, label_count_EN)

modified_training_set(FR_train, FR_modified)                
emission_params_FR, label_count_FR = emission_params(FR_modified)
sentiment_analysis(FR_test,FR_output,emission_params_FR, label_count_FR)

modified_training_set(CN_train, CN_modified)                
emission_params_CN,label_count_CN = emission_params(CN_modified)
sentiment_analysis(CN_test,CN_output,emission_params_CN, label_count_CN)

modified_training_set(SG_train, SG_modified)                
emission_params_SG,label_count_SG = emission_params(SG_modified)
sentiment_analysis(SG_test,SG_output,emission_params_SG, label_count_SG)

In [26]:
def transition_params(train_file):
    """Estimate transition probabilities from a labelled training file.

    Non-blank lines hold a word followed by its label; blank lines separate
    sentences. Every sentence contributes START -> first label, label -> next
    label, and last label -> STOP.

    Args:
        train_file: path of the labelled training file (UTF-8).

    Returns:
        dict: ``result[to_state][from_state]`` is
        count(from_state -> to_state) / count(from_state). ``result['STOP']``
        is always present, possibly empty.

    Notes:
        * A sentence that reaches end-of-file without a trailing blank line
          still gets its STOP transition counted.
        * Repeated or leading blank lines are ignored rather than counted as
          an empty START -> STOP sentence.
    """
    start, stop = 'START', 'STOP'
    transition_count = {stop: {}}
    state_count = {start: 0, stop: 0}

    def count_transition(from_state, to_state):
        # transition_count is keyed by destination first, then by origin.
        origins = transition_count.setdefault(to_state, {})
        origins[from_state] = origins.get(from_state, 0) + 1

    prev = start
    with open(train_file, encoding = 'utf-8') as file:
        for line in file:
            pair = line.split()
            if pair:
                sentiment = pair[1]
                count_transition(prev, sentiment)
                if prev == start:
                    # One START (and one STOP) per sentence.
                    state_count[start] += 1
                    state_count[stop] += 1
                state_count[sentiment] = state_count.get(sentiment, 0) + 1
                prev = sentiment
            elif prev != start:
                # Blank line that really ends a sentence.
                count_transition(prev, stop)
                prev = start

    # File ended in the middle of a sentence: close it.
    if prev != start:
        count_transition(prev, stop)

    for V in transition_count:
        for U in transition_count[V]:
            transition_count[V][U] /= state_count[U]
    return transition_count


def viterbi_algo(test_file, output_file, transition_params, emission_params, labels):
    """Tag every sentence of ``test_file`` and write the result.

    The input holds one word per line with blank lines between sentences.
    Each sentence is scored with ``calculate_node_scores`` and decoded with
    ``backtracking``; the resulting "word label" lines are written to
    ``output_file`` (opened with the 'utf-8-sig' codec), followed by a blank
    line per sentence.

    Args:
        test_file: path of the unlabelled input (UTF-8).
        output_file: path of the file to write.
        transition_params: ``[to_state][from_state]`` transition scores.
        emission_params: ``[word][label]`` emission scores with a '#UNK#' entry.
        labels: mapping whose keys are the candidate labels.

    A final sentence that is not followed by a blank line is tagged too,
    instead of being dropped.
    """
    sentences = []

    with open(test_file, encoding ='utf-8') as ifile, codecs.open(output_file, 'w', 'utf-8-sig') as ofile:
        sentence = []
        for line in ifile:
            tokens = line.split()
            if tokens:
                sentence.append(tokens[0])
            else:
                sentences.append(sentence)
                sentence = []
        # Input ended without a trailing blank line: keep the pending words.
        if sentence:
            sentences.append(sentence)

        for s in sentences:
            nodes = calculate_node_scores(s,transition_params, emission_params, labels)
            labelled_sentence = backtracking(s,nodes)
            for word in labelled_sentence:
                ofile.write(word+'\n')
            ofile.write("\n")

        
def calculate_node_scores(s, transition_params, emission_params, labels):
    """Build the Viterbi score table for one sentence.

    Args:
        s: list of words.
        transition_params: ``[to_state][from_state]`` transition scores;
            missing origins count as 0.
        emission_params: ``[word][label]`` emission scores. A known word with
            a missing label scores 0; an unknown word uses the '#UNK#' entry.
        labels: mapping whose keys are the candidate labels.

    Returns:
        dict: ``nodes[k][state] == [best_score, best_previous_state]`` where
        layer 0 holds only 'START', layers 1..len(s) hold one entry per label
        and layer len(s)+1 holds only 'STOP'. On equal scores the
        later-visited previous state wins (comparison is ``>=``).
    """
    nodes = {0: {'START': [1, 'nil']}}
    length = len(s)

    for position in range(1, length + 1):
        token = s[position - 1]
        for label in labels.keys():
            previous_layer = nodes[position - 1]

            # emission score of this token under this label
            if token in emission_params:
                emit = emission_params[token].get(label, 0)
            else:
                emit = emission_params['#UNK#'][label]

            incoming = transition_params[label]
            best_score, best_parent = 0, 'nil'
            for earlier, entry in previous_layer.items():
                candidate = entry[0] * incoming.get(earlier, 0) * emit
                if candidate >= best_score:
                    best_score, best_parent = candidate, earlier

            nodes.setdefault(position, {})[label] = [best_score, best_parent]

    # closing step: move from the last word layer into STOP
    final_layer = nodes[length]
    best_score, best_parent = 0, 'nil'
    for earlier, entry in final_layer.items():
        candidate = entry[0] * transition_params['STOP'].get(earlier, 0)
        if candidate >= best_score:
            best_score, best_parent = candidate, earlier
    nodes[length + 1] = {'STOP': [best_score, best_parent]}

    return nodes


def backtracking(s, nodes):
    """Follow the back-pointers in ``nodes`` from 'STOP' and label ``s``.

    ``s`` is modified in place: every word gets " <label>" appended, working
    from the last word to the first. The same list object is returned.
    """
    state = 'STOP'
    for position in reversed(range(len(s))):
        # Word at index `position` is labelled by the back-pointer stored two
        # layers further on (layer 0 is START, layer len(s)+1 is STOP).
        state = nodes[position + 2][state][1]
        s[position] = s[position] + " " + state
    return s

# Viterbi decoding for each language. Transition scores are estimated from the
# original *_train files, while the emission scores and label counts reused
# here were estimated from the *_modified files in the earlier cell.
transition_params_EN = transition_params(EN_train)
viterbi_algo(EN_test, EN_viterbi, transition_params_EN, emission_params_EN, label_count_EN)
transition_params_FR = transition_params(FR_train)
viterbi_algo(FR_test, FR_viterbi, transition_params_FR, emission_params_FR, label_count_FR)
transition_params_CN = transition_params(CN_train)
viterbi_algo(CN_test, CN_viterbi, transition_params_CN, emission_params_CN, label_count_CN)
transition_params_SG = transition_params(SG_train)
viterbi_algo(SG_test, SG_viterbi, transition_params_SG, emission_params_SG, label_count_SG)

# EN #
# #Entity in gold data: 226
# #Entity in prediction: 584
# #Correct Entity: 109
# Entity Precision: 0.1866
# Entity recall: 0.4823
# Entity F: 0.2691
# #Correct Sentiment: 57
# Sentiment precision: 0.0976
# Sentiment recall: 0.2522
# Sentiment F: 0.1407

# FR #
# #Entity in gold data: 223
# #Entity in prediction: 582
# #Correct Entity: 107
# Entity Precision: 0.1838
# Entity recall: 0.4798
# Entity F: 0.2658
# #Correct Sentiment: 60
# Sentiment precision: 0.1031
# Sentiment recall: 0.2691
# Sentiment F: 0.1491

# CN #
# #Entity in gold data: 362
# #Entity in prediction: 666
# #Correct Entity: 92
# Entity Precision: 0.1381
# Entity recall: 0.2541
# Entity F: 0.1790
# #Correct Sentiment: 46
# Sentiment precision: 0.0691
# Sentiment recall: 0.1271
# Sentiment F: 0.0895

# SG #
# #Entity in gold data: 1382
# #Entity in prediction: 1559
# #Correct Entity: 456
# Entity Precision: 0.2925
# Entity recall: 0.3300
# Entity F: 0.3101
# #Correct Sentiment: 249
# Sentiment precision: 0.1597
# Sentiment recall: 0.1802
# Sentiment F: 0.1693

In [42]:


#Part 4
def maxmin_topk(test_file, output_file, transition_params, emission_params, labels, top_k, i_th):
    """Tag every sentence of ``test_file`` with its ``i_th`` best path.

    The input holds one word per line with blank lines between sentences.
    Each sentence is scored with ``calculate_topk_node_scores`` (keeping
    ``top_k`` candidates per state) and decoded with ``backtracking_topk``
    starting from the ``i_th`` (1-based) entry of the STOP layer. The
    "word label" lines are written to ``output_file`` (opened with the
    'utf-8-sig' codec), followed by a blank line per sentence.

    Args:
        test_file: path of the unlabelled input (UTF-8).
        output_file: path of the file to write.
        transition_params: ``[to_state][from_state]`` transition scores.
        emission_params: ``[word][label]`` emission scores with a '#UNK#' entry.
        labels: mapping whose keys are the candidate labels.
        top_k: number of candidates kept per state.
        i_th: 1-based rank of the path to output.

    A final sentence that is not followed by a blank line is tagged too,
    instead of being dropped.
    """
    sentences = []

    with open(test_file, encoding ='utf-8') as ifile, codecs.open(output_file, 'w', 'utf-8-sig') as ofile:
        sentence = []
        for line in ifile:
            tokens = line.split()
            if tokens:
                sentence.append(tokens[0])
            else:
                sentences.append(sentence)
                sentence = []
        # Input ended without a trailing blank line: keep the pending words.
        if sentence:
            sentences.append(sentence)

        for s in sentences:
            nodes = calculate_topk_node_scores(s,transition_params, emission_params, labels, top_k)
            labelled_sentence = backtracking_topk(s,nodes, i_th)
            for word in labelled_sentence:
                ofile.write(word+'\n')
            ofile.write("\n")


def calculate_topk_node_scores(s, transition_params, emission_params, labels, top_k):
    """Build a Viterbi table that keeps the ``top_k`` best entries per state.

    Args:
        s: list of words.
        transition_params: ``[to_state][from_state]`` transition scores;
            missing origins count as 0.
        emission_params: ``[word][label]`` emission scores. A known word with
            a missing label scores 0; an unknown word uses the '#UNK#' entry.
        labels: mapping whose keys are the candidate labels.
        top_k: maximum number of entries kept per state.

    Returns:
        dict: ``nodes[k][state]`` is a list of up to ``top_k`` entries
        ``[score, previous_state, index_in_previous_state_list]`` ordered by
        descending score (stable for equal scores). Layer 0 holds only
        'START' and layer len(s)+1 holds only 'STOP'.
    """
    def best_first(candidates):
        # sorted() is stable, so equal scores keep their generation order.
        return sorted(candidates, key=lambda entry: entry[0], reverse=True)[:top_k]

    nodes = {0: {'START': [[1, 'nil', 0]]}}
    length = len(s)

    for position in range(1, length + 1):
        token = s[position - 1]
        for label in labels.keys():
            previous_layer = nodes[position - 1]

            # emission score of this token under this label
            if token in emission_params:
                emit = emission_params[token].get(label, 0)
            else:
                emit = emission_params['#UNK#'][label]

            incoming = transition_params[label]
            candidates = [
                [entry[0] * incoming.get(earlier, 0) * emit, earlier, rank]
                for earlier, kept in previous_layer.items()
                for rank, entry in enumerate(kept)
            ]
            nodes.setdefault(position, {})[label] = best_first(candidates)

    # closing step: move from the last word layer into STOP
    final_layer = nodes[length]
    closing = []
    for earlier, kept in final_layer.items():
        into_stop = transition_params['STOP'].get(earlier, 0)
        for rank, entry in enumerate(kept):
            closing.append([entry[0] * into_stop, earlier, rank])
    nodes[length + 1] = {'STOP': best_first(closing)}

    return nodes


def backtracking_topk(s, nodes, i_th):
    """Follow the back-pointers of the ``i_th`` best path and label ``s``.

    Decoding starts from entry ``i_th - 1`` of the 'STOP' list; after that
    each entry's stored index selects which entry of the previous state to
    follow. ``s`` is modified in place (" <label>" is appended to every word,
    last word first) and the same list object is returned.
    """
    state = 'STOP'
    rank = i_th - 1
    for position in reversed(range(len(s))):
        # Layer 0 is START, so word `position` is labelled from layer position+2.
        entry = nodes[position + 2][state][rank]
        state, rank = entry[1], entry[2]
        s[position] += " " + state
    return s

# Top-k decoding for EN and FR: keep 7 candidates per state (top_k=7) and
# output the path that starts from the first STOP entry (i_th=1).
maxmin_topk(EN_test, EN_maxmin, transition_params_EN, emission_params_EN, label_count_EN, 7, 1)
maxmin_topk(FR_test, FR_maxmin, transition_params_FR, emission_params_FR, label_count_FR, 7, 1)


def fwd_bkw(s, labels, transition_params, emission_params):
    """Run the forward-backward algorithm on one sentence.

    The parameter layout matches the rest of this notebook:
    ``transition_params[to_state][from_state]`` (with 'START' as an origin
    and 'STOP' as a destination) and ``emission_params[word][label]`` with a
    '#UNK#' entry for words that are not keys of ``emission_params``.
    Missing transition or emission entries count as 0.

    Args:
        s: sequence of words (list or tuple).
        labels: iterable of candidate labels (a dict's keys work).
        transition_params: transition scores, layout as above.
        emission_params: emission scores, layout as above.

    Returns:
        (fwd, bkw, posterior): three lists with one dict per word, each
        mapping label -> value. ``posterior[i][label]`` is
        ``fwd[i][label] * bkw[i][label] / p`` where ``p`` is the total
        sentence score; it is 0.0 for every label when ``p`` is 0. All three
        lists are empty for an empty sentence.

    Raises:
        AssertionError: if the forward and backward totals disagree beyond
            floating-point tolerance.
    """
    import math

    start_st, end_st = 'START', 'STOP'
    states = list(labels)
    n = len(s)
    if n == 0:
        return [], [], []

    def trans(from_st, to_st):
        return transition_params.get(to_st, {}).get(from_st, 0)

    def emit(st, word):
        if word in emission_params:
            return emission_params[word].get(st, 0)
        return emission_params['#UNK#'][st]

    # forward part of the algorithm
    fwd = []
    for i, word in enumerate(s):
        f_curr = {}
        for st in states:
            if i == 0:
                prev_f_sum = trans(start_st, st)
            else:
                prev_f_sum = sum(fwd[-1][k] * trans(k, st) for k in states)
            f_curr[st] = emit(st, word) * prev_f_sum
        fwd.append(f_curr)
    p_fwd = sum(fwd[-1][k] * trans(k, end_st) for k in states)

    # backward part of the algorithm, filled from the last word to the first
    bkw = [None] * n
    bkw[n - 1] = {st: trans(st, end_st) for st in states}
    for i in range(n - 2, -1, -1):
        next_word = s[i + 1]
        bkw[i] = {
            st: sum(trans(st, l) * emit(l, next_word) * bkw[i + 1][l] for l in states)
            for st in states
        }
    p_bkw = sum(trans(start_st, l) * emit(l, s[0]) * bkw[0][l] for l in states)

    # Both totals are the same quantity computed in opposite directions, so
    # compare with a tolerance rather than exact float equality.
    assert math.isclose(p_fwd, p_bkw, rel_tol=1e-6, abs_tol=1e-300)

    # merging the two parts
    posterior = []
    for i in range(n):
        if p_fwd == 0:
            posterior.append({st: 0.0 for st in states})
        else:
            posterior.append({st: fwd[i][st] * bkw[i][st] / p_fwd for st in states})

    return fwd, bkw, posterior

def maxmin_topk2(test_file, output_file, transition_params, emission_params, labels):
    """Run ``fwd_bkw`` on every sentence of ``test_file``.

    The input is read as one word per line with blank lines between
    sentences. ``output_file`` is opened for writing, but this function never
    writes to it, and the value returned by ``fwd_bkw`` is assigned to
    ``nodes`` and then discarded. Returns None.

    NOTE(review): looks unfinished - presumably the results were meant to be
    decoded and written to ``output_file``; confirm the intended output.
    """
    sentences = []

    with open(test_file, encoding ='utf-8') as ifile, codecs.open(output_file, 'w', 'utf-8-sig') as ofile:
        sentence = []
        for line in ifile:
            if len(line.split())!=0:
                sentence.append(line.split()[0])
            else:
                # A blank line closes the current sentence; words after the
                # last blank line are never appended to `sentences`.
                sentences.append(sentence)
                sentence = []
        
        for s in sentences:
            nodes = fwd_bkw(s,labels,transition_params, emission_params)
# NOTE(review): EN_maxmin2 is not defined in any cell shown in this notebook;
# unless it is defined elsewhere this call raises NameError - confirm.
maxmin_topk2(EN_test, EN_maxmin2, transition_params_EN, emission_params_EN, label_count_EN)


In [28]:
#part 5
from collections import Counter
from operator import itemgetter
import copy

def remove_tag(training_file): # generate train_no_tag file
    """Write an unlabelled copy of ``training_file``.

    The copy is written to ``training_file + "_perceptron_100"``. For every
    input line with more than one whitespace-separated token only the first
    token is kept; every other line (blank or single-token) becomes a blank
    line.

    Both files are managed by a ``with`` block so they are closed even if an
    error occurs while copying.
    """
    new_file = training_file+"_perceptron_100"
    with open(training_file,"r",encoding="utf8") as f, \
            open(new_file,"w+",encoding="utf8") as f1:
        for line in f:
            fields = line.split()
            if len(fields)>1:
                f1.write(fields[0]+"\n")
            else:
                f1.write("\n")
    return

def para(training_file): # output a dictionary of emission_parameter
    """Count emission and transition features in a labelled file.

    Non-blank lines hold "word label"; blank lines separate sentences.

    Args:
        training_file: path of the labelled file (UTF-8).

    Returns:
        (e_para, t_para): two ``Counter`` objects.
        ``e_para["word label"]`` counts each distinct line, and
        ``e_para["new_word " + label]`` is set to 1 for every label seen.
        ``t_para["prev next"]`` counts label bigrams, using "start" before
        the first word of every sentence and "stop" after the last one.

    Notes:
        * The state is reset to "start" after every sentence, so each
          sentence contributes a "start <label>" feature - the key that
          test_phase_perceptron reads for the first word.
        * A sentence that reaches end-of-file without a trailing blank line
          still gets its "<label> stop" feature.
        * Blank lines that do not follow a word (leading or repeated blank
          lines) add no feature.
    """
    cnt_y = Counter() # count the y tags
    e_para = Counter() # count the y|x tags
    t_para = Counter()
    state = "start"
    with open(training_file, "r", encoding='utf8') as f:
        for line in f:
            line = line.rstrip('\n')
            if (len(line) == 0):
                if state != "start":
                    t_para[state + " stop"] += 1
                    state = "start"
            else:
                e_para[line] += 1
                label = line.split()[1]
                t_para[state + " " + label] += 1
                state = label
                cnt_y[label] += 1

    # File ended in the middle of a sentence: close it.
    if state != "start":
        t_para[state + " stop"] += 1

    for tag in cnt_y:
        e_para["new_word "+tag]=1

    return  e_para, t_para

def init_para(training_file):
    """Create zero-valued emission and transition weights for a labelled file.

    Walks the file the same way ``para`` does but stores 0 for every feature
    instead of counting it.

    Args:
        training_file: path of the labelled file (UTF-8).

    Returns:
        (e_para, t_para): two ``Counter`` objects whose keys are the
        "word label" lines (plus "new_word " + label for every label seen)
        and the "prev next" label bigrams, all mapped to 0.

    Notes:
        * The state is reset to "start" after every sentence, so the
          transition keys use "start <label>" for each sentence's first word.
        * A sentence that reaches end-of-file without a trailing blank line
          still gets its "<label> stop" key.
        * Blank lines that do not follow a word add no key.
    """
    cnt_y = Counter() # count the y tags
    e_para = Counter() # count the y|x tags
    t_para = Counter()
    state = "start"
    with open(training_file, "r", encoding='utf8') as f:
        for line in f:
            line = line.rstrip('\n')
            if (line == ""):
                if state != "start":
                    t_para[state + " stop"] = 0
                    state = "start"
            else:
                e_para[line] = 0
                label = line.split()[1]
                t_para[state + " " + label] = 0
                state = label
                cnt_y[label] = 0

    # File ended in the middle of a sentence: close it.
    if state != "start":
        t_para[state + " stop"] = 0

    for tag in cnt_y:
        e_para["new_word "+tag]=0

    return  e_para, t_para

def test_phase_perceptron(test_file,emission_parameter,transition_parameter,top_k=1, run = 1):
    tag = ["O","B-neutral","B-positive","B-negative","I-neutral","I-positive","I-negative"]
    step_k = 1
    v_label = "O"
    u_label = "start"
    optimal_value = -1
    pi = [step_k,v_label,optimal_value,u_label]
    sentence = []
    output = []
    list_u = [0,1,2,3,4,5,6]
    dict_u = {}
    state = (step_k, list_u)
    data_x = set()
    for entry in emission_parameter.keys():
        entry = entry.split(' ')[0]
        data_x.add(entry)


    file_dir = test_file[:-3]+str(int(test_file[-3:])+run)
    f = open(test_file, "r", encoding='utf8')
    f1 = open(file_dir, "w+", encoding='utf8')

    for line in f:
        line = line.rstrip()
        if line!="":
            for id_v,v in enumerate(tag):
                best = -1
                best_add = -1
                for id_u,u in enumerate(state[1]): #enumerate list_u stored in state
                    if(line in data_x):
                        b_word = line + " " + v
                        b_value = emission_parameter[b_word]
                    else:
                        b_word = "new_word " + v
                        b_value = emission_parameter[b_word]

                    if state[0] == 1:
                        u_label = "start"
                        uv_label = u_label+" " + v
                        a_uv_value = transition_parameter[uv_label]
                        optimal_value = a_uv_value+b_value

                    else:
                        u_label = u[1]
                        uv_label = u_label + " " + v
                        a_uv_value = transition_parameter[uv_label]
                        optimal_value =  u[2] + a_uv_value + b_value

                    if optimal_value>best:
                        best = optimal_value
                        pi = [step_k, v, best, u_label]

                dict_u[(step_k,v)] = pi[3]
                list_u[id_v] = pi

            sentence.insert(0,line)
            list_u2 = copy.deepcopy(list_u)
            step_k+=1
            state = (step_k,list_u2)

        else: # when it reaches the empty line
            best = -1
            for id_u,u in enumerate(state[1]):
                uv_label = u[1] + " stop"
                a_uv_value = transition_parameter[uv_label]
                optimal_value = u[2]+a_uv_value
                pi = [state[0],"stop",optimal_value,u[1]]
                list_u[id_u] = pi


            list_u = sorted(list_u,key=itemgetter(2),reverse=True)  # arrange by descending order to choose top_k
            u_label = list_u[top_k-1][3]
            output.append(sentence[0]+" "+u_label+"\n")

            for i in range(1,len(sentence)): # back tracking to find the y_labels
                search_key = (step_k-i,u_label)
                u_label = dict_u[search_key]
                output.insert(0,sentence[i]+" "+u_label+"\n")

            for i in output:
                f1.write(i)
            f1.write("\n")
            sentence = []
            output=[]
            dict_u={}
            step_k=1
            state = (step_k,[0,1,2,3,4,5,6])
    f1.close()
    return


import subprocess


def update_para1(training_file, n = 3):
    """Train perceptron weights from scratch for ``n`` passes.

    Each pass decodes the unlabelled copy of ``training_file`` with the
    current weights, counts the features of the prediction, and applies
    ``weights += gold_counts - predicted_counts``.

    Args:
        training_file: path of the labelled training file.
        n: number of passes.

    Returns:
        (e_para, t_para): emission and transition weight Counters.

    Notes:
        * Weights are updated with ``Counter.update`` / ``Counter.subtract``.
          The in-place ``+=`` operator of ``Counter`` discards every entry
          that is not positive, which would erase negative weights.
        * NOTE(review): the subprocess call only lists the working directory,
          although its output is printed under a "score for" heading; the
          evaluation command is still commented out below.
    """
    e_c1, t_c1 = para(training_file)          # gold feature counts
    e_para, t_para = init_para(training_file) # all weights start at 0

    remove_tag(training_file) # generate training file without tags
    blank_file = training_file+"_perceptron_100"
    for run_num in range(n):
        print("Running %d of %d:" %(run_num+1,n) )
        test_phase_perceptron(blank_file,e_para,t_para,run = run_num+1) # generates prediction file
        predicted_file = blank_file[:-3]+str(int(blank_file[-3:])+run_num+1)
        e_c2, t_c2 = para(predicted_file) # predicted feature counts

        # follow formula - e_para = e_para+c1-c2 (negative weights are kept)
        e_para.update(e_c1)
        e_para.subtract(e_c2)

        t_para.update(t_c1)
        t_para.subtract(t_c2)
        proc = subprocess.Popen(["ls"], stdout=subprocess.PIPE, shell=True)
        (out,err) = proc.communicate()
        print ("score for "+str(int(blank_file[-3:])+run_num+1))
        print (out)
        #"python", "evalResult.py","train", "train_perceptron_"+str(int(blank_file[-3:])+run_num+1)

    return e_para ,t_para

def update_para2(training_file,e_para,t_para, n = 3):
    """Continue perceptron training from existing weights for ``n`` passes.

    Each pass decodes the unlabelled copy of ``training_file`` with the
    current weights, counts the features of the prediction, and applies
    ``weights += gold_counts - predicted_counts``. ``e_para`` and ``t_para``
    are Counters and are updated in place.

    Args:
        training_file: path of the labelled training file.
        e_para: emission weights to continue from.
        t_para: transition weights to continue from.
        n: number of passes.

    Returns:
        (e_para, t_para): the same two Counter objects after the updates.

    Notes:
        Weights are updated with ``Counter.update`` / ``Counter.subtract``.
        The in-place ``+=`` operator of ``Counter`` discards every entry that
        is not positive, which would erase negative weights.
    """
    e_c1, t_c1 = para(training_file) # gold feature counts

    remove_tag(training_file) # generate training file without tags
    blank_file = training_file+"_perceptron_100"
    for run_num in range(n):
        print("Running %d of %d:" %(run_num+1,n) )
        test_phase_perceptron(blank_file,e_para,t_para,run = run_num+1)
        predicted_file = blank_file[:-3]+str(int(blank_file[-3:])+run_num+1)
        e_c2, t_c2 = para(predicted_file) # predicted feature counts

        # weights = weights + gold - predicted (negative weights are kept)
        e_para.update(e_c1)
        e_para.subtract(e_c2)

        t_para.update(t_c1)
        t_para.subtract(t_c2)

    return e_para ,t_para


import os
# Run a single perceptron pass (n = 1) on the English training file.
# update_para1 returns a (e_para, t_para) tuple, stored here as `parameter`.
training_file = "./EN/train"
# test_file = "./dev.in"
parameter = update_para1(training_file, n = 1)

Running 1 of 1:
score for 101
b'CN\nEN\nFR\nInstruction.txt\nML Proj Final.ipynb\nML Project Part 3.ipynb\nML Project Q2_FinalFinal.ipynb\nProject.pdf\nREADME.md\nSG\ndev.out\ndev.prediction\nevalResult.py\n'
