In [None]:
import os
import pickle
import nltk
from nltk.stem.porter import *
import string
import pandas as pd
import json
import re
import string
import demoji
import pickle

In [None]:
# NLTK's English stopword list, extended in place with a few extra tokens
# ("#", "<unk>" and the entries of `other_exclusions`).
stopwords = nltk.corpus.stopwords.words("english")
other_exclusions = ["#ff", "ff", "rt"]
stopwords += ["#", "<unk>"] + other_exclusions


def preprocess(text_string):
    """
    Normalise a text string in three steps, applied in this order:
    1) every run of whitespace is collapsed into a single space
    2) URLs are deleted
    3) @mentions are deleted

    URLs and mentions are removed outright (replaced with the empty string);
    no placeholder token is inserted. Because whitespace is collapsed *before*
    the removals, a URL or mention that sat between two spaces leaves both
    spaces behind in the result.

    Parameters
    ----------
    text_string : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Raw strings keep the regex escapes (\s, \(, \w, \-) intact without relying
    # on Python passing unknown string escapes through unchanged.
    space_pattern = r'\s+'
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                       r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    return parsed_text


def basic_tokenize(tweet):
    """
    Lower-case ``tweet``, replace every run of characters other than ASCII
    letters and ``. , ! ?`` with a single space, trim the ends, then pass the
    result through ``preprocess`` and ``extra_preprocess``.

    Returns the cleaned text as one string (not a list of tokens).
    """
    # NOTE(review): after this first step the text contains only a-z, ". , ! ?"
    # and spaces, so the URL, @mention, leading-"RT", special-unicode and emoji
    # removals performed by the two calls below can no longer match anything.
    # Confirm whether the cleaning was meant to run on the raw tweet instead.
    lowered = tweet.lower()
    fragments = re.split("[^a-zA-Z.,!?]+", lowered)
    condensed = " ".join(fragments).strip()
    return extra_preprocess(preprocess(condensed))

In [None]:
def removeUsernames(string):
    """
    Replace every ``@`` followed by one or more non-whitespace characters with
    the literal token ``@user``.

    Note that the match runs up to the next whitespace, so punctuation glued to
    a handle (e.g. ``@bob,``) is swallowed into the replacement.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    return re.sub(r'@[^\s]+', '@user', string)


def specialUnicodeRemover(string):
    """Delete every U+00E9 and U+00F2 (octal 362) character from ``string``."""
    unwanted_chars = re.compile(r"(\xe9|\362)")
    return unwanted_chars.sub("", string)


def punctuationRemover(tweet):
    """
    Strip every ASCII punctuation character except ``@`` from ``tweet``.

    The previous implementation passed a *list* as the third argument of
    ``str.maketrans``, which raises ``TypeError``; a bare ``except`` swallowed
    the error, so the text was always returned unchanged. The deletion table is
    now built from a string so punctuation is actually removed.

    Parameters
    ----------
    tweet : str
        Text to clean. Non-string values are returned untouched, preserving
        the earlier best-effort behaviour for unexpected inputs.

    Returns
    -------
    str
        ``tweet`` without the characters of ``string.punctuation`` other
        than ``@``.
    """
    if not isinstance(tweet, str):
        return tweet
    # '@' is kept so that mention markers survive this step.
    removable = string.punctuation.replace('@', '')
    return tweet.translate(str.maketrans('', '', removable))


def RTRemover(string):
    """
    Trim surrounding whitespace and, when the trimmed text begins with the
    upper-case marker ``RT``, drop those two characters.

    Whatever follows the marker (including a leading space) is kept as is.
    """
    trimmed = string.strip()
    return trimmed[2:] if trimmed.startswith('RT') else trimmed


def EmojiRemover(string):
    """Return ``string`` after ``demoji.replace`` has substituted "" for what it matches."""
    without_emoji = demoji.replace(string, "")
    return without_emoji


def DotRemover(string):
    """
    Remove dot runs from ``string``.

    If the text contains ``...`` every occurrence of ``...`` is deleted and
    nothing else; otherwise, if it contains ``..``, every ``..`` is deleted.
    Only the first applicable rule runs, so leftovers such as a single ``.``
    or a ``..`` next to a removed ``...`` can remain.
    """
    for dots in ('...', '..'):
        if dots in string:
            return string.replace(dots, '')
    return string


def extra_preprocess(string):
    """
    Run the additional cleaning steps over ``string`` in a fixed order:
    usernames, special unicode characters, punctuation, leading RT marker,
    emoji, and dot runs. Each step receives the previous step's output.
    """
    cleaning_steps = (
        removeUsernames,
        specialUnicodeRemover,
        punctuationRemover,
        RTRemover,
        EmojiRemover,
        DotRemover,
    )
    for step in cleaning_steps:
        string = step(string)
    return string

In [None]:
# Directory (relative to the working directory) that the f1..f6 helpers read
# their model-output files from and write their "*_for_ext_eval.pkl" files to.
folder = "ext_eval/"

In [None]:
## Prep pair for neutral
def f1():
    """
    Build the cleaned (ground, pred) pairs for the "neutral" model output.

    Loads ``neutral_455.pkl`` from ``folder``, reads its "input" and "pred"
    entries, cleans both sides of every pair with ``basic_tokenize`` and keeps
    only pairs where neither side ends up empty. The result is pickled to
    ``neutral_for_ext_eval.pkl`` in the same folder as
    ``{"ground": [...], "pred": [...]}``. Prints the pair count before and
    after filtering.
    """
    model_file = "neutral_455.pkl"
    with open(os.path.join(folder, model_file), "rb") as source:
        payload = pickle.load(source)
    inputs, outputs = payload["input"], payload["pred"]
    assert len(inputs) == len(outputs)
    print(len(inputs))

    kept_pairs = []
    for raw_ground, raw_pred in zip(inputs, outputs):
        cleaned = (basic_tokenize(raw_ground).strip(),
                   basic_tokenize(raw_pred).strip())
        # drop the pair as soon as either side is empty after cleaning
        if all(cleaned):
            kept_pairs.append(cleaned)
    ground_final = [pair[0] for pair in kept_pairs]
    pred_final = [pair[1] for pair in kept_pairs]
    print(len(ground_final))

    prefix = model_file.split("_")[0]
    destination = os.path.join(folder, prefix + "_for_ext_eval.pkl")
    with open(destination, "wb") as sink:
        pickle.dump({"ground": ground_final, "pred": pred_final}, sink)

In [None]:
## Prep pair for DRG
def f2():
    """
    Build the cleaned (ground, pred) pairs for the DRG model output.

    Predictions come from ``drgpreds_455.pkl`` and the matching ground texts
    from ``test_small_455.pkl`` (both in ``folder``); the two must have the
    same length. Every pair is cleaned with ``basic_tokenize`` and pairs with
    an empty side are skipped. The surviving pairs are pickled to
    ``drgpreds_for_ext_eval.pkl`` as ``{"ground": [...], "pred": [...]}``.
    Prints the number of predictions loaded and the number of pairs kept.
    """
    model_file = "drgpreds_455.pkl"
    with open(os.path.join(folder, model_file), "rb") as handle:
        predictions = pickle.load(handle)
    print(len(predictions))
    with open(os.path.join(folder, "test_small_455.pkl"), "rb") as handle:
        references = pickle.load(handle)
    assert len(predictions) == len(references)

    ground_final, pred_final = [], []
    for reference, prediction in zip(references, predictions):
        ref_clean = basic_tokenize(reference).strip()
        pred_clean = basic_tokenize(prediction).strip()
        if ref_clean and pred_clean:
            ground_final.append(ref_clean)
            pred_final.append(pred_clean)
    print(len(ground_final))

    target = os.path.join(folder, model_file.split("_")[0] + "_for_ext_eval.pkl")
    with open(target, "wb") as handle:
        pickle.dump({"ground": ground_final, "pred": pred_final}, handle)

In [None]:
## Prep pair for NTP
def f3():
    """
    Build the cleaned (ground, pred) pairs for the NTP model output.

    Predictions are loaded from ``ntpcares_454.pkl`` and ground texts from
    ``test_dev_454.pkl`` (both in ``folder``); equal length is asserted. Both
    sides of each pair go through ``basic_tokenize``; a pair is kept only when
    both cleaned strings are non-empty. Output is pickled to
    ``ntpcares_for_ext_eval.pkl`` as ``{"ground": [...], "pred": [...]}``.
    Prints the number of predictions loaded and the number of pairs kept.
    """
    model_file = "ntpcares_454.pkl"

    def _load(name):
        # read one pickle from the shared data folder
        with open(os.path.join(folder, name), "rb") as stream:
            return pickle.load(stream)

    model_preds = _load(model_file)
    print(len(model_preds))
    ground_texts = _load("test_dev_454.pkl")
    assert len(model_preds) == len(ground_texts)

    ground_final = []
    pred_final = []
    for truth, guess in zip(ground_texts, model_preds):
        truth = basic_tokenize(truth).strip()
        guess = basic_tokenize(guess).strip()
        if truth and guess:
            ground_final.append(truth)
            pred_final.append(guess)
    print(len(ground_final))

    result = {"ground": ground_final, "pred": pred_final}
    out_name = model_file.split("_")[0] + "_for_ext_eval.pkl"
    with open(os.path.join(folder, out_name), "wb") as stream:
        pickle.dump(result, stream)

In [None]:
## Prep pair for FGST
def f4():
    """
    Build the cleaned (ground, pred) pairs for the FGST model output.

    Loads ``fgst_910.pkl`` from ``folder``. The pickle is read as a mapping
    with an "input" entry (ground texts) and five prediction entries under the
    integer keys 1..5. For every row the ground text and all five predictions
    are cleaned with ``basic_tokenize``; rows whose ground text is empty are
    skipped, and for each key the pair is kept only if that key's prediction
    is non-empty.

    The result, pickled to ``fgst_for_ext_eval.pkl``, has the shape
    ``{1: {"ground": [...], "pred": [...]}, ..., 5: {...}}``.

    Prints the number of entries in the loaded mapping, its keys, and the
    number of pairs kept per key.

    Raises
    ------
    AssertionError
        If any prediction list differs in length from the ground list.
    """
    model_file = "fgst_910.pkl"
    pred_keys = (1, 2, 3, 4, 5)

    with open(os.path.join(folder, model_file), "rb") as f:
        # NOTE: pickle.load can execute arbitrary code; only use trusted files.
        org_file = pickle.load(f)
    print(len(org_file))
    print(org_file.keys())

    ground = org_file["input"]
    preds = {key: org_file[key] for key in pred_keys}
    # zip() stops at the shortest input, so a length mismatch would silently
    # drop rows; check up front like the other loaders in this notebook do.
    for key in pred_keys:
        assert len(preds[key]) == len(ground)

    out = {key: {"ground": [], "pred": []} for key in pred_keys}
    for row in zip(ground, *(preds[key] for key in pred_keys)):
        # clean the ground text first, then the predictions in key order
        g, *cleaned_preds = (basic_tokenize(text).strip() for text in row)
        if not g:
            continue
        for key, p in zip(pred_keys, cleaned_preds):
            if p:
                out[key]["ground"].append(g)
                out[key]["pred"].append(p)

    for key in pred_keys:
        print(len(out[key]["ground"]))

    model_name = model_file.split("_")[0]
    file = os.path.join(folder, model_name + "_for_ext_eval.pkl")
    with open(file, "wb") as f:
        pickle.dump(out, f)

In [None]:
## Prepare pairs for Style
def f5():
    """
    Build the cleaned (ground, pred) pairs for the Style model output.

    Loads ``style_910.pkl`` from ``folder`` and reads its 'input', 'pred_rev'
    and 'pred_raw' entries, which must all have the same length. Each row is
    cleaned with ``basic_tokenize``; rows with an empty ground text are
    skipped, and a pair is stored per prediction kind only when that
    prediction is non-empty.

    The pickle written to ``style_for_ext_eval.pkl`` maps 1 to the
    'pred_rev' pairs and 2 to the 'pred_raw' pairs, each as
    ``{"ground": [...], "pred": [...]}``. Prints the loaded keys, the row
    count, and the number of pairs kept for each kind.
    """
    model_file = "style_910.pkl"
    with open(os.path.join(folder, model_file), "rb") as source:
        payload = pickle.load(source)
    print(payload.keys())

    inputs = payload['input']
    rev_preds = payload['pred_rev']
    raw_preds = payload['pred_raw']
    assert len(inputs) == len(rev_preds) == len(raw_preds)
    print(len(inputs))

    collected = {1: {"ground": [], "pred": []},
                 2: {"ground": [], "pred": []}}
    for row in zip(inputs, rev_preds, raw_preds):
        g, p_rev, p_raw = [basic_tokenize(text).strip() for text in row]
        if not g:
            continue
        for slot, prediction in ((1, p_rev), (2, p_raw)):
            if prediction:
                collected[slot]["ground"].append(g)
                collected[slot]["pred"].append(prediction)

    for slot in (1, 2):
        assert len(collected[slot]["ground"]) == len(collected[slot]["pred"])
    for slot in (1, 2):
        print(len(collected[slot]["ground"]))

    prefix = model_file.split("_")[0]
    destination = os.path.join(folder, prefix + "_for_ext_eval.pkl")
    with open(destination, "wb") as sink:
        pickle.dump(collected, sink)

In [None]:
## Prep pairs for NACL
def f6():
    """
    Build the cleaned (ground, pred) pairs for the NACL model output.

    Reads ``nacl_pred.csv`` from ``folder`` and walks the 'Original_text' and
    'Pred1' columns together. Rows whose original text is exactly
    "Next Cross Validation" are skipped. The first two and last two characters
    of both cells are cut off (presumably a wrapper such as ``['...']`` --
    confirm against the CSV) before cleaning with ``basic_tokenize``. Pairs
    with an empty side are dropped.

    Writes ``{"ground": [...], "pred": [...]}`` to ``nacl_for_ext_eval.pkl``
    and prints the number of pairs kept.
    """
    model_file = "nacl_pred.csv"
    table = pd.read_csv(os.path.join(folder, model_file))

    ground_final, pred_final = [], []
    for original, predicted in zip(table['Original_text'], table['Pred1']):
        if original == "Next Cross Validation":
            continue
        original, predicted = original[2:-2], predicted[2:-2]
        original = basic_tokenize(original).strip()
        predicted = basic_tokenize(predicted).strip()
        if not (original and predicted):
            continue
        ground_final.append(original)
        pred_final.append(predicted)
    assert len(ground_final) == len(pred_final)
    print(len(ground_final))

    prefix = model_file.split("_")[0]
    destination = os.path.join(folder, prefix + "_for_ext_eval.pkl")
    with open(destination, "wb") as sink:
        pickle.dump({"ground": ground_final, "pred": pred_final}, sink)

In [None]:
def execute_():
    """Run the six preparation steps f1 through f6, one after another."""
    for prepare in (f1, f2, f3, f4, f5, f6):
        prepare()

In [None]:
# Run every preparation step (f1..f6) when this cell is executed.
execute_()