# Ejecución del modelo Bag of Words

In [3]:
!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [None]:
# Download spaCy's small English model package.
# NOTE(review): no cell visible in this notebook loads en_core_web_sm (the tokenizer
# is built from English()), so this download may be unnecessary — confirm before removing.
!python -m spacy download en_core_web_sm

In [4]:
import jsonlines
import pandas as pd

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import RFE
import numpy as np
from scipy.sparse import csr_matrix
import spacy
from spacy.lang.en import English

In [12]:
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
# Colab-only: mount Google Drive at /content/drive so data_path below is readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [35]:
# Folder on the mounted Drive that holds the three JSON Lines split files.
data_path = '/content/drive/MyDrive/prueba_tecnica_bancolombia/Prueba practica/2020_acl_diplomacy/data/'


def _read_split(filename):
    """Read every record of one JSON Lines file located under data_path into a list."""
    with jsonlines.open(data_path+filename, 'r') as reader:
        return list(reader)


train = _read_split('train.jsonl')
valid = _read_split('validation.jsonl')
test = _read_split('test.jsonl')

In [24]:
# Peek at ten stop words and confirm the container type.
# STOP_WORDS is a set, so which ten entries are shown is not stable across runs.
list(STOP_WORDS)[:10], type(STOP_WORDS)

(['otherwise',
  'everywhere',
  'made',
  'whole',
  'last',
  'his',
  'please',
  'and',
  'since',
  'seems'],
 set)

In [33]:
def _append_feature_columns(bow, extra_rows):
    """Append per-message extra feature columns to a sparse bag-of-words matrix.

    Args:
        bow: sparse count matrix with one row per message.
        extra_rows: list with one entry per message; each entry is the list of
            extra feature values for that message (empty lists when no extra
            features are enabled).

    Returns:
        A float64 CSR matrix holding the columns of ``bow`` followed by the
        columns of ``extra_rows``.

    Raises:
        ValueError: if the two inputs do not have the same number of rows.
    """
    # Local import: the notebook's import cell only brings in csr_matrix.
    from scipy.sparse import hstack

    if len(extra_rows) != bow.shape[0]:
        raise ValueError(
            "feature/label row mismatch: %d vectorized messages vs %d label rows"
            % (bow.shape[0], len(extra_rows))
        )

    bow = csr_matrix(bow).astype(np.float64)
    extra = np.asarray(extra_rows, dtype=np.float64)
    if extra.size == 0:
        # No extra columns to add (every row is an empty list).
        return bow
    # Stack sparsely: the vocabulary-sized matrix is never materialised densely.
    return hstack([bow, csr_matrix(extra)], format='csr')


def log_reg(train, test):
    """Fit a bag-of-words logistic regression on ``train`` and report on ``test``.

    Messages are flattened with ``aggregate``, lower-cased and vectorized with a
    ``CountVectorizer`` that uses ``spacy_tokenizer`` and spaCy's stop words.
    Labels (and the optional power columns) come from ``convert_to_binary`` /
    ``split_xy``; the class label is 0 when the chosen annotation equals False
    and 1 otherwise.  The module-level ``TASK`` selects which annotation is
    predicted.

    Args:
        train: list of dialog records used to fit the vocabulary and the model.
        test: list of dialog records used for evaluation only.

    Returns:
        None. The sizes of both label sets and a ``classification_report``
        (3 digits) are printed.

    Raises:
        ValueError: if ``TASK`` is neither "SENDER" nor "RECEIVER", or if the
            number of vectorized messages differs from the number of label rows.
    """
    if TASK not in ("SENDER", "RECEIVER"):
        raise ValueError('TASK must be "SENDER" or "RECEIVER", got %r' % (TASK,))

    train_messages = aggregate(train)
    test_messages = aggregate(test)

    def _corpus(messages):
        # For receivers, drop all messages with a missing annotation.
        if TASK == "RECEIVER":
            messages = [m for m in messages if m['receiver_annotation'] != "NOANNOTATION"]
        return [m['message'].lower() for m in messages]

    # Learn the vocabulary on the training messages only ...
    vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, stop_words=list(STOP_WORDS), strip_accents = 'unicode')
    X_train = vectorizer.fit_transform(_corpus(train_messages))
    # ... and reuse the fitted vectorizer so test counts are based on the train vocab.
    X_test = vectorizer.transform(_corpus(test_messages))

    # Only used for getting lie/not lie labels (plus the optional power columns).
    train_rows = convert_to_binary(train_messages)
    print("len(train): ", len(train_rows))
    # Validation set not used for consistency with neural.
    test_rows = convert_to_binary(test_messages)
    print("len(test): ", len(test_rows))
    train_extra, train_labels = split_xy(train_rows)
    test_extra, test_labels = split_xy(test_rows)

    # Append power to the matrix.
    # Note: scaling a raw (non-binary) power value with StandardScaler was tried
    # and was worse than the binary feature, so it is not used.
    X_train = _append_feature_columns(X_train, train_extra)
    X_test = _append_feature_columns(X_test, test_extra)

    # Balanced class weight is important, since otherwise it will only learn majority class.
    # Note: an RFE wrapper (n_features_to_select=1000, step=100) was tried; it was
    # worse than plain logistic regression and slow, so it is not used.
    logmodel = LogisticRegression(class_weight = 'balanced', max_iter=1000)
    logmodel.fit(X_train, train_labels)
    predictions = logmodel.predict(X_test)

    # To inspect the most indicative words, compare logmodel.coef_[0] against the
    # vectorizer's feature names (the power columns, if any, come after them).
    print(classification_report(test_labels,predictions, digits=3))

In [14]:
def is_number(tok):
    """Tell whether ``tok`` can be parsed by ``float``.

    Returns True when ``float(tok)`` succeeds and False when it raises
    ``ValueError``; any other exception propagates to the caller.
    """
    try:
        float(tok)
    except ValueError:
        return False
    else:
        return True

def spacy_tokenizer(text):
    """Tokenize ``text`` with the module-level ``nlp`` object.

    Returns a list of token strings in which every token accepted by
    ``is_number`` is replaced by the placeholder '_NUM_'.
    """
    tokens = []
    for tok in nlp(text):
        if is_number(tok.text):
            tokens.append('_NUM_')
        else:
            tokens.append(tok.text)
    return tokens



def aggregate(dataset):
    """Flatten a list of dialog records into a single list of per-message dicts.

    Each dialog contributes its 'messages', 'receiver_labels', 'sender_labels'
    and 'game_score_delta' lists; entries at the same position are merged into
    one dict with keys 'message', 'sender_annotation', 'receiver_annotation'
    and 'score_delta' (the score delta converted with ``int``).
    """
    texts, receiver_flags, sender_flags, deltas = [], [], [], []
    for dialog in dataset:
        texts += dialog['messages']
        receiver_flags += dialog['receiver_labels']
        sender_flags += dialog['sender_labels']
        # ONLY FOR POWER VERSION
        deltas += dialog['game_score_delta']

    return [
        {
            'message': text,
            'sender_annotation': sender_flags[pos],
            'receiver_annotation': receiver_flags[pos],
            'score_delta': int(deltas[pos]),
        }
        for pos, text in enumerate(texts)
    ]

In [15]:
def convert_to_binary(dataset):
    """Turn per-message dicts into rows of 0/1 values ending with the class label.

    Reads the module-level ``TASK`` and ``POWER`` settings:
      * when TASK is "RECEIVER", messages whose 'receiver_annotation' is neither
        True nor False are dropped; when TASK is "SENDER" every message is kept;
      * when POWER is "y", two indicator columns are emitted first: score delta
        above 4, then score delta below -4;
      * the last value of each row is 0 if the task's annotation equals False,
        otherwise 1.

    Returns the list of rows.
    """
    # Pick the annotation that serves as the class label for the active task.
    if TASK == "SENDER":
        label_key = 'sender_annotation'
    elif TASK == "RECEIVER":
        label_key = 'receiver_annotation'

    rows = []
    for message in dataset:
        # drop the instances that were not annotated (receiver task only)
        if message['receiver_annotation'] not in (True, False) and TASK == "RECEIVER":
            continue

        row = []

        # a severe power skew (a difference of 5 or more supply centers) has the best result
        if POWER == "y":
            row.append(1 if message['score_delta'] > 4 else 0)
            row.append(1 if message['score_delta'] < -4 else 0)

        # add class label
        row.append(0 if message[label_key] == False else 1)

        rows.append(row)
    return rows

In [16]:
def split_xy(data):
    """Separate each row into its leading feature values and its final label.

    Returns a tuple ``(features, labels)`` where ``features[i]`` is row ``i``
    without its last element and ``labels[i]`` is that last element.
    """
    features = []
    labels = []
    for row in data:
        head, last = row[:-1], row[-1]
        features.append(head)
        labels.append(last)
    return features, labels

In [28]:
# Which annotation is predicted; log_reg and convert_to_binary compare this
# against "SENDER" and "RECEIVER".
TASK = "SENDER"
# convert_to_binary adds the two power-skew indicator columns only when this equals "y".
POWER = "N"
# spaCy English language object; spacy_tokenizer calls it to split each message into tokens.
nlp = English()


In [34]:
# Fit on the training split and print the report for the test split
# (the validation split loaded above is not passed in).
log_reg(train, test)



len(train):  13132
len(test):  2741
              precision    recall  f1-score   support

           0      0.149     0.242     0.185       240
           1      0.923     0.868     0.895      2501

    accuracy                          0.813      2741
   macro avg      0.536     0.555     0.540      2741
weighted avg      0.855     0.813     0.832      2741

