In [None]:
#IMPORT USED PACKAGES and set seed
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import *
import pickle
import random
from datetime import datetime
import pandas as pd
import csv
import pbs
import os
import sys
#import ...
# Seed both random number generators used by this notebook: the stdlib
# `random` module and NumPy's global generator (the embeddings are
# initialised with np.random.normal, which random.seed() does not affect).
random.seed(30)
np.random.seed(30)

In [None]:
## LOADING DATASETS BEFORE PREPROCESSING
positive_dataset = 'Datasets/train_pos_full.txt'
negative_dataset = 'Datasets/train_neg_full.txt'


# Read every tweet (one per line, trailing newline removed).
# The files are opened in `with` blocks so the handles are closed as soon as
# reading is finished instead of being left open for the kernel's lifetime.
with open(positive_dataset, encoding = 'UTF-8') as f:
    positive_tweets = [line.rstrip('\n') for line in f]
with open(negative_dataset, encoding = 'UTF-8') as f:
    negative_tweets = [line.rstrip('\n') for line in f]



In [None]:
##APPLYING DESIRED PREPROCESSING

#Define the desired preprocessing method in this function
def preprocess(tweet_data):
    """Preprocessing hook applied to each tweet collection.

    Currently a pass-through: the very object passed as ``tweet_data`` is
    returned unchanged. Replace the body to plug in a real preprocessing step.
    """
    preprocessed = tweet_data
    return preprocessed


positive_preprocessed = preprocess(positive_tweets)
negative_preprocessed = preprocess(negative_tweets)

# Timestamp shared by every file written during this run.
time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
if not os.path.exists("Processed_data"):
    os.makedirs("Processed_data")

# Write each preprocessed collection to its own timestamped file, one tweet per line.
for out_prefix, out_tweets in (('positive_preprocessed', positive_preprocessed),
                               ('negative_preprocessed', negative_preprocessed)):
    with open('Processed_data/' + out_prefix + time + '.txt', "w+", encoding = "UTF-8") as f:
        f.writelines("%s\n" % line for line in out_tweets)


In [None]:
##APPLY build_vocab.sh AND cutting vocab

#Choose the desired cutting parameter here (Tokens with >= cut_threshold occurrences are kept)
cut_threshold = 5


# File names handed to the vocabulary script: the two preprocessed tweet files
# of this run and the vocabulary file name.
arg1 = 'positive_preprocessed{}.txt'.format(time)
arg2 = 'negative_preprocessed{}.txt'.format(time)
arg3 = 'vocab_{}.txt'.format(time)
# NOTE(review): these names carry no 'Processed_data/' prefix although the
# files are written to and read from that directory elsewhere in this notebook;
# presumably build_vocab.sh resolves the directory itself. Please confirm.
vocab_succesful = os.system(" ".join(["build_vocab.sh", arg1, arg2, arg3]))

# Any non-zero result from os.system is treated as a failure and aborts the run.
if vocab_succesful != 0:
    sys.exit("Building vocabulary failed.")
    
def cut_and_save_vocab(file_in, file_out, threshold=None):
    """Keep the frequent tokens of a vocabulary file and write them to ``file_out``.

    ``file_in`` is parsed as whitespace-separated lines of the form
    ``<count> <token>``. Tokens whose count is at least ``threshold`` are
    written to ``file_out``, one per line, in their original order.

    Parameters
    ----------
    file_in : str
        Path of the full vocabulary file.
    file_out : str
        Exact path of the file to write. Nothing is appended to it: callers in
        this notebook already pass a name that contains the timestamp and the
        ``.txt`` extension.
    threshold : int, optional
        Minimum count for a token to be kept. Defaults to the module-level
        ``cut_threshold``.
    """
    if threshold is None:
        threshold = cut_threshold
    # The separator regex has a capturing group, so the separator itself lands
    # in column 1; column 0 is the count and column 2 the token.
    # keep_default_na=False stops tokens such as "null" or "NA" from being
    # parsed as missing values.
    full_vocab = pd.read_csv(file_in, sep=r"(\s+)", header=None, engine='python',
                             keep_default_na=False)
    kept_tokens = full_vocab.loc[full_vocab[0] >= threshold, 2]
    # Written with the platform default encoding, matching how the cut
    # vocabulary is opened again later in this notebook.
    with open(file_out, 'w+') as f:
        # One token per line, without the column padding of Series.to_string();
        # an empty selection yields an empty file.
        f.writelines("%s\n" % token for token in kept_tokens)
    
# Apply the frequency cut to this run's vocabulary file.
full_vocab_file = 'Processed_data/vocab_' + time + '.txt'
cut_vocab_file = 'Processed_data/vocab_cut' + time + '.txt'
cut_and_save_vocab(full_vocab_file, cut_vocab_file)
    

In [None]:
##DUMP THE BUILT VOCABULARY TO A PICKLE FILE
# Map every token of the cut vocabulary to its 0-based line number.
with open('Processed_data/vocab_cut' + time + '.txt') as f:
    vocab = {line.strip(): idx for idx, line in enumerate(f)}

# Persist the token -> index mapping for the following cells.
with open('Processed_data/vocab_' + time + '.pkl', 'wb') as f:
    pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)

In [None]:
##CREATE A CO-OCCURRENCE MATRIX
def create_cooc(vocab_file, negative_file, positive_file, output_file):
    """Build the token co-occurrence matrix of two tweet files and pickle it.

    Every line of ``negative_file`` and ``positive_file`` is split on
    whitespace; tokens missing from the vocabulary are dropped. Each ordered
    pair of remaining tokens on a line (a token paired with itself included)
    adds 1 to the matching matrix cell.

    Parameters
    ----------
    vocab_file : str
        Pickle holding a ``dict`` that maps tokens to integer indices.
    negative_file, positive_file : str
        UTF-8 text files with one tweet per line.
    output_file : str
        Path where the resulting ``coo_matrix`` is pickled.

    Notes
    -----
    The matrix is always square with one row/column per vocabulary index.
    Without an explicit shape, scipy infers it from the largest index actually
    seen, so vocabulary entries that never occur in the tweets would have no
    row and later embedding lookups for them would fail.
    """
    # The vocabulary pickle is produced by this notebook; do not point this at
    # an untrusted file, since unpickling can execute arbitrary code.
    with open(vocab_file, 'rb') as f:
        vocab = pickle.load(f)
    # Size by the largest index rather than len(vocab) so that a vocabulary
    # with gaps in its numbering still fits.
    vocab_size = max(vocab.values(), default=-1) + 1
    shape = (vocab_size, vocab_size)
    data, row, col = [], [], []
    counter = 1
    for fn in [negative_file, positive_file]:
        with open(fn, encoding="utf8") as f:
            for line in f:
                tokens = [vocab.get(t, -1) for t in line.strip().split()]
                tokens = [t for t in tokens if t >= 0]
                n_tokens = len(tokens)
                for t in tokens:
                    # All pairs (t, t2) for t2 in tokens, each counted once.
                    data.extend([1] * n_tokens)
                    row.extend([t] * n_tokens)
                    col.extend(tokens)

                if counter % 10000 == 0:
                    print(counter)
                counter += 1

                # Periodically merge duplicate cells to keep the lists small.
                if counter % 200000 == 0:
                    print(len(data))
                    cooc = coo_matrix((data, (row, col)), shape=shape)
                    print("summing duplicates (this can take a while)")
                    cooc.sum_duplicates()
                    data = list(cooc.data)
                    row = list(cooc.row)
                    col = list(cooc.col)
                    print(len(data))

    print(len(data))
    cooc = coo_matrix((data, (row, col)), shape=shape)
    print("summing duplicates (this can take a while)")
    cooc.sum_duplicates()

    with open(output_file, 'wb') as f:
        pickle.dump(cooc, f, pickle.HIGHEST_PROTOCOL)
        
#Used datasets should be defined on the second code block
# Build the co-occurrence matrix from the tweet files named in the second code block.
cooc_inputs = dict(
    vocab_file='Processed_data/vocab_' + time + '.pkl',
    negative_file=negative_dataset,
    positive_file=positive_dataset,
    output_file='Processed_data/cooc_pickle' + time + '.pkl',
)
create_cooc(**cooc_inputs)

In [None]:
## APPLY glove.py
random.seed(30)
# glove() draws its initial embeddings from NumPy's global generator, which
# random.seed() does not touch; seed it too so the embeddings are reproducible.
np.random.seed(30)
def glove(cooc_pickle, output_file, nmax=100, embedding_dim=20, eta=0.001,
          alpha=3 / 4, epochs=10):
    """Train word embeddings on a pickled co-occurrence matrix and save them.

    For every stored cell ``(i, j, n)`` of the matrix, one stochastic gradient
    step is taken on the weighted squared error
    ``f(n) * (log(n) - xs[i] . ys[j]) ** 2`` with
    ``f(n) = min(1, (n / nmax) ** alpha)``.

    Parameters
    ----------
    cooc_pickle : str
        Path of a pickled scipy COO matrix (must expose ``row``, ``col``, ``data``).
    output_file : str
        Path handed to ``np.save``; only the row embeddings ``xs`` are stored.
    nmax : float, optional
        Count at which the weighting function saturates at 1.
    embedding_dim : int, optional
        Number of embedding dimensions.
    eta : float, optional
        Learning rate.
    alpha : float, optional
        Exponent of the weighting function.
    epochs : int, optional
        Number of passes over the stored cells.

    Notes
    -----
    The embeddings are initialised from NumPy's global random generator, so
    call ``np.random.seed`` beforehand for reproducible results.
    """
    print("loading cooccurrence matrix")
    # The matrix pickle is produced by this notebook; never load an untrusted
    # pickle, as unpickling can execute arbitrary code.
    with open(cooc_pickle, 'rb') as f:
        cooc = pickle.load(f)
    print("{} nonzero entries".format(cooc.nnz))

    print("using nmax =", nmax, ", cooc.max() =", cooc.max())

    print("initializing embeddings")
    xs = np.random.normal(size=(cooc.shape[0], embedding_dim))
    ys = np.random.normal(size=(cooc.shape[1], embedding_dim))

    # log(n) and f(n) depend only on the counts, so compute them once instead
    # of once per cell and epoch.
    log_counts = np.log(cooc.data)
    weights = np.minimum(1.0, (cooc.data / nmax) ** alpha)

    for epoch in range(epochs):
        print("epoch {}".format(epoch))
        for ix, jy, logn, fn in zip(cooc.row, cooc.col, log_counts, weights):
            # Copy the row: a plain slice is a view, so updating xs[ix] in
            # place would otherwise leak the new value into the ys update.
            x = xs[ix, :].copy()
            y = ys[jy, :]
            scale = 2 * eta * fn * (logn - np.dot(x, y))
            xs[ix, :] += scale * y
            ys[jy, :] += scale * x
    np.save(output_file, xs)

# Train embeddings on this run's co-occurrence matrix and store them as .npy.
cooc_pickle_path = 'Processed_data/cooc_pickle' + time + '.pkl'
embeddings_path = 'Processed_data/embeddings' + time + '.npy'
glove(cooc_pickle_path, embeddings_path)


In [None]:
##PREPARE DATA FOR TRAINING A CLASSIFIER
def load_train_data(data_path):
    """Read a UTF-8 tweet file into a one-column DataFrame.

    Parameters
    ----------
    data_path : str
        Path of a text file with one tweet per line.

    Returns
    -------
    pandas.DataFrame
        A frame with a single ``Tweets`` column holding the lines exactly as
        returned by ``readlines()`` (trailing newlines included).
    """
    # `with` closes the file once the lines are read.
    with open(data_path, encoding="utf-8") as f:
        lines = f.readlines()
    return pd.DataFrame(lines, columns=['Tweets'])

def representation(tweet, we, vocab):
    """Return the mean embedding of the tweet's known words.

    Parameters
    ----------
    tweet : str
        Tweet text; it is split on whitespace.
    we : numpy.ndarray
        Embedding matrix with one row per vocabulary index.
    vocab : dict
        Maps a word to its row index in ``we``.

    Returns
    -------
    numpy.ndarray
        Average of the embedding rows of all words that are in ``vocab`` and
        whose index has a row in ``we``. If the tweet has no such word, a zero
        vector of the embedding dimension is returned, so every tweet yields a
        vector of the same shape and no division by zero occurs.
    """
    acc = np.zeros(np.shape(we)[1:], dtype=float)
    n_used = 0
    n_rows = len(we)
    for word in tweet.split():
        idx = vocab.get(word)
        # Ignore out-of-vocabulary words and vocabulary entries without an
        # embedding row (the embedding matrix can be shorter than the vocabulary).
        if idx is None or not (0 <= idx < n_rows):
            continue
        acc += we[idx]
        n_used += 1
    if n_used == 0:
        return acc
    return acc / n_used


def create_train_data(positive_path, negative_path, vocab, we):
    """Assemble the labelled training frame from the two tweet files.

    Tweets from ``positive_path`` are labelled ``1`` and tweets from
    ``negative_path`` are labelled ``-1``. The returned frame has a fresh
    0-based index and two columns: ``y`` (the label) and ``w`` (the value
    ``representation`` returns for the tweet).
    """
    labelled_frames = []
    for path, label in ((positive_path, 1), (negative_path, -1)):
        frame = load_train_data(path)
        frame["y"] = label
        labelled_frames.append(frame)
    train = pd.concat(labelled_frames, ignore_index=True)
    train["w"] = train["Tweets"].apply(lambda x: representation(x, we, vocab))
    return train.drop(columns=["Tweets"])

#The names of the datasets are defined in the second code block
# Reload the vocabulary pickled earlier in this run; this is the same path the
# vocabulary cell writes and the co-occurrence cell reads.
vocab_path = 'Processed_data/vocab_' + time + '.pkl'
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)
we = np.load('Processed_data/embeddings' + time + '.npy')
# Keep the result: the classifier cell reads the training frame as `train_full`.
train_full = create_train_data(positive_dataset, negative_dataset, vocab, we)

In [None]:
##TRAIN A CLASSIFIER
from sklearn.linear_model import SGDClassifier
# Each cell of train_full["w"] holds one embedding vector. Stack them into a
# 2-D (n_tweets, embedding_dim) array: scikit-learn cannot fit on a DataFrame
# column whose cells are arrays.
X = np.vstack(train_full["w"].to_numpy())
y = train_full["y"]
clf = SGDClassifier()
clf.fit(X, y)


In [None]:
##LOAD THE TEST DATA AND USE THE MODEL TO PREDICT THE SENTIMENTS

def load_and_prepare_test_data(data_path, vocab, we):
    """Load the test tweets and turn them into a feature frame.

    Each non-blank line of ``data_path`` is expected to look like
    ``<id>,<tweet>``; it is split at the first comma only, so commas inside
    the tweet are preserved.

    Parameters
    ----------
    data_path : str
        Path of the UTF-8 test file.
    vocab : dict
        Maps a word to its row index in ``we``.
    we : numpy.ndarray
        Embedding matrix.

    Returns
    -------
    pandas.DataFrame
        Indexed by tweet id (as read, i.e. strings), with one column per
        component of the vector ``representation`` returns for the tweet.
    """
    with open(data_path, encoding='utf-8') as f:
        lines = f.readlines()
    ids = []
    tweets = []
    for line in lines:
        # Skip blank lines (e.g. a trailing empty line), which have no id.
        if not line.strip():
            continue
        tweet_id, tweet = line.split(',', 1)
        ids.append(tweet_id)
        tweets.append(tweet)
    # One row per tweet, one column per embedding component.
    vectors = [representation(tweet, we, vocab) for tweet in tweets]
    return pd.DataFrame(vectors, index=ids)

# The loader defined in this cell is named load_and_prepare_test_data.
test = load_and_prepare_test_data('Datasets/test_data.txt', vocab, we)
test["p"] = clf.predict(test)
