In [1]:
#IMPORT USED PACKAGES AND SET SEED
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import *
import pickle
import random
from datetime import datetime
import pandas as pd
import csv
import pbs
import os
import sys
from multiprocessing import Pool
random.seed(123)

In [2]:
## LOADING DATASETS BEFORE PREPROCESSING

#Define the used datasets here
positive_dataset = 'Datasets/train_pos.txt'
negative_dataset = 'Datasets/train_neg.txt'

def load_in_pd(data_path):
    with open(data_path, encoding="utf-8") as f :
        x = f.readlines()
        x = pd.DataFrame(x,columns=['Tweets'])
    return (x)

positive_pd = load_in_pd(positive_dataset)
negative_pd = load_in_pd(negative_dataset)

In [3]:
### APPLYING DESIRED PREPROCESSING

#Define the desired preprocessing method in this function
def preprocess(tweet_data):
    #tweet_data.drop_duplicates(inplace=True)
    return tweet_data

positive_preprocessed = preprocess(positive_pd)
negative_preprocessed = preprocess(negative_pd)

time = datetime.strftime(datetime.now(), "%Y_%m_%d_%H_%M_%S")
if not os.path.exists("Processed_data"):
    os.makedirs("Processed_data")
    
with open('Processed_data/' + 'positive_preprocessed' + time + '.txt', "w+", encoding = "UTF-8") as f:
    for tweet in positive_preprocessed["Tweets"]:
        f.write("%s" % tweet)
        
with open('Processed_data/' + 'negative_preprocessed' + time + '.txt', "w+", encoding = "UTF-8") as f:
    for tweet in negative_preprocessed["Tweets"]:
        f.write("%s" % tweet)

In [4]:
##APPLY build_vocab.sh AND CUT THE VOCABULARY USING THE CHOSEN THRESHOLD

#Choose the desired cutting parameter here (Tokens with >= cut_threshold occurrences are kept)
cut_threshold = 5


arg1 = 'positive_preprocessed' + time + '.txt'
arg2 = 'negative_preprocessed' + time + '.txt'
arg3 = 'vocab_' + time + '.txt'
vocab_successful = os.system("build_vocab.sh " + arg1 + " " + arg2 + " " + arg3)

if (vocab_successful != 0):
    sys.exit("Building vocabulary failed.")
    
def cut_and_save_vocab(file_in, file_out):
    full_vocab = pd.read_csv(file_in, sep = "(\s+)", header=None, engine = 'python')
    cutted_vocab = full_vocab[full_vocab[0] >= cut_threshold][2]
    with open(file_out, 'w+') as f:
        f.write(cutted_vocab.to_string(header = False, index = False))
    
cut_and_save_vocab('Processed_data/vocab_' + time + '.txt', 'Processed_data/vocab_cut' + time + '.txt')
    

In [5]:
##DUMP THE BUILT VOCABULARY TO A PICKLE FILE
vocab = dict()
with open('Processed_data/vocab_cut' + time + '.txt') as f:
    for idx, line in enumerate(f):
        vocab[line.strip()] = idx

with open('Processed_data/vocab_' + time + '.pkl', 'wb') as f:
    pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)

In [6]:
##CREATE A CO-OCCURRENCE MATRIX
def create_cooc(vocab_file, negative_file, positive_file, output_file):
    with open(vocab_file, 'rb') as f:
        vocab = pickle.load(f)
    vocab_size = len(vocab)
    data, row, col = [], [], []
    counter = 1
    for fn in [negative_file, positive_file]:
        with open(fn,encoding="utf8") as f:
            for line in f:
                tokens = [vocab.get(t, -1) for t in line.strip().split()]
                tokens = [t for t in tokens if t >= 0]
                for t in tokens:
                    for t2 in tokens:
                        data.append(1)
                        row.append(t)
                        col.append(t2)

                if counter % 10000 == 0:
                    print(counter)
                counter += 1

                if counter % 200000 == 0:
                    print(len(data))
                    cooc = coo_matrix((data, (row, col)))
                    print("summing duplicates (this can take a while)")
                    cooc.sum_duplicates()
                    data=list(cooc.data)
                    row=list(cooc.row)
                    col=list(cooc.col)
                    print(len(data))

    print(len(data))
    cooc = coo_matrix((data, (row, col)))
    print("summing duplicates (this can take a while)")
    cooc.sum_duplicates()
    
    with open(output_file, 'wb') as f:
        pickle.dump(cooc, f, pickle.HIGHEST_PROTOCOL)
        
#Used datasets should be defined on the second code block
create_cooc('Processed_data/vocab_' + time + '.pkl', 'Processed_data/negative_preprocessed' + time + '.txt', 'Processed_data/positive_preprocessed' + time + '.txt',
            'Processed_data/cooc_pickle' + time + '.pkl')

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
55288893
summing duplicates (this can take a while)
6496177
200000
6497138
summing duplicates (this can take a while)


In [7]:
## APPLY glove.py
random.seed(123)
def glove(cooc_pickle, output_file):
    print("loading cooccurrence matrix")
    with open(cooc_pickle, 'rb') as f:
        cooc = pickle.load(f)
    print("{} nonzero entries".format(cooc.nnz))

    nmax = 100
    print("using nmax =", nmax, ", cooc.max() =", cooc.max())

    print("initializing embeddings")
    embedding_dim = 20
    xs = np.random.normal(size=(cooc.shape[0], embedding_dim))
    ys = np.random.normal(size=(cooc.shape[1], embedding_dim))

    eta = 0.001
    alpha = 3 / 4

    epochs = 10

    for epoch in range(epochs):
        print("epoch {}".format(epoch))
        for ix, jy, n in zip(cooc.row, cooc.col, cooc.data):
            logn = np.log(n)
            fn = min(1.0, (n / nmax) ** alpha)
            x, y = xs[ix, :], ys[jy, :]
            scale = 2 * eta * fn * (logn - np.dot(x, y))
            xs[ix, :] += scale * y
            ys[jy, :] += scale * x
    np.save(output_file, xs)

glove('Processed_data/cooc_pickle' + time + '.pkl', 'Processed_data/embeddings' + time + '.npy')

loading cooccurrence matrix
6496259 nonzero entries
using nmax = 100 , cooc.max() = 207302
initializing embeddings
epoch 0
epoch 1
epoch 2
epoch 3
epoch 4
epoch 5
epoch 6
epoch 7
epoch 8
epoch 9


In [8]:
##### PREPARE DATA FOR TRAINING A CLASSIFIER

def load_train_data(data_path):
    """Loads data and returns ids (event ids) and X (feature vector)"""
    f = open(data_path, encoding="utf-8")
    x = f.readlines()
    x = pd.DataFrame(x, columns=['Tweets'])
    return x

def representation(tweet, we, vocab):
    acc = np.array(0)
    n_ignored_word = 0
    for word in tweet.split():
        if word not in vocab.keys():
            n_ignored_word += 1
        else:
            try:
                acc = np.add(acc,we[vocab[word]])
            except: 
                #print("problem with " + word) #last word from vocab is missing in cooc
                n_ignored_word += 1
    n = len(tweet.split()) - n_ignored_word
    acc = acc / n
    return(acc)


def create_train_data(positive_path, negative_path, vocab, we):
    pos = load_train_data(positive_path)
    neg = load_train_data(negative_path)
    pos["y"] = 1
    neg["y"] = -1
    train = pd.concat([pos, neg])
    train.reset_index(drop = True, inplace = True)
    train["w"] = train["Tweets"].apply(lambda x: representation(x, we, vocab))
    train.drop("Tweets", axis=1, inplace = True)
    
    col = ["w" + str(k) for k in range(np.shape(we)[1])]
    train[col] = train["w"].apply(pd.Series)
    train.drop("w",axis=1,inplace=True)
    
    #remove the tweets which do not have any words used more than 5 times in the training dataset
    train.dropna(inplace=True) 
    
    return train

with open('Processed_data/vocab_' + time + '.pkl', 'rb') as f:
    vocab = pickle.load(f)
we = np.load('Processed_data/embeddings' + time + '.npy')
#The names of the datasets are defined in the second code block
train = create_train_data(positive_dataset, negative_dataset, vocab, we)

train

Unnamed: 0,y,w0,w1,w2,w3,w4,w5,w6,w7,w8,...,w10,w11,w12,w13,w14,w15,w16,w17,w18,w19
0,1,-0.219749,0.745540,0.041948,-0.099107,-0.665878,0.359475,0.517536,-0.460393,0.669874,...,-0.970730,-0.016734,-0.228777,-0.398726,-0.647413,-0.863147,0.863024,-1.209167e-01,0.264795,0.286755
1,1,-0.232397,0.850126,-0.037780,0.066273,-0.651975,0.188648,0.418423,-0.463095,0.676643,...,-0.768517,-0.029335,-0.202768,-0.266039,-0.397046,-0.868932,0.954230,-1.418888e-01,0.315455,0.378468
2,1,-0.196106,0.772137,0.079920,-0.029689,-0.568763,0.388558,0.736844,-0.390120,0.842148,...,-0.768206,0.210843,-0.268442,-0.497167,-0.581499,-1.010879,1.005777,-2.150340e-02,0.430640,0.456720
3,1,-0.383225,0.748733,-0.060198,-0.186418,-0.609153,0.211827,0.437397,-0.214325,0.671407,...,-0.721701,-0.068031,-0.292208,-0.296721,-0.561167,-0.742961,0.776625,-5.845626e-02,0.295820,0.140495
4,1,-0.380697,0.744709,-0.158541,0.050900,-0.395674,0.376496,0.563328,-0.106119,1.026240,...,-0.715954,-0.304611,-0.364311,-0.318147,-0.473483,-0.850380,0.720548,-3.077979e-01,0.300544,0.151082
5,1,-0.224387,0.797913,-0.039702,-0.063968,-0.815195,0.278249,0.537060,-0.489618,0.899919,...,-0.972511,-0.145349,-0.313597,-0.479365,-0.736033,-1.020937,0.980624,-3.996038e-02,0.259327,0.328890
6,1,-0.233821,0.725418,0.022844,-0.001590,-0.422095,0.204225,0.437934,-0.515468,0.757393,...,-0.759927,-0.094563,-0.068202,-0.320739,-0.601955,-0.913937,0.780618,-5.914505e-02,0.088157,0.355329
7,1,-0.290973,0.569031,-0.111407,-0.027402,-0.800628,-0.207526,0.311777,-0.328260,0.434084,...,-0.556016,-0.001879,-0.172092,-0.189740,-0.744734,-0.622042,0.625976,-6.882269e-02,0.375339,0.071695
8,1,-0.102458,0.751312,0.052945,-0.063175,-0.738349,0.324254,0.594351,-0.436006,0.860143,...,-0.962797,-0.120437,-0.275296,-0.330930,-0.613480,-0.983612,0.993938,-3.253364e-02,0.280994,0.421303
9,1,-0.355700,0.629268,-0.324685,0.365296,-0.611271,0.228094,0.545079,-0.128297,0.588963,...,-0.993842,-0.266877,-0.104110,-0.082733,-0.245762,-0.946366,1.243997,2.275627e-01,0.294945,0.487482


In [9]:
##TRAIN A CLASSIFIER

from sklearn.linear_model import SGDClassifier
X = train.drop("y", axis=1)
y = train["y"]
random.seed(123)
clf = SGDClassifier()
clf.fit(X, y) 

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [10]:
##### LOAD THE TEST DATA, USE THE MODEL TO PREDICT THE SENTIMENTS AND SAVE THE PREDICTIONS

def load_and_prepare_test_data(data_path,vocab, we):
    """Loads data and returns ids (event ids) and X (feature vector)"""
    f = open(data_path, encoding='utf-8')
    test = f.readlines()
    tweets = []
    ids = []
    for str in test:
        id, tweet = str.split(',',1)
        tweets.append(tweet)
        ids.append(id)
    test = pd.DataFrame(tweets,ids,columns=['Tweets'])
    test["w"] = test["Tweets"].apply(lambda x: representation(x, we, vocab))
    test[col] = test["w"].apply(pd.Series)
    test.drop("w", axis=1, inplace = True)
    test.drop("Tweets", axis=1, inplace = True)
    
    drops = test[test.isnull().any(axis=1)].index
    #for the tweets which do not have any words used more than 5 times in the training dataset, predict 1
    test.fillna(1, inplace=True)
    
    return test,drops

test, drops = load_and_prepare_test_data('Datasets/test_data.txt', vocab, we)
test["Prediction"] = clf.predict(test)
#test["Id"] = test.index
test["Prediction"].to_csv("Submissions/submission" + time + ".csv", header= True, index_label = "Id")

NameError: name 'col' is not defined