In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from nltk.tokenize import word_tokenize
import preprocessor as p
from nltk.corpus import stopwords
import string
import re
from nltk.stem.snowball import EnglishStemmer
import pickle
from gensim import models
from gensim.models import Word2Vec
from tqdm import tqdm

In [2]:
class TfidfEmbeddingVectorizer(object):
    """Embed documents as the mean of their IDF-weighted word vectors.

    Parameters
    ----------
    model : mapping of token -> 1-D numpy vector
        Anything supporting ``token in model`` and ``model[token]``.

    Attributes
    ----------
    modelweight : defaultdict or None
        token -> IDF weight learned by ``fit``; tokens never seen during
        ``fit`` get the largest IDF observed. ``None`` until ``fit`` is called.
    dim : int
        Width of the vectors in ``model`` (falls back to 200 when it cannot
        be inferred, e.g. for an empty model).

    Documents may be given either as token sequences or as plain strings;
    strings are split on whitespace so that they are not iterated character
    by character.
    """

    def __init__(self, model):
        self.model = model
        self.modelweight = None
        self.dim = self._infer_dim(model)

    @staticmethod
    def _infer_dim(model, default=200):
        """Return the vector width of ``model``, or ``default`` if unknown."""
        size = getattr(model, "vector_size", None)
        if isinstance(size, int) and size > 0:
            return size
        try:
            first = next(iter(model.values()))
        except (AttributeError, TypeError, StopIteration):
            return default
        shape = np.shape(first)
        if len(shape) == 1 and shape[0] > 0:
            return int(shape[0])
        return default

    @staticmethod
    def _tokens(doc):
        """Return ``doc`` as a token sequence (whitespace-split if it is a str)."""
        return doc.split() if isinstance(doc, str) else doc

    def fit(self, X):
        """Learn per-token IDF weights from the documents in ``X``; returns self."""
        docs = [self._tokens(doc) for doc in X]
        # Documents are already tokenized, so the analyzer is the identity.
        tfidf = TfidfVectorizer(analyzer=lambda tokens: tokens)
        tfidf.fit(docs)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.modelweight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return an array with one ``dim``-wide row per document in ``X``.

        A document with no token present in ``model`` maps to a zero vector.
        """
        rows = []
        for doc in X:
            weighted = [self.model[w] * self.modelweight[w]
                        for w in self._tokens(doc) if w in self.model]
            rows.append(np.mean(weighted, axis=0) if weighted
                        else np.zeros(self.dim))
        if not rows:
            # Keep the result 2-D so it can still be concatenated column-wise.
            return np.zeros((0, self.dim))
        return np.array(rows)

In [3]:
def word_embeddings(tweets, embedding, glove_path="./glove.twitter.27B.200d.txt"):
    """Build a TF-IDF-weighted mean-embedding matrix for ``tweets``.

    Parameters
    ----------
    tweets : iterable of str
        Whitespace-separated tweet texts.
    embedding : {"word2vec", "glove"}
        ``"word2vec"`` trains a 200-d gensim Word2Vec model on ``tweets``;
        ``"glove"`` loads vectors from the text file at ``glove_path``.
    glove_path : str, optional
        Location of the GloVe text file (one token followed by its values
        per line). Only read when ``embedding == "glove"``.

    Returns
    -------
    numpy.ndarray
        One row per tweet, as produced by ``TfidfEmbeddingVectorizer``.

    Raises
    ------
    ValueError
        If ``embedding`` is not one of the supported names.
    """
    # Tokenize once: the vectorizer must see word lists, not raw strings
    # (a raw string would be iterated character by character).
    tokenized = word2vec(tweets)

    if embedding == "word2vec":
        w2v = models.Word2Vec(tokenized, vector_size=200, window=5, sg=0)
        model = dict(zip(w2v.wv.index_to_key, w2v.wv.vectors))

    elif embedding == "glove":
        model = {}
        # Text mode so the keys are str and can match the str tokens.
        with open(glove_path, "r", encoding="utf-8") as lines:
            for line in lines:
                # Split on single spaces only, so tokens that are themselves
                # unusual whitespace characters are not swallowed.
                parts = line.rstrip("\n").split(" ")
                if len(parts) < 2:
                    continue
                # Materialize a numeric vector (a bare map() object would be
                # wrapped by np.array as a single opaque object).
                model[parts[0]] = np.asarray(parts[1:], dtype=float)

    else:
        raise ValueError(
            "unknown embedding %r; expected 'word2vec' or 'glove'" % (embedding,))

    vec = TfidfEmbeddingVectorizer(model)
    vec.fit(tokenized)
    matrix = vec.transform(tokenized)

    return matrix

In [4]:
def word2vec(tweets):
    """Split every tweet on whitespace and return the resulting token lists."""
    return [tweet.split() for tweet in tweets]

In [5]:
# Load the preprocessed tweets of each split and keep only the 'Text' column.
# NOTE(review): engine="python-fwf" is not among the engines listed in the
# read_table documentation; presumably it is used so each line lands intact
# in a single column -- confirm before swapping the reader.
train_text = pd.read_table(
    '../dataFinal/preprocessed_train_text.txt', engine="python-fwf"
)['Text']
print(train_text.loc[0])  # first training tweet, as a sanity check

test_text = pd.read_table(
    '../dataFinal/preprocessed_test_text.txt', engine="python-fwf"
)['Text']
print(test_text.loc[0])  # first test tweet

trial_text = pd.read_table(
    '../dataFinal/preprocessed_trial_text.txt', engine="python-fwf"
)['Text']
print(trial_text.loc[0])  # first trial tweet

tweet thursday night party next got
tweet summer squash wave real kabob
tweet still favorite muggle carytown cupcakes


In [6]:
def _read_label_ids(path):
    """Read one integer class id per line from the label file at ``path``.

    The file is closed as soon as it has been read. The whole leading token
    of each line is parsed, so ids with more than one digit are kept intact
    (taking only the first character would turn e.g. "12" into 1).
    """
    with open(path, "r") as handle:
        lines = handle.readlines()
    return [int(line.split()[0]) for line in tqdm(lines)]


# Labels for each split: a plain list of ints plus an int8 Series view of it.
train_emoji = _read_label_ids("../dataFinal/finalTrainLabels.labels")
train_labels = pd.Series(np.array(train_emoji).astype('int8'))
print(train_labels.loc[0])

test_emoji = _read_label_ids("../dataFinal/finalTestLabels.labels")
test_labels = pd.Series(np.array(test_emoji).astype('int8'))
print(test_labels.loc[0])

trial_emoji = _read_label_ids("../dataFinal/finalDevLabels.labels")
trial_labels = pd.Series(np.array(trial_emoji).astype('int8'))
print(trial_labels.loc[0])

100%|██████████| 69992/69992 [00:00<00:00, 1360993.81it/s]


4


100%|██████████| 20000/20000 [00:00<00:00, 1582606.92it/s]


6


100%|██████████| 10008/10008 [00:00<00:00, 1302609.60it/s]

7





In [7]:
# Embedding source handed to word_embeddings ("glove" or "word2vec").
embedding = "glove"

# Every call builds its own embedding table and fits its own TF-IDF weights
# on the split it receives.
emb_train = word_embeddings(tweets=train_text, embedding=embedding)
emb_trial = word_embeddings(tweets=trial_text, embedding=embedding)
emb_test = word_embeddings(tweets=test_text, embedding=embedding)

In [8]:
# TF-IDF over word 1- to 3-grams, capped at 2000 features.
vec = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=2000,
    min_df=1,
    decode_error='ignore',
)

# The vectorizer is fitted on the training split only; the trial and test
# splits are projected with that fitted vectorizer. Results are densified so
# they can be concatenated with the embedding matrices.
bow_train = vec.fit_transform(train_text).toarray()
bow_trial = vec.transform(trial_text).toarray()
bow_test = vec.transform(test_text).toarray()

In [9]:
# Final feature matrices: embedding columns first, then the TF-IDF n-gram
# columns, joined side by side for each split.
train = np.concatenate([emb_train, bow_train], axis=1)
trial = np.concatenate([emb_trial, bow_trial], axis=1)
test = np.concatenate([emb_test, bow_test], axis=1)

In [10]:
# Persist the combined feature matrices; np.save appends the ".npy" suffix.
for stem, features in (
    ('glove200_t2_train', train),
    ('glove200_t2_trial', trial),
    ('glove200_t2_test', test),
):
    np.save(stem, features)