In [3]:
import numpy as np
import gensim.downloader as api
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Initialization

In [4]:
def load_dataset(neg_path='twitter-datasets/train_neg_full.txt',
                 pos_path='twitter-datasets/train_pos_full.txt'):
    """Load the negative and positive tweet files into parallel NumPy arrays.

    Every line of a file is one tweet; trailing whitespace (including the
    newline) is stripped. Tweets read from ``neg_path`` are labelled 0 and
    tweets read from ``pos_path`` are labelled 1, negatives first.

    Parameters
    ----------
    neg_path : str or path-like, optional
        UTF-8 text file holding one negative tweet per line.
    pos_path : str or path-like, optional
        UTF-8 text file holding one positive tweet per line.

    Returns
    -------
    tweets : np.ndarray
        The tweet strings, in file order.
    labels : np.ndarray
        The 0 / 1 label of each tweet, same length as ``tweets``.

    Raises
    ------
    OSError
        If one of the files cannot be opened.
    """
    tweets = []
    labels = []

    for path, label in ((neg_path, 0), (pos_path, 1)):
        count_before = len(tweets)
        with open(path, 'r', encoding='utf-8') as f:
            tweets.extend(line.rstrip() for line in f)
        # One label per tweet that was just read from this file
        labels.extend([label] * (len(tweets) - count_before))

    # Convert to NumPy arrays so both can be indexed with arrays of indices
    tweets = np.array(tweets)
    labels = np.array(labels)

    print(f'{len(tweets)} tweets loaded')

    return tweets, labels

In [5]:
def split_dataset(tweets, train_fraction=0.9, seed=1):
    """Randomly partition the dataset positions into train and validation indices.

    Parameters
    ----------
    tweets : sequence
        The dataset; only its length is used.
    train_fraction : float, optional
        Share of the samples assigned to the training set, in [0, 1].
        The training size is ``int(train_fraction * len(tweets))``.
    seed : int, optional
        Seed for the shuffle, so the same split is produced on every run.

    Returns
    -------
    train_indices : np.ndarray
        Shuffled indices of the training samples.
    val_indices : np.ndarray
        Shuffled indices of the remaining (validation) samples.

    Raises
    ------
    ValueError
        If ``train_fraction`` is outside [0, 1].
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(f'train_fraction must be in [0, 1], got {train_fraction}')

    # Reproducibility! Note that this seeds NumPy's *global* generator, which
    # is kept so code running afterwards sees the same random state as before.
    np.random.seed(seed)

    n_samples = len(tweets)
    shuffled_indices = np.random.permutation(n_samples)
    split_idx = int(train_fraction * n_samples)
    train_indices = shuffled_indices[:split_idx]
    val_indices = shuffled_indices[split_idx:]

    print("Size train set:", len(train_indices))
    print("Size validation set:", len(val_indices))

    return train_indices, val_indices

In [6]:
# Read every tweet with its label, then draw the train / validation index split.
tweets, labels = load_dataset()
train_indices, val_indices = split_dataset(tweets)

# Labels are sliced once here and shared by every model trained below.
Y_train = labels[train_indices]
Y_val = labels[val_indices]

2500000 tweets loaded
Size train set: 2250000
Size validation set: 250000


# Models

In [7]:
class Models:
    """Tweet classifiers built on averaged pretrained word vectors.

    A tweet is represented by the mean of the vectors of its
    whitespace-separated words that are present in the embedding; the
    resulting feature matrix is then fed to a scikit-learn classifier.

    Parameters
    ----------
    embedding : object
        Word-vector store. It must support ``len()``, ``has_index_for(word)``
        and ``get_vector(word)`` (the methods this class calls on it).
    dimension : int, optional
        Length of each word vector; used to size the feature matrix.

    Raises
    ------
    ValueError
        If ``embedding`` exposes a ``vector_size`` attribute that differs
        from ``dimension``.
    """

    def __init__(self, embedding, dimension=100):
        # A wrong `dimension` would otherwise only show up later as a NumPy
        # broadcasting error while summing vectors. Embeddings that do not
        # expose `vector_size` simply skip this check.
        actual_dimension = getattr(embedding, 'vector_size', None)
        if actual_dimension is not None and actual_dimension != dimension:
            raise ValueError(
                f'dimension={dimension} does not match the embedding '
                f'vector size ({actual_dimension})'
            )

        self.dimension = dimension

        self.embedding_index = embedding

        print('Loading vectors with dimension', dimension)
        print('Found %s word vectors' % len(self.embedding_index))

    def create_tweet_embeddings(self, tweets, train_idx, val_idx):
        """Embed every tweet as the mean of its known word vectors.

        Parameters
        ----------
        tweets : iterable of str
            The tweets; each one is tokenised with ``str.split()``.
        train_idx, val_idx : array-like of int
            Row indices selecting the training and validation tweets.

        Returns
        -------
        X_train, X_val : np.ndarray
            Feature matrices with ``self.dimension`` columns. A tweet without
            any word found in the embedding keeps an all-zero row.
        """
        X = np.zeros((len(tweets), self.dimension))

        # Bound once: both lookups run for every word of every tweet.
        has_word = self.embedding_index.has_index_for
        get_vector = self.embedding_index.get_vector

        for i, tw in enumerate(tweets):
            word_count = 0

            for word in tw.split():
                if has_word(word):
                    X[i] += get_vector(word)
                    word_count += 1

            # Skip the division for tweets with no known word (row stays zero)
            if word_count:
                X[i] /= word_count

        print("Tweet embeddings created")

        return X[train_idx], X[val_idx]

    @staticmethod
    def _fit_and_score(model, X_train, Y_train, X_val, Y_val):
        """Fit ``model`` on the training data and return (train, val) accuracy."""
        model.fit(X_train, Y_train)

        print("Model trained")

        train_accuracy = (model.predict(X_train) == Y_train).mean()
        val_accuracy = (model.predict(X_val) == Y_val).mean()

        return train_accuracy, val_accuracy

    def logistic_regression(self, X_train, Y_train, X_val, Y_val):
        """Train a logistic regression and return (train, validation) accuracy.

        ``C=1e5`` is scikit-learn's inverse regularisation strength, so the
        model is only very weakly regularised.
        """
        model = LogisticRegression(random_state=0, C=1e5, max_iter=300)
        return self._fit_and_score(model, X_train, Y_train, X_val, Y_val)

    def mlp(self, X_train, Y_train, X_val, Y_val):
        """Train a multi-layer perceptron and return (train, validation) accuracy.

        ``alpha=0.7`` sets the strength of scikit-learn's L2 penalty term; all
        other hyper-parameters are left at the library defaults.
        """
        model = MLPClassifier(random_state=0, alpha=0.7, max_iter=400)
        return self._fit_and_score(model, X_train, Y_train, X_val, Y_val)

# Glove 100

In [18]:
# Fetch the pretrained 'glove-twitter-100' vectors through gensim's downloader API.
glove_embedding = api.load('glove-twitter-100')



In [19]:
# `dimension` sizes the tweet feature matrix, so it has to equal the length of
# the loaded word vectors (100 for this embedding).
glove_model = Models(embedding=glove_embedding, dimension=100)


Loading vectors with dimension 100
Found 1193514 word vectors


In [20]:
# Average the word vectors of each tweet and keep the train / validation rows.
# X_train and X_val are shared names: every embedding section rebinds them.
X_train, X_val = glove_model.create_tweet_embeddings(
    tweets,
    train_indices,
    val_indices,
)
print('Shape X_train:', X_train.shape)
print('Shape X_val:', X_val.shape)

Tweet embeddings created
Shape X_train: (2250000, 100)
Shape X_val: (250000, 100)


## Logistic regression

In [21]:
# Logistic regression on the 100-dimensional GloVe tweet features.
train_accuracy, val_accuracy = glove_model.logistic_regression(
    X_train, Y_train,
    X_val, Y_val,
)

print(f'Accuracy (training set): {train_accuracy:.05f}')
print(f'Accuracy (validation set): {val_accuracy:.05f}')

Model trained
Accuracy (training set): 0.76393
Accuracy (validation set): 0.76321


## MLP

In [22]:
# Multi-layer perceptron on the same 100-dimensional GloVe tweet features.
train_accuracy, val_accuracy = glove_model.mlp(
    X_train, Y_train,
    X_val, Y_val,
)

print(f'Accuracy (training set): {train_accuracy:.05f}')
print(f'Accuracy (validation set): {val_accuracy:.05f}')

Model trained
Accuracy (training set): 0.77693
Accuracy (validation set): 0.77805


# Glove 200

In [13]:
# Fetch the pretrained 'glove-twitter-200' vectors through gensim's downloader API.
# Note: this reuses the name `glove_embedding` that the 100-dimensional section
# also assigns, so only the most recently loaded set stays bound to it.
glove_embedding = api.load('glove-twitter-200')



In [14]:
# `dimension` sizes the tweet feature matrix, so it has to equal the length of
# the loaded word vectors (200 for this embedding).
glove_model = Models(embedding=glove_embedding, dimension=200)


Loading vectors with dimension 200
Found 1193514 word vectors


In [15]:
# Average the word vectors of each tweet and keep the train / validation rows.
# X_train and X_val are shared names: every embedding section rebinds them.
X_train, X_val = glove_model.create_tweet_embeddings(
    tweets,
    train_indices,
    val_indices,
)
print('Shape X_train:', X_train.shape)
print('Shape X_val:', X_val.shape)

Tweet embeddings created
Shape X_train: (2250000, 200)
Shape X_val: (250000, 200)


## Logistic regression

In [16]:
# Logistic regression on the 200-dimensional GloVe tweet features.
train_accuracy, val_accuracy = glove_model.logistic_regression(
    X_train, Y_train,
    X_val, Y_val,
)

print(f'Accuracy (training set): {train_accuracy:.05f}')
print(f'Accuracy (validation set): {val_accuracy:.05f}')

Model trained
Accuracy (training set): 0.78214
Accuracy (validation set): 0.78108


## MLP

In [17]:
# Multi-layer perceptron on the same 200-dimensional GloVe tweet features.
train_accuracy, val_accuracy = glove_model.mlp(
    X_train, Y_train,
    X_val, Y_val,
)

print(f'Accuracy (training set): {train_accuracy:.05f}')
print(f'Accuracy (validation set): {val_accuracy:.05f}')

Model trained
Accuracy (training set): 0.79331
Accuracy (validation set): 0.79408


# Word2Vec

In [8]:
# Fetch the pretrained 'word2vec-google-news-300' vectors through gensim's downloader API.
word2vec_embedding = api.load('word2vec-google-news-300')



In [9]:
# `dimension` sizes the tweet feature matrix, so it has to equal the length of
# the loaded word vectors (300 for this embedding).
word2vec_model = Models(embedding=word2vec_embedding, dimension=300)

Loading vectors with dimension 300
Found 3000000 word vectors


In [10]:
# Average the word vectors of each tweet and keep the train / validation rows.
# X_train and X_val are shared names: every embedding section rebinds them.
X_train, X_val = word2vec_model.create_tweet_embeddings(
    tweets,
    train_indices,
    val_indices,
)
print('Shape X_train:', X_train.shape)
print('Shape X_val:', X_val.shape)

Tweet embeddings created
Shape X_train: (2250000, 300)
Shape X_val: (250000, 300)


## Logistic regression

In [11]:
# Logistic regression on the 300-dimensional Word2Vec tweet features.
train_accuracy, val_accuracy = word2vec_model.logistic_regression(
    X_train, Y_train,
    X_val, Y_val,
)

print(f'Accuracy (training set): {train_accuracy:.05f}')
print(f'Accuracy (validation set): {val_accuracy:.05f}')

Model trained
Accuracy (training set): 0.74975
Accuracy (validation set): 0.74992


## MLP

In [12]:
# Multi-layer perceptron on the same 300-dimensional Word2Vec tweet features.
train_accuracy, val_accuracy = word2vec_model.mlp(
    X_train, Y_train,
    X_val, Y_val,
)

print(f'Accuracy (training set): {train_accuracy:.05f}')
print(f'Accuracy (validation set): {val_accuracy:.05f}')

Model trained
Accuracy (training set): 0.74993
Accuracy (validation set): 0.74994
