In [320]:
import numpy as np
import pandas as pd
from scipy import spatial
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import string
import numpy as np
import random
from sklearn.metrics import classification_report,accuracy_score,balanced_accuracy_score

# 1. TEXT CLASSIFICATION: IMDB data

# Load data

In [321]:
data    = pd.read_csv("imdb.csv",index_col=0)
X_train,X_test,y_train,y_test = ?

#### Preprocessing

In [322]:
categories      = ["Family","Sci-Fi","Thriller", "Romance"]
n_classes       = len(categories)
train_texts     = list(X_train["Plot"].values)
test_texts      = list(X_test["Plot"].values)
train_labels_genre=list(y_train.values)
test_labels_genre=list(y_test.values)
le = preprocessing.LabelEncoder()
le.fit(categories)
train_labels = le.transform(train_labels_genre)
test_labels = le.transform(test_labels_genre)
print("Class balances:")
for i,c in enumerate(categories):
    print(c,np.mean(train_labels==i))

Family 0.14511424404472534
Sci-Fi 0.33908604764219735
Thriller 0.1276130286825474
Romance 0.3881866796305299


# GloVe

#### Cleaning the text for GloVe:

In [1]:
def clean(text):
    return text.translate(str.maketrans('', '', string.punctuation)).lower()

#### Converting the words to vectors and take the average for each movie plot

# PCA of movie plots

# Classify genres based on the first n principal components of GloVe vectors

# FastText Classification using 3-gram embeddings

In [333]:
#########################################################################################################
# Adapted from: https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
#########################################################################################################

import numpy as np
import random
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import time
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import TextClassificationDataset
from torchtext.vocab import Vocab
from torchtext.vocab import build_vocab_from_iterator

#####################################################################################################################
# Auxilary functions
#####################################################################################################################

tokenizer = get_tokenizer("basic_english")


def token_iterator(texts, ngrams):
    for text in texts:
        tokens = tokenizer(text)
        yield ngrams_iterator(tokens, ngrams)


def construct_vocab(texts, ngrams):
    vocab = build_vocab_from_iterator(token_iterator(texts, ngrams))
    return vocab


def text_to_tensor(text, vocab, ngrams):
    tokens = ngrams_iterator(tokenizer(text), ngrams=ngrams)
    token_ids = list(filter(lambda x: x is not Vocab.UNK, [vocab[token] for token in tokens]))
    tokens = torch.tensor(token_ids)
    return tokens


def make_torchdataset(vocab, texts, labels, ngrams):
    tokens = [text_to_tensor(text, vocab, ngrams) for text in tqdm(texts)]
    pairs = list(zip(labels, tokens))
    return TextClassificationDataset(vocab, pairs, set(labels))

def generate_batch(batch):
    label = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    # torch.Tensor.cumsum returns the cumulative sum
    # of elements in the dimension dim.
    # torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0)

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    return text, offsets, label

#####################################################################################################################
# Model
#####################################################################################################################


class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

#####################################################################################################################
# FastText
#####################################################################################################################


class FastText(object):

    def __init__(self, texts, labels, embed_dim, ngrams=3, num_epochs=5, seed=0):

        # set seed
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)


        self.texts = texts
        self.labels = labels
        self.embed_dim = embed_dim
        self.ngrams = ngrams

        # construct vocab
        print('Constructing vocabulary...')
        self.vocab = construct_vocab(texts, ngrams)
        self.vocab_size = len(self.vocab)

        # prepare dataset
        print('Preparing dataset...')
        self.train_dataset = make_torchdataset(self.vocab, texts, labels, ngrams)
        self.num_classes = len(self.train_dataset.get_labels())

        # prepare device ref and model
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = TextClassificationModel(self.vocab_size, self.embed_dim, self.num_classes).to(self.device)

        # loss function & optimization
        self.criterion = torch.nn.CrossEntropyLoss().to(self.device)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=4.0)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, 1, gamma=0.9)
        self.batch_size = 16

        self.tokenizer = get_tokenizer("basic_english")
        self.ngrams = ngrams



        if num_epochs > 0:
            print('Training model...')
            self.train(self.train_dataset, num_epochs)


    def train_step(self, sub_train_):

        # Train the model
        train_loss = 0
        train_acc = 0
        data = DataLoader(sub_train_, batch_size=self.batch_size, shuffle=True, collate_fn=generate_batch)
        for i, (text, offsets, cls) in enumerate(data):
            self.optimizer.zero_grad()
            text, offsets, cls = text.to(self.device), offsets.to(self.device), cls.to(self.device)
            output = self.model(text, offsets)
            loss = self.criterion(output, cls)
            train_loss += loss.item()
            loss.backward()
            self.optimizer.step()
            train_acc += (output.argmax(1) == cls).sum().item()

        # Adjust the learning rate
        self.scheduler.step()

        return train_loss / len(sub_train_),  train_acc / len(sub_train_)

    def compute_loss(self, data_):
        loss = 0
        acc = 0
        data = DataLoader(data_, batch_size=self.batch_size, collate_fn=generate_batch)
        for text, offsets, cls in data:
            text, offsets, cls = text.to(self.device), offsets.to(self.device), cls.to(self.device)
            with torch.no_grad():
                output = self.model(text, offsets)
                loss = self.criterion(output, cls)
                loss += loss.item()
                acc += (output.argmax(1) == cls).sum().item()

        return loss / len(data_), acc / len(data_)

    def train(self, train_dataset, n_epochs=5):

        min_valid_loss = float('inf')

        for epoch in range(n_epochs):
            start_time = time.time()
            train_loss, train_acc  = self.train_step(train_dataset)

            secs = int(time.time() - start_time)
            mins = secs / 60
            secs = secs % 60

            print('Epoch: %d' % (epoch + 1), " | time in %d minutes, %d seconds" % (mins, secs))
            print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
        print('')

    def predict(self, text_, return_prob=False):

        with torch.no_grad():
            text = text_to_tensor(text_, self.vocab, self.ngrams)
            output = self.model(text, torch.tensor([0]))

            if return_prob:
                return F.softmax(output, 1).detach().numpy()
            else:
                return output.argmax(1).item()

    def get_text_embedding(self, text_):
        with torch.no_grad():
            text = text_to_tensor(text_, self.vocab, self.ngrams)
            return self.model.embedding(text, offsets=torch.LongTensor([0])).detach().numpy()

    def word_in_vocab(self, word):
        return word in self.vocab.stoi





In [334]:
# Usage example:
model = FastText(train_texts,train_labels,embed_dim=100,num_epochs=10)

1021lines [00:00, 10202.66lines/s]Constructing vocabulary...
4114lines [00:00, 11792.53lines/s]
 18%|█▊        | 739/4114 [00:00<00:00, 7383.58it/s]Preparing dataset...
100%|██████████| 4114/4114 [00:00<00:00, 6135.94it/s]
Training model...
Epoch: 1  | time in 0 minutes, 1 seconds
	Loss: 0.0815(train)	|	Acc: 42.7%(train)
Epoch: 2  | time in 0 minutes, 1 seconds
	Loss: 0.0600(train)	|	Acc: 59.9%(train)
Epoch: 3  | time in 0 minutes, 1 seconds
	Loss: 0.0371(train)	|	Acc: 80.1%(train)
Epoch: 4  | time in 0 minutes, 1 seconds
	Loss: 0.0207(train)	|	Acc: 92.4%(train)
Epoch: 5  | time in 0 minutes, 1 seconds
	Loss: 0.0114(train)	|	Acc: 98.2%(train)
Epoch: 6  | time in 0 minutes, 1 seconds
	Loss: 0.0072(train)	|	Acc: 99.7%(train)
Epoch: 7  | time in 0 minutes, 1 seconds
	Loss: 0.0050(train)	|	Acc: 99.8%(train)
Epoch: 8  | time in 0 minutes, 1 seconds
	Loss: 0.0038(train)	|	Acc: 99.9%(train)
Epoch: 9  | time in 0 minutes, 1 seconds
	Loss: 0.0031(train)	|	Acc: 99.9%(train)
Epoch: 10  | time in 

# Classification on test set

# PCA on FastText embeddings

# Write your own plot and see what genre GloVe+PCA+classifier and FastText guess it is

# Optional: Do classification/regression on another variable in the dataset, such as predicting movie ratings based on year and plot.

# 2. SENTIMENT ANALYSIS: The Donald

In [339]:
text = np.loadtxt("Donald.txt",dtype='str')
text_cleaned = []
for word in text:
    
    # to lower
    doc_cleaned = word.lower()
    
    #doc_cleaned = ' '.join([word for word in doc_cleaned.split() if word not in stop_words])    
    doc_cleaned = doc_cleaned.replace('.', '')
    doc_cleaned = doc_cleaned.replace(',', '')
    doc_cleaned = doc_cleaned.replace('?', '')
    doc_cleaned = doc_cleaned.replace('!', '')
    text_cleaned.append(doc_cleaned)


In [340]:
print(text_cleaned)

['Chief' 'Justice' 'Roberts,' ... 'God' 'bless' 'America.']
['chief', 'justice', 'roberts', 'president', 'carter', 'president', 'clinton', 'president', 'bush', 'president', 'obama', 'fellow', 'americans', 'and', 'people', 'of', 'the', 'world', 'thank', 'you', 'we', 'the', 'citizens', 'of', 'america', 'are', 'now', 'joined', 'in', 'a', 'great', 'national', 'effort', 'to', 'rebuild', 'our', 'country', 'and', 'restore', 'its', 'promise', 'for', 'all', 'of', 'our', 'people', 'together', 'we', 'will', 'determine', 'the', 'course', 'of', 'america', 'and', 'the', 'world', 'for', 'many', 'many', 'years', 'to', 'come', 'we', 'will', 'face', 'challenges', 'we', 'will', 'confront', 'hardships', 'but', 'we', 'will', 'get', 'the', 'job', 'done', 'every', 'four', 'years', 'we', 'gather', 'on', 'these', 'steps', 'to', 'carry', 'out', 'the', 'orderly', 'and', 'peaceful', 'transfer', 'of', 'power', 'and', 'we', 'are', 'grateful', 'to', 'president', 'obama', 'and', 'first', 'lady', 'michelle', 'obama', 

# Calculate window-wise sentiment and have the window size and stride as variables that can easily be changed

# Plot the sentiment over time

# Low pass filter using a smoothing filter

# Optional: multiply the sentiment-lexicon with a positive constant of your own choosing and comment on what effects it has on the sentiment analysis plot. 