In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from Preprocessing.to_embedding import WordEmbedding
from Preprocessing.data_format import formatting
from Preprocessing.helper_functions import import_embedding, embedding_matrix_word2vec
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# Hyperparameters used by the later vectorization / embedding cells.
embedding_size = 300  # number of feature weights in embeddings
max_len = 400

# Load the training reviews through the project's formatting helper.
data = formatting("phase1_movie_reviews-train.csv")

# One indicator column per distinct value of the 'polarity' column.
y = pd.get_dummies(data['polarity'])

# Hold out 10% of the rows as a dev set; the fixed seed keeps the split repeatable.
X_train, X_dev, y_train, y_dev = train_test_split(
    data['reviewText'],
    y,
    test_size=0.10,
    random_state=42,
)

## 2. Vectorize text data

In [None]:
# Basic vectorization of the review text.
# The tokenizer is fitted on X_train only; X_dev is never passed to fit_on_texts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
# Keep a handle on the tokenizer's word_index; its length is printed as the
# number of unique tokens further down in this cell.
word_index = tokenizer.word_index

def vectorize(data, tokenizer, max_len):
    """Convert texts to padded sequences using an already-fitted tokenizer.

    Args:
        data: texts handed to ``tokenizer.texts_to_sequences``.
        tokenizer: object providing ``texts_to_sequences``.
        max_len: forwarded to ``pad_sequences`` as ``maxlen``.

    Returns:
        Whatever ``pad_sequences`` returns for the tokenized texts.
    """
    return pad_sequences(tokenizer.texts_to_sequences(data), maxlen=max_len)

# Encode both splits with the tokenizer fitted on the training text.
# NOTE: X_train / X_dev are rebound from raw text to the padded output here, so
# re-running this cell would feed the already-encoded data back into vectorize.
X_train, X_dev = (
    vectorize(texts, tokenizer, max_len) for texts in (X_train, X_dev)
)

# Quick sanity report: vocabulary size and the shapes of the encoded splits.
print('Found %s unique tokens.' % len(word_index))
print('Shape of train tensor', X_train.shape)
print('Shape of dev tensor', X_dev.shape)

In [2]:
import torch

# Report the installed PyTorch version.
print(torch.__version__)

# Only ask for the device name when CUDA is actually usable: the availability
# check used to be computed and thrown away, and get_device_name(0) raises on a
# machine without a CUDA device instead of returning a value.
device_name = (
    torch.cuda.get_device_name(0)
    if torch.cuda.is_available()
    else 'CUDA not available'
)
# Last expression of the cell, so the notebook displays it as the cell output.
device_name

1.1.0


'GeForce RTX 2070'

In [3]:
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

# Read the train/dev/test CSV files under flair_data/ as a classification corpus.
corpus = NLPTaskDataFetcher.load_classification_corpus(
    Path('flair_data/'),
    test_file='test.csv',
    dev_file='dev.csv',
    train_file='train.csv',
)
# Shrink the corpus with downsample(0.1) -- presumably keeps ~10% of the data
# for faster experiments; confirm against the flair version in use.
corpus = corpus.downsample(0.1)

# Token-level embeddings handed to the document embedding: the 'glove' word
# embeddings plus the 'news-forward-fast' and 'news-backward-fast' Flair embeddings.
word_embeddings = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast'),
]

# LSTM-based document embedding built on top of the token embeddings.
document_embeddings = DocumentLSTMEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# Single-label text classifier; the label dictionary is derived from the corpus.
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=corpus.make_label_dictionary(),
    multi_label=False,
)

trainer = ModelTrainer(classifier, corpus)

2019-05-03 13:47:10,352 Reading data from flair_data
2019-05-03 13:47:10,353 Train: flair_data/train.csv
2019-05-03 13:47:10,353 Dev: flair_data/dev.csv
2019-05-03 13:47:10,353 Test: flair_data/test.csv


  if __name__ == '__main__':


In [None]:
# Train for at most 10 epochs with mini-batches of 4 examples.
# './' is the first positional argument -- presumably the directory where flair
# writes its training artifacts; confirm against ModelTrainer.train.
trainer.train(
    './',
    max_epochs=10,
    mini_batch_size=4,
)

2019-05-03 13:52:28,276 ----------------------------------------------------------------------------------------------------
2019-05-03 13:52:28,276 Evaluation method: MICRO_F1_SCORE
2019-05-03 13:52:28,277 ----------------------------------------------------------------------------------------------------
2019-05-03 13:52:28,586 epoch 1 - iter 0/1800 - loss 0.16469663
2019-05-03 13:53:36,934 epoch 1 - iter 180/1800 - loss 0.19226121
2019-05-03 13:54:49,481 epoch 1 - iter 360/1800 - loss 0.18357563
2019-05-03 13:56:01,184 epoch 1 - iter 540/1800 - loss 0.18081521
2019-05-03 13:57:11,476 epoch 1 - iter 720/1800 - loss 0.18064812
