In [None]:
import os
import re
import nltk
import random
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm 
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from gensim.models import word2vec
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK "stopwords" corpus; stopwords.words("english") in a later cell reads it.
nltk.download('stopwords')

# Restrict CUDA-based libraries imported after this point to the device with index 1.
# NOTE(review): hard-coded device index — confirm the target machine actually has a second GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# Disabled alternative tokenizer (not referenced by the cells visible in this notebook):
# from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
# tokenizer = TweetTokenizer()


In [None]:
# NOTE(review): VECTOR_SIZE is not referenced by any cell visible here; presumably it is the
# dimensionality of the word2vec model ('dimension_300_...') loaded below — confirm.
VECTOR_SIZE = 300

# English stopwords from NLTK; get_cutted_sentences() drops every token found in this list.
stopword_list = stopwords.words("english")
print(stopword_list)


In [None]:
def get_cutted_sentences(raw_lines, stop_words=None):
    """Turn raw text lines into lists of lowercase, stopword-free tokens.

    For every line: surrounding whitespace is stripped, spaced-out apostrophes
    (" ' ") are re-attached, every character other than an ASCII letter or an
    apostrophe is replaced by a space, and the result is lowercased and split
    on whitespace. Tokens that are stopwords or only one character long are
    dropped.

    Args:
        raw_lines: iterable of strings (e.g. a list of texts or an open text file).
        stop_words: optional collection of words to remove. When omitted, the
            notebook-level ``stopword_list`` is used.

    Returns:
        A list holding one token list per input line (a token list may be empty).

    Side effects:
        Prints the number of processed lines.
    """
    if stop_words is None:
        stop_words = stopword_list
    # Build the lookup set once: membership tests on a list are linear per token.
    blocked_words = frozenset(stop_words)

    sentences = []
    for line in raw_lines:
        line = line.strip().replace(" ' ", "'")
        line = re.sub("[^a-zA-Z']", " ", line)
        sentences.append(
            [word for word in line.lower().split()
             if len(word) > 1 and word not in blocked_words]
        )

    print(len(sentences))
    return sentences
    

In [None]:
# Tokenize every line of the unlabeled corpus.
no_labeled_path = os.path.join(os.getcwd(), "data", "training_nolabel.csv")
total_sentences = None

# The open file yields the same newline-terminated lines as readlines(),
# but one at a time instead of materialising the whole list first.
with open(no_labeled_path, 'r') as no_labeled_file:
    total_sentences = get_cutted_sentences(no_labeled_file)


In [None]:
# Load the word2vec model previously saved under "saved_model/".
word2vec_model_path = os.path.join(os.getcwd(), "saved_model", 'dimension_300_window_7_skip_gram')
word2vec_model = word2vec.Word2Vec.load(word2vec_model_path)

# All vector access goes through the KeyedVectors object (`.wv`): indexing the
# model itself and `model.most_similar` are deprecated and removed in gensim 4.
keyed_vectors = word2vec_model.wv
print(keyed_vectors.vectors.shape)

# gensim 4 replaced `wv.vocab` with `wv.key_to_index`; use whichever exists.
if hasattr(keyed_vectors, "key_to_index"):
    vocabulary = keyed_vectors.key_to_index
else:
    vocabulary = keyed_vectors.vocab
print(len(vocabulary))

# Quick sanity checks: one raw vector and the nearest neighbours of one word.
print(keyed_vectors['bye'])
print(keyed_vectors.most_similar('fever'))

In [None]:
# Parse the labeled training file: each line is "<label> +++$+++ <text>".
labeled_path = os.path.join(os.getcwd(), "data", "training_label.csv")
labeled_data = []

with open(labeled_path, 'r') as labeled_file:
    for line in labeled_file:
        if not line.strip():
            # A blank line carries no sample; unpacking it below would raise ValueError.
            continue
        # Split on the first delimiter only, so a text that itself contains
        # "+++$+++" is kept whole instead of breaking the two-value unpacking.
        label, text = line.split("+++$+++", 1)
        labeled_data.append([label.strip(), text.strip()])

labeled_dataframe = pd.DataFrame(labeled_data, columns=['Label', 'Text'])
labeled_dataframe.head()


In [None]:
# Parse the testing file: each data line is "<id>,<text>", where the text may contain commas.
testing_path = os.path.join(os.getcwd(), "data", "testing_data.csv")
testing_data = []

with open(testing_path, 'r') as testing_file:
    # Discard the first line (presumably the header row); a no-op on an empty file.
    next(testing_file, None)
    for line in testing_file:
        # Everything before the first comma is the id, everything after it is the text.
        testing_id, _comma, text = line.partition(",")
        testing_data.append([testing_id.strip(), text.strip()])

testing_dataframe = pd.DataFrame(testing_data, columns=['Id', 'Text'])
testing_dataframe.head()


In [None]:
# Hold out 10% of the labeled rows for validation; the fixed seed keeps the split repeatable.
training_frame, validation_frame = train_test_split(
    labeled_dataframe,
    test_size=0.1,
    random_state=42,
)

# Show the label distribution of each split (training first, then validation).
for split_frame in (training_frame, validation_frame):
    print(split_frame['Label'].value_counts())


In [None]:
def _to_clean_texts(frame):
    """Tokenize the 'Text' column of `frame` and re-join each row's tokens with single spaces."""
    token_lists = get_cutted_sentences(frame['Text'].tolist())
    return [" ".join(tokens) for tokens in token_lists]


# Training split: also report its size and preview the first cleaned texts.
training_x = _to_clean_texts(training_frame)
print(len(training_x))
print(training_x[:20])

# Validation and testing splits go through the same cleaning.
validation_x = _to_clean_texts(validation_frame)
testing_x = _to_clean_texts(testing_dataframe)


In [None]:
# Print the longest cleaned text, counted in whitespace-separated tokens, for the
# training, validation and testing splits (in that order). After the loop
# `max_length` holds the value of the last split processed, i.e. testing.
for cleaned_texts in (training_x, validation_x, testing_x):
    max_length = max(len(text.split()) for text in cleaned_texts)
    print(max_length)


In [None]:
# Series.as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0;
# `.values` yields the same NumPy array and exists in every pandas version.
training_y = training_frame['Label'].values
print(training_y.shape)
print(training_y[:20])

validation_y = validation_frame['Label'].values


In [None]:
import spacy
from texcla import corpus, data, experiment
from texcla.models import TokenModelFactory, AveragingEncoder
from texcla.preprocessing import SpacyTokenizer

# Hyperparameters for the texcla experiment cells that follow.
MAX_LEN = 30  # used as `max_len` for setup_data and as `max_tokens` for TokenModelFactory
N_GRAMS = 2  # `ngrams` argument of the training setup_data call
EMB_DIMS = 50  # `embedding_dims` passed to TokenModelFactory
EPOCHS = 5  # `epochs` passed to experiment.train
WORDS_LIMIT = 15000  # `limit_top_tokens` argument of the training setup_data call


In [None]:
# Build the preprocessed datasets once and cache them on disk; the next cell
# loads these .bin files, so a fresh checkout would fail without this step.
# Both files are rebuilt together because the validation call reuses the
# tokenizer returned by the training call.
if not (os.path.exists('twitter_train.bin') and os.path.exists('twitter_validation.bin')):
    # use the special tokenizer used for constructing the embeddings
    tokenizer = SpacyTokenizer()
    tokenizer = experiment.setup_data(
        training_x, training_y, tokenizer, 'twitter_train.bin', max_len=MAX_LEN, ngrams=N_GRAMS, limit_top_tokens=WORDS_LIMIT)
    experiment.setup_data(validation_x, validation_y, tokenizer, 'twitter_validation.bin', max_len=MAX_LEN)


In [None]:
# Load the preprocessed training / validation datasets from disk.
ds_train = data.Dataset.load('twitter_train.bin')
ds_val = data.Dataset.load('twitter_validation.bin')

# Model factory sized from the training dataset; no embedding type is specified.
factory = TokenModelFactory(
    ds_train.num_classes,
    ds_train.tokenizer.token_index,
    max_tokens=MAX_LEN,
    embedding_dims=EMB_DIMS,
    embedding_type=None,
)

# The same encoder instance is handed to both build_model and experiment.train.
word_encoder_model = AveragingEncoder()
model = factory.build_model(
    token_encoder_model=word_encoder_model,
    trainable_embeddings=True,
)

experiment.train(
    x=ds_train.X,
    y=ds_train.y,
    validation_data=(ds_val.X, ds_val.y),
    model=model,
    word_encoder_model=word_encoder_model,
    epochs=EPOCHS,
)


In [None]:
# NOTE(review): bare expression — it only displays the repr of the `experiment` module imported
# from texcla; this looks like leftover scratch output and can probably be deleted.
experiment