In [1]:
import os
import torch
from torch.utils import data
import torch.nn as nn
import torch.nn.functional as F
import json
import nltk
import string
import pandas as pd
import numpy as np
from datetime import datetime
from functools import reduce
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

In [2]:
# Location of the raw JSON dataset.
path_to_dataset = "./data.json"

# Read the file in one go and parse it; the context manager closes the handle.
with open(path_to_dataset) as fp:
    dataset = json.loads(fp.read())

# Show the top-level keys of the parsed object.
dataset.keys()

dict_keys(['Index', 'From', 'To', 'Subject', 'Cc', 'Content', 'Action'])

In [3]:
# Only the message body and its label are needed from here on.
columns_of_interest = ['Content', 'Action']
df = pd.DataFrame.from_dict(dataset)[columns_of_interest]
df.head()

Unnamed: 0,Content,Action
0,Here is our forecast,0
1,Traveling to have a business meeting takes the...,1
2,test successful. way to go!!!,1
3,"Randy, Can you send me a schedule of the salar...",0
4,Let's shoot for Tuesday at 11:45.,1


In [4]:
# Number of rows in the working frame.
df.shape[0]

100000

## Load GloVe Model

In [5]:
def load_glove_model(gloveFile):
    """Load GloVe word vectors from a whitespace-separated text file.

    Each non-empty line is expected to hold a word followed by its vector
    components.

    Parameters
    ----------
    gloveFile : str
        Path to the GloVe text file.

    Returns
    -------
    dict
        Maps each word (str) to its embedding as a 1-D float64 ``np.ndarray``.
    """
    print("Loading Glove Model")
    model = {}
    # GloVe files are distributed as UTF-8; being explicit avoids depending on
    # the platform's default encoding. `with` guarantees the handle is closed.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            word = splitLine[0]
            model[word] = np.asarray(splitLine[1:], dtype=np.float64)
    print("Done.",len(model)," words loaded!")
    return model

In [6]:
# Text file holding the pretrained GloVe vectors.
path_to_glove = "./glove.6B.50d.txt"

# word -> embedding lookup used by the later preprocessing cells.
glove = load_glove_model(gloveFile=path_to_glove)

Loading Glove Model
Done. 400000  words loaded!


# Data Preprocessing
1. Tokenization
2. Remove stopwords
3. Replace numbers with NUM and out-of-vocabulary (OOV) tokens with UNK
4. Select the 10000 most frequent tokens
5. Pad with NIL or truncate to 300 tokens
6. Split the dataset into train and dev sets at a 7:3 ratio

## Tokenization

In [7]:
# remove punctuations
# Delete every ASCII punctuation character from the message text.
# Mapping a character to None in a translation table removes it.
punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
df['Content'] = df['Content'].str.translate(punctuation_table)

In [8]:
# Plain whitespace tokenization (nltk.word_tokenize was the alternative considered).
df['raw_tokens'] = df['Content'].map(lambda message: message.split())

In [9]:
# Start the working `tokens` column from the raw whitespace tokens; later cells
# reassign `tokens` while `raw_tokens` keeps the unfiltered version.
df['tokens'] = df['raw_tokens']

## Remove StopWords

In [10]:
def _drop_stop_words(words):
    """Return the words whose lowercase form is not an English stop word."""
    return [word for word in words if word.lower() not in ENGLISH_STOP_WORDS]


# Filter each row's token list; original casing of the kept tokens is preserved.
df['tokens'] = df['tokens'].map(_drop_stop_words)

In [11]:
# Inspect the token lists after stop-word removal.
df.loc[:, 'tokens']

0                                               [forecast]
1        [Traveling, business, meeting, takes, fun, tri...
2                                  [test, successful, way]
3        [Randy, send, schedule, salary, level, schedul...
4                             [Lets, shoot, Tuesday, 1145]
                               ...                        
99995    [Forwarded, Daren, J, FarmerHOUECT, 122799, 08...
99996    [occurred, point, prior, months, allocated, vo...
99997    [AimeePGEs, numbers, correct, Aimee, Lannou, 1...
99998    [Forwarded, Daren, J, FarmerHOUECT, 122299, 06...
99999    [Vintage, 93730Julie, Meyers122099, 0227, PMTo...
Name: tokens, Length: 100000, dtype: object

## Replace number with NUM; Replace OOV with UNK

In [12]:
def replace_token(token):
    """Map a token to a placeholder when it is numeric or missing from GloVe.

    Returns '<NUM>' for tokens made up solely of digits, '<UNK>' for tokens
    whose lowercase form is not a key of the module-level `glove` dict, and
    the token itself (original casing) otherwise.
    """
    # Digits are checked first, so numeric strings never reach the GloVe lookup.
    if token.isdigit():
        return '<NUM>'
    return token if token.lower() in glove else '<UNK>'

# Apply the placeholder substitution to every token of every row.
df['tokens'] = df['tokens'].map(lambda row_tokens: list(map(replace_token, row_tokens)))

In [13]:
# Inspect the token lists after <NUM>/<UNK> substitution.
df.loc[:, 'tokens']

0                                               [forecast]
1        [Traveling, business, meeting, takes, fun, tri...
2                                  [test, successful, way]
3        [Randy, send, schedule, salary, level, schedul...
4                            [Lets, shoot, Tuesday, <NUM>]
                               ...                        
99995    [Forwarded, Daren, J, <UNK>, <NUM>, <NUM>, SUS...
99996    [occurred, point, prior, months, allocated, <U...
99997    [<UNK>, numbers, correct, Aimee, <UNK>, <NUM>,...
99998    [Forwarded, Daren, J, <UNK>, <NUM>, <NUM>, Enr...
99999    [Vintage, <UNK>, <UNK>, <NUM>, <UNK>, Daren, J...
Name: tokens, Length: 100000, dtype: object

## Select 10000 most frequent tokens

In [14]:
# Case-insensitive occurrence count of every token across the corpus.
token2freq = {}
for tokens in df['tokens']:
    for token in tokens:
        key = token.lower()
        token2freq[key] = token2freq.get(key, 0) + 1

In [15]:
# Keep the 10000 most frequent (lowercased) tokens. sorted() is stable even with
# reverse=True, so ties keep their insertion order.
tokens_by_frequency = sorted(token2freq, key=token2freq.get, reverse=True)
selected_tokens = set(tokens_by_frequency[:10000])

In [16]:
def _keep_selected(words):
    """Return the words whose lowercase form is in `selected_tokens`."""
    return [word for word in words if word.lower() in selected_tokens]


# Drop every token that fell outside the frequency cut-off.
df['tokens'] = df['tokens'].map(_keep_selected)

In [19]:
# Inspect the token lists after restricting them to the selected vocabulary.
df.loc[:, 'tokens']

0                                               [forecast]
1        [Traveling, business, meeting, takes, fun, tri...
2                                  [test, successful, way]
3        [Randy, send, schedule, salary, level, schedul...
4                            [Lets, shoot, Tuesday, <NUM>]
                               ...                        
99995    [Forwarded, Daren, J, <UNK>, <NUM>, <NUM>, SUS...
99996    [occurred, point, prior, months, allocated, <U...
99997    [<UNK>, numbers, correct, Aimee, <UNK>, <NUM>,...
99998    [Forwarded, Daren, J, <UNK>, <NUM>, <NUM>, Enr...
99999    [<UNK>, <UNK>, <NUM>, <UNK>, Daren, J, <UNK>, ...
Name: tokens, Length: 100000, dtype: object

## Pad with NIL; Truncate to 300

In [20]:
def pad_or_truncate_tokens(tokens, max_len=300, pad_token='<NIL>'):
    """Return a new list of exactly `max_len` tokens.

    Sequences longer than `max_len` are cut to their first `max_len` items;
    shorter ones are right-padded with `pad_token`.

    Parameters
    ----------
    tokens : sequence of str
        Tokens of one document.
    max_len : int, optional
        Target length (default 300).
    pad_token : str, optional
        Filler appended to short sequences (default '<NIL>').

    Returns
    -------
    list
        A list of length `max_len`; the input is not modified.

    Raises
    ------
    ValueError
        If `max_len` is negative.
    """
    if max_len < 0:
        raise ValueError("max_len must be non-negative")
    clipped = list(tokens[:max_len])
    # The multiplier is 0 when the sequence already fills max_len.
    return clipped + [pad_token] * (max_len - len(clipped))

In [21]:
# Token count per row. This is taken before padding/truncation, so values can
# be larger than the padded length.
df['lengths'] = df['tokens'].map(len)

In [22]:
# Bring every row to the same fixed length.
df['tokens'] = df['tokens'].map(pad_or_truncate_tokens)

# Word Embedding

In [23]:
# Vocabulary = every distinct token that appears in any (padded) row.
# A single union avoids rebuilding the whole set once per row, which the
# reduce-with-`|` form did.
all_tokens = set().union(*df['tokens'])

In [24]:
# Fix one explicit ordering of the vocabulary and derive both mappings from it.
# A sorted list makes the ids reproducible across runs (set iteration order
# depends on string hashing) and lets id2token be indexed by id.
id2token = sorted(all_tokens)
token2id = {token: idx for idx, token in enumerate(id2token)}

In [25]:
# Encode each padded token list as an array of vocabulary ids.
# np.int64 replaces the `np.int` alias, which was removed from NumPy.
df['token_ids'] = df['tokens'].apply(lambda tokens: np.array([token2id[token] for token in tokens], dtype=np.int64))

In [26]:
embed_dim1 = 50  # leading columns: filled from the GloVe vectors
embed_dim2 = 10  # trailing columns: filled with random values
embed_dim = embed_dim1 + embed_dim2

# Every cell is assigned in the loop below, so uninitialised storage is fine.
embedding_matrix = np.empty((len(token2id), embed_dim))

for idx, token in enumerate(id2token):
    # Choose the pretrained part of the row.
    if token == '<NIL>':
        pretrained = 0  # padding gets an all-zero pretrained part
    elif token == '<NUM>':
        pretrained = glove['num']
    elif token == '<UNK>':
        pretrained = glove['unk']
    else:
        pretrained = glove[token.lower()]
    # Slice by embed_dim1 rather than a literal so the two cannot drift apart.
    embedding_matrix[idx, :embed_dim1] = pretrained
    embedding_matrix[idx, embed_dim1:] = np.random.rand(embed_dim2)

# Split and make datasets

In [27]:
# Random permutation of the row positions 0 .. len(df)-1.
shuffle = np.random.permutation(len(df))

In [28]:
# Build the id matrix and label vector in DataFrame order first.
# Indexing a freshly allocated array with `shuffle` *before* filling it only
# copies uninitialised memory, so the rows would never actually be shuffled.
# np.int64 replaces the removed `np.int` / `np.long` aliases.
features = np.empty((len(df), 300), dtype=np.int64)
labels = np.empty(len(df), dtype=np.int64)

for i, (feature, label) in enumerate(zip(df['token_ids'], df['Action'])):
    features[i] = feature
    labels[i] = label

# Apply the same permutation to both arrays so each row keeps its label.
features = features[shuffle]
labels = labels[shuffle]

In [39]:
# Append the label as the last column of each feature row.
# NOTE: this rebinds `data`, which the import cell bound to torch.utils.data.
data = np.column_stack((features, labels))

# np.savetxt writes plain text, despite the ".npy" extension.
np.savetxt("data.npy", data)

In [42]:
# Write the embedding matrix as plain text (np.savetxt), despite the ".npy" extension.
np.savetxt("embedding.npy", embedding_matrix)

# Record the id of the padding token in its own file.
nil_id = token2id['<NIL>']
with open("freeze_id", "w") as fp:
    fp.write(f"{nil_id}")