In [95]:
import pandas as pd
import numpy as np
import sqlite3
import json
import gensim
import re
from gensim.models import Word2Vec
from nltk import word_tokenize
from nltk.corpus import reuters
import spacy
# Load the spaCy 'en_core_web_lg' pipeline once, right after the imports.
# `nlp` is not referenced by any of the cells visible in this part of the notebook.
nlp = spacy.load('en_core_web_lg')

In [57]:
def load_json(df):
    """Decode every cell of ``df`` from JSON text.

    Each column is overwritten in place with the result of ``json.loads``
    applied to its values, so the caller's frame is modified. The same
    frame object is returned for convenience.
    """
    for column_name in df.columns:
        decoded_column = df[column_name].apply(json.loads)
        df[column_name] = decoded_column
    return df


def load_data(db_path='../data/DB.sqlite'):
    """Read the ``Data`` table from the SQLite database and JSON-decode it.

    Parameters
    ----------
    db_path : str, optional
        Location of the SQLite file. Defaults to the path the notebook has
        always used, relative to the notebook's working directory.

    Returns
    -------
    pandas.DataFrame
        All rows of ``Data`` without the stored ``index`` column, re-indexed
        from zero, with every cell passed through ``load_json``.
    """
    conn = sqlite3.connect(db_path)
    try:
        df_emails = pd.read_sql('SELECT * FROM Data', con=conn)
    finally:
        # Always release the file handle, even if the query fails.
        conn.close()

    # The table carries the index of the frame it was written from; discard it.
    df_emails = df_emails.drop('index', axis=1).reset_index(drop=True)

    return load_json(df_emails)


def preprocess_mail_body(x):
    """Flatten a parsed mail body into lower-cased, letters-only text.

    Parameters
    ----------
    x : dict
        Parsed body with a required ``'Mail_1'`` string and an optional
        ``'Mail_2'`` string, which is appended after a single space.

    Returns
    -------
    str
        The body with e-mail addresses and URL-like tokens removed, every
        character other than ASCII letters and whitespace dropped, and the
        remainder lower-cased.

    Raises
    ------
    KeyError
        If ``'Mail_1'`` is missing.
    """
    mail_body = x['Mail_1']
    if 'Mail_2' in x:
        mail_body = mail_body + ' ' + x['Mail_2']

    # E-mail addresses go first so the URL pattern does not eat half of them.
    # The hyphen is placed last in each class so it is a literal: the previous
    # `[\w\.-_]` was read as the range '.'..'_', which excluded '-' but
    # included '@', ':', ';', '<', '>' and '/', so hyphenated addresses were
    # only partly removed and neighbouring words could be swallowed.
    email_pattern = re.compile(r'[\w.-]+@[\w.-]+')
    text = email_pattern.sub('', mail_body)

    # Anything shaped like "<something>.<something>", with or without a
    # scheme, is treated as a URL and removed.
    url_pattern = re.compile(r'(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+')
    text = url_pattern.sub('', text)

    # Tokenize and re-join so punctuation is separated from words before the
    # non-letter characters are stripped.
    text = ' '.join(word_tokenize(text))

    # `+` rather than `*`: same result, without matching the empty string at
    # every position.
    non_letter_pattern = re.compile(r'[^A-Za-z\s]+')
    text = non_letter_pattern.sub('', text)

    return text.lower()



def preprocess_text(text):
    """Turn raw text into a list of lower-cased, letters-only tokens.

    Every character that is neither an ASCII letter nor whitespace is deleted
    (not replaced by a space), the result is lower-cased, and the string is
    split with ``word_tokenize``.
    """
    letters_only = re.sub(r'[^A-Za-z\s]*', '', text)
    return word_tokenize(letters_only.lower())

In [58]:
# Load the e-mail table and keep only the two classes studied in this notebook.
df_email = load_data()

cats_to_consider = ['1_Class_Add_Invoice', '2_Class_Payment_Query']

# .copy() detaches the filtered rows from the full frame, so the column
# assignments made in later cells do not raise SettingWithCopyWarning.
df_email = df_email.loc[df_email.CLASS.isin(cats_to_consider)].copy()

In [59]:
# Rows and columns left after restricting to the classes in cats_to_consider.
df_email.shape

(4341, 15)

In [60]:
# Collapse each parsed mail body into cleaned text, overwriting the BODY column.
df_email['BODY'] = df_email['BODY'].apply(preprocess_mail_body)

# The document to model is the subject line followed by the cleaned body.
df_email['text'] = df_email['SUBJECT'] + ' ' + df_email['BODY']

# One list of lower-cased word tokens per e-mail.
df_email['text_tokens'] = df_email['text'].apply(preprocess_text)

In [61]:
# Peek at the first few token lists to sanity-check the preprocessing.
df_email.text_tokens.head()

20    [adam, po, please, process, the, attached, inv...
21    [adp, invoice, please, process, the, attached,...
22    [brownstein, inv, please, process, the, attach...
23    [c, anon, po, amerihealth, caritas, newarkfebr...
24           [canon, performcare, bedfordfebruary, xls]
Name: text_tokens, dtype: object

In [67]:
def load_data():
    """Collect Reuters corpus documents and their categories by file-id prefix.

    Returns
    -------
    tuple
        ``(train_documents, train_categories, test_documents, test_categories)``.
        The training entries come from file ids starting with ``'training/'``
        and the test entries from ids starting with ``'test/'``. Documents are
        raw strings; each categories entry is whatever ``reuters.categories``
        returns for that file.
    """
    def _documents_and_categories(prefix):
        # Build (raw text, categories) pairs, then transpose them into two
        # parallel tuples.
        pairs = [
            (reuters.raw(file_id), reuters.categories(file_id))
            for file_id in reuters.fileids()
            if file_id.startswith(prefix)
        ]
        return zip(*pairs)

    train_documents, train_categories = _documents_and_categories('training/')
    test_documents, test_categories = _documents_and_categories('test/')

    return train_documents, train_categories, test_documents, test_categories


# Unpack the four parallel tuples produced by the loader.
(train_documents,
 train_categories,
 test_documents,
 test_categories) = load_data()

# Each categories entry is a sequence of labels; only the first label is kept
# as the document's category.
df_train = pd.DataFrame({
    'Document': train_documents,
    'Category': [labels[0] for labels in train_categories],
})
df_test = pd.DataFrame({
    'Document': test_documents,
    'Category': [labels[0] for labels in test_categories],
})

In [70]:
# Stack the train and test documents into one Series, keeping each part's
# original index. pd.concat replaces Series.append, which was deprecated and
# then removed in pandas 2.0.
all_data = pd.concat([df_train.Document, df_test.Document])

In [71]:
# Total number of documents (train + test) that will feed the embedding model.
all_data.shape

(10788,)

In [73]:
# Replace each raw document with its token list. This overwrites `all_data`, so
# the cell is not re-runnable: preprocess_text runs a regex substitution on its
# argument and would fail on the token lists produced by a previous run.
all_data = all_data.apply(preprocess_text)

In [79]:
# Train word vectors on the tokenized documents (one token list per document).
# NOTE(review): `iter` looks like the gensim 3.x spelling of the epoch count;
# gensim 4 is understood to have renamed it to `epochs` — confirm the installed
# version before upgrading.
word_vec_model = Word2Vec(sentences=all_data.values.tolist(), iter=5, window=5)

In [80]:
# Query neighbours through the model's KeyedVectors (`.wv`). Calling
# most_similar on the Word2Vec model itself is deprecated in gensim 3.x (it
# emits a DeprecationWarning) and was removed in gensim 4.
word_vec_model.wv.most_similar('pig')

  """Entry point for launching an IPython kernel.


[('steers', 0.8733002543449402),
 ('experiencing', 0.8558189868927002),
 ('landed', 0.8551946878433228),
 ('bean', 0.8502511978149414),
 ('potatoes', 0.8498648405075073),
 ('destroying', 0.849547803401947),
 ('varieties', 0.8483351469039917),
 ('hrw', 0.8461303114891052),
 ('seeds', 0.8440544605255127),
 ('rye', 0.8425315618515015)]