In [58]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
## Plotly
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
# Others
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import re
from nltk.stem.snowball import SnowballStemmer 

from sklearn.manifold import TSNE

In [8]:
from scipy.stats import itemfreq

In [68]:
df = pd.read_csv("Account Type Data.csv",encoding="ISO-8859-1")
del df['CA_Account_Number']
del df['CA_FS_Caption']
#del df['CA_PL_Indicator']

In [69]:
df1=df[df['CA_PL_Indicator']=="BS"]
df2=df[df['CA_PL_Indicator']=="PL"]
del df1['CA_PL_Indicator']
del df2['CA_PL_Indicator']

In [70]:
df1.head()

Unnamed: 0,CA_Account_Name,CA_Account_Type
0,Total Assets,Assets
1,Cash,Assets
2,Payroll Cash Account,Assets
3,Short Term Investments,Assets
4,Long Term Investments,Assets


In [71]:
df2.head()

Unnamed: 0,CA_Account_Name,CA_Account_Type
59,Maintenance Material,Expense
170,Sales Revenue - Earned Billed,Revenue
171,Sales Revenue - Billed,Revenue
172,Sales Revenue - Unbilled,Revenue
173,Hardware,Revenue


In [72]:
df1 = df1.dropna()
df1 = df1[df1['CA_Account_Name'].apply(lambda x: not x.isnumeric())]
df1 = df1.dropna(subset=['CA_Account_Name','CA_Account_Type'])

df2 = df2.dropna()
df2 = df2[df2['CA_Account_Name'].apply(lambda x: not x.isnumeric())]
df2 = df2.dropna(subset=['CA_Account_Name','CA_Account_Type'])

In [73]:
itemfreq(df1['CA_Account_Name'])

array([[' Payable Assets - Guarantee', 1],
       [' VAT Payable', 1],
       [' Zapasy pó?fabrykatów', 1],
       ...,
       ['Zero VAT - output', 1],
       ['Zysk niepodzielony', 1],
       ['liaison', 1]], dtype=object)

In [74]:
itemfreq(df1['CA_Account_Type'])

array([['Assets', 2351],
       ['Equity', 213],
       ['Liabilities', 1280]], dtype=object)

In [75]:
itemfreq(df2['CA_Account_Type'])

array([['Expense', 2293],
       ['Revenue', 651]], dtype=object)

In [76]:
len(df1)

3844

In [77]:
len(df2)

2944

In [78]:
labels1 = df1['CA_Account_Type'].map(lambda x : 0 if x == 'Assets' else 1 if x == 'Equity' else 2)
labels2 = df2['CA_Account_Type'].map(lambda x : 0 if x == 'Expense' else 1)

In [79]:
itemfreq(labels1)

array([[   0, 2351],
       [   1,  213],
       [   2, 1280]], dtype=int64)

In [80]:
itemfreq(labels2)

array([[   0, 2293],
       [   1,  651]], dtype=int64)

In [81]:
def words(text): 
    return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('accounting terms v1.txt',encoding='utf-8').read()))

#"Probability of `word`."
def P(word, N=sum(WORDS.values())): 
    return WORDS[word] / N

#"Most probable spelling correction for word."
def correction(word): 
    return max(candidates(word), key=P)

#"Generate possible spelling corrections for word."
def candidates(word): 
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

#"The subset of `words` that appear in the dictionary of WORDS."
def known(words): 
    return set(w for w in words if w in WORDS)

#"All edits that are one edit away from `word`."
def edits1(word):    
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

#"All edits that are two edits away from `word`."
def edits2(word): 
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))


def clean_text(text):
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    #text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)
    ## Clean the text
    
    text = re.sub(r"\bi/c\b", "intercompany ", text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)    
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"\bvat\b", "value added tax ", text)
    text = re.sub(r'\bwip\b', "work in progress ", text)
    text = re.sub(r"\bar\b", "accounts receivable ", text)
    text = re.sub(r"\bap\b", "accounts payable ", text)
    text = re.sub(r"\bcip\b", "carriage and insurance paid ", text)
    text = re.sub(r"\bfsb\b", "financial services board ", text)
    text = re.sub(r"\bemea\b", "europe middle east africa ", text)
    text = re.sub(r"\baccum\b", "accumulated ", text)
    text = re.sub(r"\bacc\b", "accumulated ", text)
    text = re.sub(r"\bprov\b", "provisions ", text)
    text = re.sub(r"\best\b", "estimated ", text)
    text = re.sub(r"\bdepr\b", "depreciation ", text)
    text = re.sub(r"\binterco\b", "intercompany ", text)
    text = re.sub(r"\bee\b", "employee ", text)
    text = re.sub(r"\bcogs\b", "cost of goods ", text)
    text = re.sub(r"\bfs\b", "financial services ", text)
    
    
    text = text.split()
    
    ## Stemming
    #stemmer = SnowballStemmer('english')
    #stemmed_words = [stemmer.stem(word) for word in text]
    
    ## Correct the spellings of text
    text = [correction(w) for w in text]
    ## Remove the stop words
    text = [w for w in text if not w in stops and len(w) >= 3]
    text = " ".join(text)
    #print(text)
    
    #text = " ".join(stemmed_words)
    return text

In [82]:
df1['CA_Account_Name'] = df1['CA_Account_Name'].map(lambda x: clean_text(x))
df2['CA_Account_Name'] = df2['CA_Account_Name'].map(lambda x: clean_text(x))

In [83]:
df1.head()

Unnamed: 0,CA_Account_Name,CA_Account_Type
0,total asset,Assets
1,cash,Assets
2,payrol cash account,Assets
3,short term invest,Assets
4,long term invest,Assets


In [84]:
df2.head()

Unnamed: 0,CA_Account_Name,CA_Account_Type
59,mainten materi,Expense
170,sale revenu earn bill,Revenue
171,sale revenu bill,Revenue
172,sale revenu unbil,Revenue
173,hardwar,Revenue


In [85]:
### Create sequence
vocabulary_size = 20000
tokenizer1 = Tokenizer(num_words= vocabulary_size)
tokenizer1.fit_on_texts(df1['CA_Account_Name'])

tokenizer2 = Tokenizer(num_words= vocabulary_size)
tokenizer2.fit_on_texts(df2['CA_Account_Name'])

In [86]:
sequences1 = tokenizer1.texts_to_sequences(df1['CA_Account_Name'])
data1 = pad_sequences(sequences1, maxlen=8)

sequences2 = tokenizer2.texts_to_sequences(df2['CA_Account_Name'])
data2 = pad_sequences(sequences2, maxlen=8)

In [87]:
sequences1

[[175, 11],
 [10],
 [106, 10, 3],
 [85, 24, 33],
 [59, 24, 33],
 [215, 56],
 [357, 184, 113, 215, 56],
 [10, 4],
 [3, 1, 70],
 [114, 1],
 [300, 1],
 [3, 1, 728],
 [26, 264, 1],
 [26, 23, 1],
 [729, 265, 1],
 [530, 23, 1],
 [1],
 [730, 1],
 [153, 4],
 [154, 197, 4],
 [266, 301, 1],
 [53, 1],
 [73, 358, 1],
 [73, 1],
 [86, 10],
 [302, 1],
 [1, 120, 133],
 [121, 34, 4],
 [10, 4],
 [37, 126, 1],
 [99, 141, 65],
 [15, 359],
 [23, 1],
 [127, 8, 1],
 [127, 8, 2],
 [87, 6],
 [8],
 [185, 8],
 [235, 8],
 [75, 28],
 [5, 14, 13],
 [155, 5, 14, 13],
 [53, 303],
 [5, 14, 35, 13],
 [155, 5, 14, 35, 13],
 [5, 35, 13],
 [155, 5, 35, 13],
 [5, 80, 13],
 [155, 5, 80, 13],
 [5, 78, 48, 13],
 [155, 5, 78, 48, 13],
 [147, 5, 13],
 [429, 5, 13],
 [93, 14, 13],
 [93, 14, 35, 13],
 [93, 35, 13],
 [93, 80, 13],
 [93, 78, 48, 13],
 [126, 14],
 [126, 80],
 [126, 78, 48],
 [126, 35],
 [5, 304, 13],
 [42],
 [21, 45],
 [60, 18],
 [49],
 [38],
 [54, 88],
 [176, 11],
 [11, 4],
 [300, 4],
 [55, 22],
 [55, 4],
 [27, 29,

In [88]:
data1[0]

array([  0,   0,   0,   0,   0,   0, 175,  11])

In [89]:
model1 = Sequential()
model1.add(Embedding(20000, 100, input_length=8))
model1.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model2 = Sequential()
model2.add(Embedding(20000, 100, input_length=8))
model2.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [90]:
## Fit the model
model1.fit(data1, np.array(labels1), validation_split=0.4, epochs=50)

Train on 2306 samples, validate on 1538 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x26fdc464748>

In [91]:
## Fit the model
model2.fit(data2, np.array(labels2), validation_split=0.4, epochs=50)

Train on 1766 samples, validate on 1178 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x26fe42e8630>