## Steps:
 - Preprocess the text to remove the stopwords 
 - Lemmatize pos tag each word
 - Select only noun and verb POS
 - Compare synsets of each pair of word and select only the overlaping synsets
 - List out all the overlaping synsets and compute their lemmas
 - Also compute frequency of each lemma from the documents
 - Now filter out top K lemmas based on the frequency

In [34]:
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from nltk.stem import PorterStemmer
from nltk import word_tokenize, line_tokenize
import pandas as pd
import numpy as np
import re
import spacy
nlp = spacy.load('en_core_web_sm')
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import sqlite3
import json


# with open('../Data/stop_words.txt', 'r') as f:
#     function_words = line_tokenize(f.read())


function_words = stopwords.words('english')


def load_json(df):
    for col in df.columns:
        df[col] = df[col].apply(lambda x: json.loads(x))
    return df


def load_data():
    db_path = '../data/DB.sqlite'
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    
    df_emails = pd.read_sql('SELECT * FROM Data', con=conn).drop('index', axis=1).reset_index(drop=True)
    df_emails = load_json(df_emails)
    
    return df_emails


def pos_tag_text(text):
    return pos_tag(text)


def preprocess_text(text):
    pattern_1 = re.compile(r'[^A-Za-z\s]*')
    text = pattern_1.sub('', text)

    text = text.lower()

    text = word_tokenize(text)

    text = pos_tag_text(text)

    return text


def compute_overlap_score(synset, sentence):
    gloss = set(word_tokenize(synset.definition()))

    for i in synset.examples():
        gloss.union(i)

    gloss = gloss.difference(function_words)

    if isinstance(sentence, str):
        sentence = set(sentence.split(" "))

    elif isinstance(sentence, list):
        sentence = set(sentence)

    elif isinstance(sentence, set):
        pass

    else:
        return

    sentence = sentence.difference(function_words)

    return len(gloss.intersection(sentence))


def lesk(word, sentence):
    best_sense = None
    max_overlap = 0
    word = wn.morphy(word) if wn.morphy(word) is not None else word

    for sense in wn.synsets(word):
        overlap = compute_overlap_score(sense, sentence)

        for hyponym in sense.hyponyms():
            overlap += compute_overlap_score(hyponym, sentence)

        for hypernym in sense.hypernyms():
            overlap += compute_overlap_score(hypernym, sentence)

        for meronym in sense.part_meronyms():
            overlap += compute_overlap_score(meronym, sentence)

        for meronym in sense.substance_meronyms():
            overlap += compute_overlap_score(meronym, sentence)

        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense

    return best_sense


def get_all_senses(word):
    return wn.synsets(word)


def get_all_hypernyms(sense):
    return sense.hypernyms()


def merge_terms(pos_text):
    text = []

    for pos_word in pos_text:
        if 'NN' in pos_word[1] or 'JJ' in pos_word[1] or 'VB' in pos_word[1]:
            text.append(pos_word[0].lower())
        
    text = ' '.join(text)

    doc_text = nlp(text)

    text = [x.lemma_ for x in doc_text]

    # --------- Using Lesk Algorithm to find best sense of every word ------------

    word_sense_dict = {x: lesk(x, text) for x in text}

    text = np.array(text)

    # ------------- Merging terms with commons meanings --------------------------

    for i in range(len(text)-1):
        if word_sense_dict[text[i]] is not None:
            for j in range(i+1, len(text)):
                if text[i] != text[j]:
                    if word_sense_dict[text[i]] in get_all_senses(text[j]):
                        # print(f'Merged...{text[i]} and {text[j]}')
                        text = np.where(text == text[j], text[i], text)

    # ------------------- Merging terms with Hypernyms ---------------------------

    for i in range(len(text)-1):
        try:
            if word_sense_dict[text[i]] is not None:
                for j in range(i+1, len(text)):
                    try:
                        if (text[i] != text[j]) and (word_sense_dict[text[j]] is not None):
                            word_sense_i = word_sense_dict[text[i]]
                            word_sense_j = word_sense_dict[text[j]]

                            hypernyms_i = get_all_hypernyms(word_sense_i)
                            hypernyms_j = get_all_hypernyms(word_sense_j)

                            if word_sense_i in hypernyms_j:
                                # print(f'{text[i]} is a Hypernym of {text[j]}')
                                text = np.where(text == text[j], text[i], text)

                            elif word_sense_j in hypernyms_i:
                                # print(f'{text[j]} is a Hypernym of {text[i]}')
                                text = np.where(text == text[i], text[j], text)

                            elif len(set(hypernyms_i).intersection(set(hypernyms_j)))>0:
                                hypernym_lemma = set(hypernyms_i).intersection(set(hypernyms_j)
                                                                               ).pop().lemmas()[0].name()
                                
                                hypernym_synset = set(hypernyms_i).intersection(set(hypernyms_j)).pop()

                                # print(f'{text[i]} and {text[j]} have common hypernyms: {hypernym_lemma}')

                                text = np.where((text == text[j]) | (text == text[i]), hypernym_lemma, text)
                                
                                word_sense_dict[hypernym_lemma] = hypernym_synset
                                
                    except KeyError as ke:
                        continue

        except KeyError as ke:
            continue

    return pd.Series({'preprocessed_text':' '.join(text), 'word_sense_dict':word_sense_dict})


In [2]:
def build_tfidf_matrix(corpus):
    tfidf_vectorizer = TfidfVectorizer()
    count_vectorizer = CountVectorizer()
    
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    count_matrix = count_vectorizer.fit_transform(corpus)

    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    count_feature_names = count_vectorizer.get_feature_names()
    
    df_matrix_tfidf = pd.DataFrame(tfidf_matrix.todense(), columns=tfidf_feature_names)
    df_matrix_count = pd.DataFrame(count_matrix.todense(), columns=count_feature_names)

    return df_matrix_tfidf, df_matrix_count, tfidf_vectorizer, count_vectorizer


In [3]:
def preprocess_mail_body(x):
    mail_body =  x['Mail_1']
    
    if 'Mail_2' in x.keys():
        mail_body = mail_body + ' ' + x['Mail_2']
        
    pattern_1 = re.compile(r'[\w\.-_]+@[\w\.-_]+')
    
    text = pattern_1.sub('', mail_body)
    
    pattern_2 = re.compile(r'(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+')
    
    text = pattern_2.sub('', text)
    
    text = ' '.join(word_tokenize(text))
        
    pattern_3 = re.compile(r'[^A-Za-z\s]*')
    
    text = pattern_3.sub('', text)

    text = text.lower()

    return text

# Loading Data

In [37]:
df_email = load_data()

cats_to_consider = cats_to_consider = ['1_Class_Add_Invoice', '2_Class_Payment_Query']

df_email = df_email.loc[df_email.CLASS.isin(cats_to_consider)]

df_email.reset_index(inplace=True, drop=True)

df_email = df_email.head(50)

# Preprocessing and Merging terms using Lesk Algorithm

In [38]:
df_email['BODY'] = df_email.BODY.apply(preprocess_mail_body)

In [39]:
df_email['text'] = df_email.SUBJECT + ' ' + df_email.BODY

In [40]:
df_email['pos_text'] = df_email.text.apply(preprocess_text)

In [41]:
df_email = pd.concat([df_email, pd.DataFrame(df_email.pos_text.apply(merge_terms))], axis=1)

In [83]:
df_tfidf, df_count, tfidf_vectorizer, count_vectorizer = build_tfidf_matrix(df_email.preprocessed_text)

In [84]:
df_email.shape, df_tfidf.shape

((50, 19), (50, 210))

# Feature reduction using path similarity measure from wordnet

In [111]:
def compute_distance(xi, xj, word_sense_dict):
    word_sense_i = word_sense_dict[xi]
    word_sense_j = word_sense_dict[xj]
    
    if (word_sense_i is None) or (word_sense_j is None):
        return None
    
    return wn.path_similarity(word_sense_i, word_sense_j, simulate_root=False)


def update_tfidf_scores(xi, xj, index, distance):
    try:
        tfidf_xi = df_tfidf.loc[index, xi]
        tfidf_xj = df_tfidf.loc[index, xj]


        tfidf_xi, tfidf_xj = np.array([tfidf_xi, tfidf_xj]) * (1 - distance)

        df_tfidf.loc[index, xi] = tfidf_xi
        df_tfidf.loc[index, xj] = tfidf_xj

    except:
        pass
    
    return 


scores_list = []


def compute_path_based_similarity(x):
    index = x['index']
    text = x.preprocessed_text
    x_tokens = word_tokenize(text)
    word_sense_dict = x.word_sense_dict
    threshold = 1
    
    dropped_tokens = []
    
    for i in range(len(x_tokens)-1):
        token_is_related = False
        
        xi = x_tokens[i]
        
        for j in range(i+1, len(x_tokens)):
            xj = x_tokens[j]
            
            if xi != xj:
                distance = compute_distance(xi, xj, word_sense_dict)
                                
                if (distance is not None) and (distance <= threshold):
                    token_is_related = True
                    
                    print(f'{xi} :: {xj} with score {distance}')
                    
                    update_tfidf_scores(xi, xj, index, distance)
        
        if not token_is_related:
            pattern = re.compile(fr'\b{xi}\b')
            
            text = pattern.sub('', text)
            
            dropped_tokens.append(xi)
            
    print(f'Dropped the following words: {dropped_tokens}')
    
    return text


In [114]:
# df_email.reset_index().apply(compute_path_based_similarity, axis=1)

# Using Chi-Square to compute best features for each category

In [13]:
df_count['Category'] = df_email.CLASS.values

In [14]:
df_count.head()

Unnamed: 0,aamerican,aamva,ab,aba,abatement,abc,ability,able,abm,above,...,zpos,zpr,zrp,zsn,zuora,zuras,zurich,zurora,zycus,Category
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1_Class_Add_Invoice
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1_Class_Add_Invoice
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1_Class_Add_Invoice
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1_Class_Add_Invoice
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1_Class_Add_Invoice


In [15]:
df_term_category = df_count.groupby('Category').sum().T

In [16]:
N = df_term_category.sum().sum()

In [17]:
N_j_dot = df_term_category.sum(axis=1)

In [18]:
N_dot_k = df_term_category.sum(axis=0)

In [19]:
def sign(a, b):
    return 1 if a>=b else -1

def compute_feature_contribution(njk, nj, nk, N):
    fjk = njk/N
    fjfk = nj*nk/N**2
    
    X2 = (fjk - fjfk)**2/(fjfk)*sign(fjk, fjfk)
    
    return X2

In [20]:
df_chi_sq = pd.DataFrame(index=df_term_category.index, columns=df_term_category.columns)

In [21]:
for term in df_term_category.index:
    for cat in df_term_category.columns:
        df_chi_sq[cat][term] = compute_feature_contribution(df_term_category[cat][term], N_j_dot[term], N_dot_k[cat], N)

In [22]:
df_chi_sq.head()

Category,1_Class_Add_Invoice,2_Class_Payment_Query
aamerican,4.12431e-08,-8.25901e-07
aamva,1.23729e-07,-2.4777e-06
ab,4.12431e-08,-8.25901e-07
aba,-1.30436e-06,2.612e-05
abatement,1.37477e-08,-2.753e-07


In [23]:
for cat in df_chi_sq.columns:
    print(cat, ':', df_chi_sq[cat].sort_values(ascending=False)[:10].index)

1_Class_Add_Invoice : Index(['communication', 'intend', 'attachment', 'recipient', 'sender',
       'delete', 'dissemination', 'privilege', 'reader', 'confidential'],
      dtype='object')
2_Class_Payment_Query : Index(['gregorian_calendar_month', 'give', 'status', 'update', 'due',
       'payment', 'usd', 're', 'receive', 'get'],
      dtype='object')


In [24]:
df_term_category = df_term_category.T

# Converting the count matrix to tfidf matrix

In [25]:
df_term_category = pd.DataFrame(df_term_category.values/ df_term_category.sum(axis=1).values.reshape(-1, 1), columns=df_term_category.columns, 
                               index=df_term_category.index)

In [26]:
df_term_category = pd.DataFrame(df_term_category.values * np.log(df_term_category.shape[0]/(df_term_category>0).sum()).values.reshape(1, -1), columns=df_term_category.columns,
            index=df_term_category.index)

# Generating category vector for top K categories

In [27]:
df_chi_sq.head()

Category,1_Class_Add_Invoice,2_Class_Payment_Query
aamerican,4.12431e-08,-8.25901e-07
aamva,1.23729e-07,-2.4777e-06
ab,4.12431e-08,-8.25901e-07
aba,-1.30436e-06,2.612e-05
abatement,1.37477e-08,-2.753e-07


In [28]:
def generate_weighted_vector_cat(df_term_cat, df_chi_sq, k):
    cat_vec = {}
    
    for cat in df_term_cat.index:
        top_k_cats = df_chi_sq[cat].sort_values(ascending=False)[:k].index.tolist()
        cat_vec[cat] = (top_k_cats, df_term_cat.loc[cat, top_k_cats].values.tolist())
    
    return cat_vec

In [29]:
cat_vecs = generate_weighted_vector_cat(df_term_category, df_chi_sq, 800)

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

# Class prediction using cosine similarity

In [31]:
def predict_category(x, cat_vectors):
    prediction = None
    best_score = -1
    
    for cat in cat_vectors.keys():
        top_terms = cat_vectors[cat][0]
        cat_vec = np.array(cat_vectors[cat][1]).reshape(1, -1)
        
        x_cat = x[top_terms].values.reshape(1, -1)
        
        score = cosine_similarity(x_cat, cat_vec)
        
        if score > best_score:
            best_score = score
            prediction = cat
    
    return prediction

In [32]:
df_email['prediction'] = df_tfidf.apply(predict_category, axis=1, cat_vectors=cat_vecs).values

In [33]:
from sklearn.metrics import classification_report

In [34]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_email.CLASS = le.fit_transform(df_email.CLASS)
df_email.prediction = le.transform(df_email.prediction)

In [35]:
from sklearn.metrics import f1_score, recall_score

In [36]:
f1_score(df_email.CLASS, df_email.prediction, average=None)

array([0.99428299, 0.83216783])

In [37]:
f1_score(df_email.CLASS, df_email.prediction, average='macro')

0.9132254120343687

In [38]:
recall_score(df_email.CLASS, df_email.prediction, average=None)

array([1.        , 0.71257485])

In [79]:
df_email.columns

Index(['FILENAME', 'DATE', 'FROM', 'TO', 'CC', 'BCC', 'SUBJECT', 'BODY',
       'GREETING', 'SIGNATURE', 'ATTACHMENT_FOUND', 'ATTACHMENTS_DETAILS',
       'INVOICE_NO', 'CLASS', 'CUSTOMER', 'text', 'pos_text',
       'preprocessed_text', 'prediction'],
      dtype='object')

# ML based classification

In [49]:
def top_k_terms(df_term_cat, df_chi_sq, k):
    top_cats = []
    
    for cat in df_term_cat.index:
        top_cats = top_cats + df_chi_sq[cat].sort_values(ascending=False)[:k].index.tolist()
    
    return top_cats

In [50]:
terms_to_consider = top_k_terms(df_term_category, df_chi_sq, 500)

In [51]:
from sklearn.model_selection import train_test_split

In [52]:
X_train, X_test, y_train, y_test = train_test_split(df_tfidf.loc[:, terms_to_consider], df_email.CLASS, test_size=0.2,
                                                   stratify = df_email.CLASS)

In [53]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [62]:
# model = RandomForestClassifier(n_estimators=100, max_depth=9, n_jobs=-1, random_state=123, class_weight='balanced')
model = LogisticRegression(class_weight='balanced', multi_class='ovr')

In [63]:
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [64]:
preds = model.predict(X_test)

In [65]:
f1_score(y_test, preds, average=None)

array([0.9786975 , 0.63157895])