In [94]:
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from nltk.stem import PorterStemmer
from nltk import word_tokenize, line_tokenize
import pandas as pd
import numpy as np
import re
import spacy
nlp = spacy.load('en_core_web_sm')
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import sqlite3
import json


with open('../Data/stop_words.txt', 'r') as f:
    function_words = line_tokenize(f.read())
    

# function_words = stopwords.words('english')


def load_json(df):
    for col in df.columns:
        df[col] = df[col].apply(lambda x: json.loads(x))
    return df


def load_data():
    db_path = '../data/DB.sqlite'
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    
    df_emails = pd.read_sql('SELECT * FROM Data', con=conn).drop('index', axis=1).reset_index(drop=True)
    df_emails = load_json(df_emails)
    
    return df_emails


def pos_tag_text(text):
    return pos_tag(text)


def preprocess_text(text):
    pattern_1 = re.compile(r'[^A-Za-z\s]*')
    text = pattern_1.sub('', text)
#     text = ' '.join(x for x in text.split() if not any(c.isdigit() for c in x))

    text = text.lower()

    text = word_tokenize(text)

    text = pos_tag_text(text)

    return text


def compute_overlap_score(synset, sentence):
    gloss = set(word_tokenize(synset.definition()))

    for i in synset.examples():
        gloss.union(i)

    gloss = gloss.difference(function_words)

    if isinstance(sentence, str):
        sentence = set(sentence.split(" "))

    elif isinstance(sentence, list):
        sentence = set(sentence)

    elif isinstance(sentence, set):
        pass

    else:
        return

    sentence = sentence.difference(function_words)

    return len(gloss.intersection(sentence))


def lesk(word, sentence):
    best_sense = None
    max_overlap = 0
    word = wn.morphy(word) if wn.morphy(word) is not None else word

    for sense in wn.synsets(word, pos=[wn.NOUN, wn.VERB]):
        overlap = compute_overlap_score(sense, sentence)

        for hyponym in sense.hyponyms():
            overlap += compute_overlap_score(hyponym, sentence)

        for hypernym in sense.hypernyms():
            overlap += compute_overlap_score(hypernym, sentence)

        for meronym in sense.part_meronyms():
            overlap += compute_overlap_score(meronym, sentence)

        for meronym in sense.substance_meronyms():
            overlap += compute_overlap_score(meronym, sentence)

        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense

    return best_sense


def get_all_senses(word):
    return wn.synsets(word, pos=[wn.NOUN, wn.VERB])


def get_all_hypernyms(sense):
    return sense.hypernyms()


def merge_terms(pos_text):
    text = []

    for pos_word in pos_text:
        if ('NN' in pos_word[1] or 'JJ' in pos_word[1] or 'VB' in pos_word[1]) and (pos_word[0] not in function_words):
            text.append(pos_word[0].lower())
        
#         if 'NN' in pos_word[1] or 'VB' in pos_word[1]:
#             text.append(pos_word[0].lower())

    text = ' '.join(text)

    doc_text = nlp(text)

    text = [x.lemma_ for x in doc_text]

    # --------- Using Lesk Algorithm to find best sense of every word ------------

    word_sense_dict = {x: lesk(x, text) for x in text}

    text = np.array(text)

    # ------------- Merging terms with commons meanings --------------------------

    for i in range(len(text)-1):
        if word_sense_dict[text[i]] is not None:
            for j in range(i+1, len(text)):
                if text[i] != text[j]:
                    if word_sense_dict[text[i]] in get_all_senses(text[j]):
#                         print(f'Merged...{text[i]} and {text[j]}')
                        
                        text = np.where(text == text[j], text[i], text)
                            
                        text = np.array(list(text) + [x.name() for x in word_sense_dict[text[i]].lemmas()])
                        
    # ------------------- Merging terms with Hypernyms ---------------------------

    for i in range(len(text)-1):
        try:
            if word_sense_dict[text[i]] is not None:
                for j in range(i+1, len(text)):
                    try:
                        if (text[i] != text[j]) and (word_sense_dict[text[j]] is not None):
                            word_sense_i = word_sense_dict[text[i]]
                            word_sense_j = word_sense_dict[text[j]]

                            hypernyms_i = get_all_hypernyms(word_sense_i)
                            hypernyms_j = get_all_hypernyms(word_sense_j)

                            if word_sense_i in hypernyms_j:
#                                 print(f'{text[i]} is a Hypernym of {text[j]}')
                                text = np.where(text == text[j], text[i], text)
                                
                                # Appending all the lemmas of the matched hypernym to the original text
                                text = np.array(list(text) + [x.name() for x in word_sense_i.lemmas()])

                            elif word_sense_j in hypernyms_i:
#                                 print(f'{text[j]} is a Hypernym of {text[i]}')
                                text = np.where(text == text[i], text[j], text)
                                
                                text = np.array(list(text) + [x.name() for x in word_sense_j.lemmas()])

                            elif len(set(hypernyms_i).intersection(set(hypernyms_j)))>0:
#                                 hypernym_lemma = set(hypernyms_i).intersection(set(hypernyms_j)
#                                                                                ).pop().lemmas()[0].name()

#                                 print(f'{text[i]} and {text[j]} have common hypernyms: {hypernym_lemma}')

#                                 text = np.where((text == text[j]) | (text == text[i]), hypernym_lemma, text)

                                hypernym_lemmas = set(hypernyms_i).intersection(set(hypernyms_j)
                                                                               ).pop().lemmas()
                                
                                hypernym_lemmas = [x.name() for x in hypernym_lemmas]
            
                                text = np.array(list(text) + hypernym_lemmas)
                                
                    except KeyError as ke:
                        continue

        except KeyError as ke:
            continue

    return ' '.join(text)


In [95]:
def build_tfidf_matrix(corpus):
    tfidf_vectorizer = TfidfVectorizer()
    count_vectorizer = CountVectorizer()
    
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    count_matrix = count_vectorizer.fit_transform(corpus)

    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    count_feature_names = count_vectorizer.get_feature_names()
    
    df_matrix_tfidf = pd.DataFrame(tfidf_matrix.todense(), columns=tfidf_feature_names)
    df_matrix_count = pd.DataFrame(count_matrix.todense(), columns=count_feature_names)

    return df_matrix_tfidf, df_matrix_count


In [96]:
def preprocess_mail_body(x):
    mail_body =  x['Mail_1']
    
    if 'Mail_2' in x.keys():
        mail_body = mail_body + ' ' + x['Mail_2']
        
    pattern_1 = re.compile(r'[\w\.-_]+@[\w\.-_]+')
    
    text = pattern_1.sub('', mail_body)
    
    pattern_2 = re.compile(r'(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+')
    
    text = pattern_2.sub('', text)
    
    text = ' '.join(word_tokenize(text))
        
    pattern_3 = re.compile(r'[^A-Za-z\s]*')
    
    text = pattern_3.sub('', text)
    
#     text = ' '.join(x for x in text.split() if not any(c.isdigit() for c in x))

    text = text.lower()

    return text

In [97]:
df_email = load_data()

cats_to_consider = cats_to_consider = ['1_Class_Add_Invoice', '2_Class_Payment_Query']

df_email = df_email.loc[df_email.CLASS.isin(cats_to_consider)]

In [98]:
df_email['BODY'] = df_email.BODY.apply(preprocess_mail_body)

In [99]:
df_email['text'] = df_email.SUBJECT + ' ' + df_email.BODY

In [100]:
df_email['pos_text'] = df_email.text.apply(preprocess_text)

In [101]:
df_email['preprocessed_text'] = df_email.pos_text.apply(merge_terms)

In [102]:
df_tfidf, df_count = build_tfidf_matrix(df_email.preprocessed_text)

In [103]:
df_tfidf.head()

Unnamed: 0,24,aamerican,aamva,ab,aba,abatement,abc,ability,able,abm,...,zpopdf,zpos,zpr,zrp,zsn,zuora,zuras,zurich,zurora,zycus
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [104]:
df_count.head()

Unnamed: 0,24,aamerican,aamva,ab,aba,abatement,abc,ability,able,abm,...,zpopdf,zpos,zpr,zrp,zsn,zuora,zuras,zurich,zurora,zycus
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Using Chi-Square to compute best features for each category

In [105]:
from sklearn.feature_selection import SelectKBest, chi2

In [106]:
df_count['Category'] = df_email.CLASS.values

In [107]:
df_count.head()

Unnamed: 0,24,aamerican,aamva,ab,aba,abatement,abc,ability,able,abm,...,zpos,zpr,zrp,zsn,zuora,zuras,zurich,zurora,zycus,Category
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1_Class_Add_Invoice
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1_Class_Add_Invoice
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1_Class_Add_Invoice
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1_Class_Add_Invoice
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1_Class_Add_Invoice


In [108]:
df_term_category = df_count.groupby('Category').sum().T

In [109]:
N = df_term_category.sum().sum()

In [110]:
N_j_dot = df_term_category.sum(axis=1)

In [111]:
N_dot_k = df_term_category.sum(axis=0)

In [112]:
def sign(a, b):
    return 1 if a>=b else -1

def compute_feature_contribution(njk, nj, nk, N):
    fjk = njk/N
    fjfk = nj*nk/N**2
    
    X2 = (fjk - fjfk)**2/(fjfk)*sign(fjk, fjfk)

    return X2

In [113]:
df_chi_sq = pd.DataFrame(index=df_term_category.index, columns=df_term_category.columns)

In [114]:
for term in df_term_category.index:
    for cat in df_term_category.columns:
        df_chi_sq[cat][term] = compute_feature_contribution(df_term_category[cat][term], N_j_dot[term], N_dot_k[cat], N)

In [115]:
df_chi_sq.head()

Category,1_Class_Add_Invoice,2_Class_Payment_Query
24,-2.43334e-06,4.01417e-05
aamerican,5.21735e-08,-8.60685e-07
aamva,1.56521e-07,-2.58205e-06
ab,5.21735e-08,-8.60685e-07
aba,-8.56595e-07,1.41309e-05


In [116]:
for cat in df_chi_sq.columns:
    print(cat, ':', df_chi_sq[cat].sort_values(ascending=False)[:10].index)

1_Class_Add_Invoice : Index(['communication', 'intend', 'attachment', 'recipient', 'sender',
       'delete', 'privilege', 'dissemination', 'confidential', 'reader'],
      dtype='object')
2_Class_Payment_Query : Index(['gregorian_calendar_month', 'ensure', 'mean', 'think_of',
       'have_in_mind', 'give', 'decision_maker', 'total', 'activity', 'ask'],
      dtype='object')


In [117]:
df_term_category = df_term_category.T

# Converting the count matrix to tfidf matrix

In [118]:
df_term_category = pd.DataFrame(df_term_category.values/ df_term_category.sum(axis=1).values.reshape(-1, 1), 
                                columns=df_term_category.columns, index=df_term_category.index)

In [119]:
df_term_category = pd.DataFrame(df_term_category.values * np.log(df_term_category.shape[0]/(df_term_category>0).sum()
                                                                ).values.reshape(1, -1), columns=df_term_category.columns, 
                                index=df_term_category.index)

# Generating category vector for top K categories

In [120]:
df_chi_sq.head()

Category,1_Class_Add_Invoice,2_Class_Payment_Query
24,-2.43334e-06,4.01417e-05
aamerican,5.21735e-08,-8.60685e-07
aamva,1.56521e-07,-2.58205e-06
ab,5.21735e-08,-8.60685e-07
aba,-8.56595e-07,1.41309e-05


In [121]:
def generate_weighted_vector_cat(df_term_cat, df_chi_sq, k):
    cat_vec = {}
    
    for cat in df_term_cat.index:
        top_k_cats = df_chi_sq[cat].sort_values(ascending=False)[:k].index.tolist()
        cat_vec[cat] = (top_k_cats, df_term_cat.loc[cat, top_k_cats].values.tolist())
    
    return cat_vec

In [122]:
cat_vecs = generate_weighted_vector_cat(df_term_category, df_chi_sq, 1000)

In [123]:
from sklearn.metrics.pairwise import cosine_similarity

# Class prediction using cosine similarity

In [124]:
def predict_category(x, cat_vectors):
    prediction = None
    best_score = -1
    
    for cat in cat_vectors.keys():
        top_terms = cat_vectors[cat][0]
        cat_vec = np.array(cat_vectors[cat][1]).reshape(1, -1)
        
        x_cat = x[top_terms].values.reshape(1, -1)
        
        score = cosine_similarity(x_cat, cat_vec)
        
        if score > best_score:
            best_score = score
            prediction = cat
    
    return prediction

In [125]:
df_email['prediction'] = df_tfidf.apply(predict_category, axis=1, cat_vectors=cat_vecs).values

In [126]:
from sklearn.metrics import classification_report

In [127]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_email.CLASS = le.fit_transform(df_email.CLASS)
df_email.prediction = le.transform(df_email.prediction)

In [128]:
from sklearn.metrics import f1_score, recall_score

In [129]:
f1_score(df_email.CLASS, df_email.prediction, average=None)

array([0.99463839, 0.84429066])

In [130]:
f1_score(df_email.CLASS, df_email.prediction, average='macro')

0.9194645232866241

In [131]:
recall_score(df_email.CLASS, df_email.prediction, average=None)

array([1.        , 0.73053892])

In [132]:
df_email.loc[df_email.CLASS!=df_email.prediction]

Unnamed: 0,FILENAME,DATE,FROM,TO,CC,BCC,SUBJECT,BODY,GREETING,SIGNATURE,ATTACHMENT_FOUND,ATTACHMENTS_DETAILS,INVOICE_NO,CLASS,CUSTOMER,text,pos_text,preprocessed_text,prediction
5308,Class_2_PaymentQuery_CDK_Dated_22-08-2019_13.0...,{'Mail_1': 'Tue May 21 12:05:55 IST 2019'},"{'Mail_1': ['Di Cicco, Ernest (HQP) <Ernest.Di...","{'Mail_1': ['accountspayable@cdk.com', '']}",{'Mail_1': []},{'Mail_1': []},"100671 - CDK Global, Inc. Invoice 20455206 - ...",the reason for my emailing you today is to con...,"{'Mail_1': ['Hi AP,']}",{'Mail_1': 'Best Regards Ernest Di Cicco Credi...,True,"[{'fileByteArray': None, 'fileName': 'image001...",,1,CDK,"100671 - CDK Global, Inc. Invoice 20455206 - ...","[(cdk, NN), (global, JJ), (inc, NN), (invoice,...",cdk global invoice project reason email today ...,0
5311,Class_2_PaymentQuery_CDK_Dated_22-08-2019_13.0...,"{'Mail_2': 'Tuesday, July 2, 2019 4:25 PM', 'M...","{'Mail_2': ['Alurkar, Onkar'], 'Mail_1': ['Alu...","{'Mail_2': ['AccountsPayable &lt', 'AccountsPa...","{'Mail_2': ['Narawade, Pankaj &lt', 'Pankaj.Na...","{'Mail_2': [], 'Mail_1': []}",RE: Process payment of invoice# 39513,please could you provide an update on payment ...,"{'Mail_2': ['Hi Team,'], 'Mail_1': ['Hi Team,']}",{'Mail_2': 'Thanks amp Regards Onkar Alurkar ...,True,"[{'fileByteArray': None, 'fileName': 'ZPO20520...",,1,CDK,RE: Process payment of invoice# 39513 please c...,"[(re, NN), (process, NN), (payment, NN), (of, ...",process payment invoice provide update payment...,0
5316,Class_2_PaymentQuery_CDK_Dated_22-08-2019_13.0...,"{'Mail_2': 'Thursday, June 06, 2019 12:51 AM',...","{'Mail_2': ['Oberg, Donna'], 'Mail_1': ['Oberg...","{'Mail_2': ['AccountsPayable'], 'Mail_1': ['Ac...","{'Mail_2': [], 'Mail_1': []}","{'Mail_2': [], 'Mail_1': []}",FW: Stoel 4115728.pdf,i do not see that this invoice has been paid ...,"{'Mail_2': [], 'Mail_1': []}","{'Mail_2': '', 'Mail_1': ''}",True,"[{'fileByteArray': None, 'fileName': 'Stoel 41...",,1,CDK,FW: Stoel 4115728.pdf i do not see that this i...,"[(fw, NN), (stoel, NN), (pdf, NN), (i, NN), (d...",fw stoel pdf invoice pay advise donna r senior...,0
5322,Class_2_PaymentQuery_CDK_Dated_22-08-2019_13.0...,{'Mail_1': 'Tue May 21 12:07:19 IST 2019'},"{'Mail_1': ['Accounts_Receivable, Edmunds <acc...","{'Mail_1': ['Chadala.Joshua@cdk.com', '']}","{'Mail_1': ['AccountsPayable@cdk.com', '']}",{'Mail_1': []},Re: 2018 Past Due,i am following up on these past due invoices ...,"{'Mail_1': ['Hi Team,']}","{'Mail_1': 'Thanks Marie On Wed, May 8, 2019 a...",False,,,1,CDK,Re: 2018 Past Due i am following up on these p...,"[(re, NN), (past, IN), (due, JJ), (i, NN), (am...",follow past invoice provide update possible,0
5325,Class_2_PaymentQuery_CDK_Dated_22-08-2019_13.0...,"{'Mail_2': 'Tuesday, April 23, 2019 11:21 AM',...","{'Mail_2': ['Durkin, John'], 'Mail_1': ['Faye ...","{'Mail_2': ['Faye Messenger', 'AccountsPayable...","{'Mail_2': ['Naden Judilla', 'dfmar STS'], 'Ma...","{'Mail_2': [], 'Mail_1': []}",RE: CDK Global inv#9468593189,well april th would be net from feb th some...,"{'Mail_2': [], 'Mail_1': []}","{'Mail_2': '', 'Mail_1': 'Hi John and team I r...",True,"[{'fileByteArray': None, 'fileName': 'image001...",,1,CDK,RE: CDK Global inv#9468593189 well april th w...,"[(re, NN), (cdk, NN), (global, JJ), (inv, NN),...",cdk global inv april th net feb th add end get...,0
5328,Class_2_PaymentQuery_CDK_Dated_22-08-2019_13.0...,{'Mail_1': 'Wed Sep 11 13:11:20 IST 2019'},"{'Mail_1': ['VinAudit.com, Inc. <quickbooks@no...","{'Mail_1': ['AccountsPayable@cdk.com', 'cvr-al...",{'Mail_1': []},{'Mail_1': []},Reminder: Invoice CVR-20190707 from VinAudit.c...,inc dear computerized vehicle registration c...,{'Mail_1': []},{'Mail_1': 'Thanks for your business VinAudit....,True,"[{'fileByteArray': None, 'fileName': 'Invoice_...",,1,CDK,Reminder: Invoice CVR-20190707 from VinAudit.c...,"[(reminder, NN), (invoice, NN), (cvr, NN), (fr...",reminder invoice cvr vinauditcom dear computer...,0
5332,Class_2_PaymentQuery_CDK_Dated_22-08-2019_13.0...,"{'Mail_2': 'Friday, May 17, 2019 6:52 AM', 'Ma...",{'Mail_2': ['Ty D'amore [mailto:tdamore@audioe...,"{'Mail_2': ['Cordova, Angie'], 'Mail_1': ['Acc...","{'Mail_2': [], 'Mail_1': []}","{'Mail_2': [], 'Mail_1': []}",FW: CDK invoice,can someone take a look at zpr please the ve...,"{'Mail_2': ['Hi Angie,'], 'Mail_1': []}",{'Mail_2': 'Thanks Ty Ty d'amore Director of S...,False,,,1,CDK,FW: CDK invoice can someone take a look at zpr...,"[(fw, NN), (cdk, NN), (invoice, NN), (can, MD)...",fw cdk invoice look zpr vendor tell receive pa...,0
5333,Class_2_PaymentQuery_CDK_Dated_22-08-2019_13.0...,{'Mail_1': 'Wed Sep 11 13:10:59 IST 2019'},"{'Mail_1': ['Accounts_Receivable, Edmunds <acc...","{'Mail_1': ['ap-digital@cdk.com', '']}","{'Mail_1': ['KHuynh@edmunds.com', 'stephenp@ed...",{'Mail_1': []},Re: 2018 Past Due,please confirm the next payment to be processe...,"{'Mail_1': ['Hi Ramesh,']}","{'Mail_1': 'Thanks Marie On Mon, Jun 24, 2019 ...",False,,,1,CDK,Re: 2018 Past Due please confirm the next paym...,"[(re, NN), (past, IN), (due, JJ), (please, NN)...",confirm payment process invoice number pay rec...,0
5336,Class_2_PaymentQuery_CDK_Dated_22-08-2019_13.0...,"{'Mail_2': 'mercredi 26 juin 2019 12:34', 'Mai...","{'Mail_2': ['AccountsPayable'], 'Mail_1': ['MU...","{'Mail_2': ['MULAMBA, Madeleine'], 'Mail_1': [...","{'Mail_2': ['McCrary, Adam', 'Pomazal, Bruce']...","{'Mail_2': [], 'Mail_1': []}",FW: 953061 FR : Level 2 : Notice of Intent to ...,we have not received any po from you you will...,"{'Mail_2': ['Hi Madeleine,'], 'Mail_1': ['Hell...","{'Mail_2': 'Thank you Regards, Vennela. -----...",True,"[{'fileByteArray': None, 'fileName': 'ALTMP_IS...",,1,CDK,FW: 953061 FR : Level 2 : Notice of Intent to ...,"[(fw, JJ), (fr, JJ), (level, NN), (notice, NN)...",fw fr level notice intent block sap cdk global...,0
5338,Class_2_PaymentQuery_CDK_Dated_22-08-2019_13.0...,"{'Mail_2': 'Tuesday, May 28, 2019 12:36 PM', '...","{'Mail_2': ['Villa, Claudia (CAI - North Hills...","{'Mail_2': ['Accountspayable@cdk.com'], 'Mail_...","{'Mail_2': ['Roy, Meghna', 'Mastroianni, Jill ...","{'Mail_2': [], 'Mail_1': []}",DEALERTRACK- ACCOUNT # 1151237,please provide payment information regarding a...,"{'Mail_2': ['Good afternoon,'], 'Mail_1': ['Go...",{'Mail_2': 'Thank you Claudia Pay on line at ...,True,"[{'fileByteArray': None, 'fileName': 'image001...",,1,CDK,DEALERTRACK- ACCOUNT # 1151237 please provide...,"[(dealertrack, NN), (account, NN), (please, NN...",dealertrack account provide payment informatio...,0


In [133]:
df_email.columns

Index(['FILENAME', 'DATE', 'FROM', 'TO', 'CC', 'BCC', 'SUBJECT', 'BODY',
       'GREETING', 'SIGNATURE', 'ATTACHMENT_FOUND', 'ATTACHMENTS_DETAILS',
       'INVOICE_NO', 'CLASS', 'CUSTOMER', 'text', 'pos_text',
       'preprocessed_text', 'prediction'],
      dtype='object')

# ML based classification

In [134]:
def top_k_terms(df_term_cat, df_chi_sq, k):
    top_cats = []
    
    for cat in df_term_cat.index:
        top_cats = top_cats + df_chi_sq[cat].sort_values(ascending=False)[:k].index.tolist()
    
    return top_cats

In [135]:
terms_to_consider = top_k_terms(df_term_category, df_chi_sq, 100)

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
X_train, X_test, y_train, y_test = train_test_split(df_tfidf.loc[:, terms_to_consider], df_email.CLASS, test_size=0.2,
                                                   stratify = df_email.CLASS)

In [50]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [51]:
# model = RandomForestClassifier(n_estimators=100, max_depth=9, n_jobs=-1, random_state=123, class_weight='balanced')
model = LogisticRegression(class_weight='balanced', multi_class='ovr')

In [52]:
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [53]:
preds = model.predict(X_test)

In [54]:
f1_score(y_test, preds, average=None)

array([0.97439024, 0.57142857])