In [1]:
import nltk
import pickle
import numpy as np
import pandas as pd
nltk.download('brown')
from scipy.sparse import hstack
import re, difflib
from nltk.tokenize import word_tokenize

def strip_spaces(s):
    return re.sub('[ ]+', ' ', s).strip()

def ContentExtraction(mail, ContentSplitters):
    '''
    Slicing email body content than signature and disclaimers
    '''
    lmtr = [lmtr for lmtr in ContentSplitters if lmtr in mail.lower()]
    if len(lmtr)>0:
        limiter = re.search(lmtr[0], mail.lower()).start()
        return mail[:limiter]
    return mail

def Remove_URLs(x):
    '''
    Removing URLs from the mail body
    '''
    x = word_tokenize(x)
    x = [i for i in x if not len(re.findall(r'[\w\.-]+@[\w\.-]+',i))]
    x = ' '.join(x)
    return x


def preprocess_mail(text, punctuation=False):
    '''
    Preprocessing the email content 
    removing punctuations except few required
    strip spaces
    
    
    '''
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub('[^A-Za-z0-9 ,.]+', '', text.replace('\n',' ')).lower()
    text = strip_spaces(text)
    text = text.replace(',',' , ').replace('.',' . ')
    text = ContentExtraction(text, ContentSplitters)
    text = Remove_URLs(text)
    if punctuation:
        text = re.sub('[^A-Za-z0-9 ]+', '', text.replace('\n',' ')).lower()
    
    return text

# Building a trie to add and remove words faster way
class Trie:
    head = {}
    
    def add(self,word):

        cur = self.head
        for ch in word:
            if ch not in cur:
                cur[ch] = {}
            cur = cur[ch]
        cur['*'] = True

    def search(self,word):
        cur = self.head
        for ch in word:
            if ch not in cur:
                return False
            cur = cur[ch]

        if '*' in cur:
            return True
        else:
            return False
    def printf(self):
        print (self.head)
        
        

def get_nearest_word(search_w, stopwords, words1, excess=1000):
    '''
    Function to search nearest possible word, like 
    c[icking -> clicking
    corrections to mail words mentioned in mail body
    
    '''

#     words1 = [list(map(ord, i)) for i in set(words)]
#     max_length = max(map(len, words1))
#     words1 = pad_sequences(words1, max_length)
#     tree = KDTree(words1, leaf_size=8)
    if search_w in stopwords:
        return [search_w, 1]
    dist = 0
    left = 0
    right = len(words1)
    while left<=right:
        mid = (left+right)//2
        d = difflib.SequenceMatcher(None, search_w ,words1[mid]).ratio()
        if d==1:
            break
        elif d<dist:
            left =  mid + 1
        else:
            dist = d
            right = mid - 1
    
    dist = ['',0, 0]
    for ind, w in enumerate(words1[mid-excess:mid+excess]):
        d = difflib.SequenceMatcher(None, search_w, w).ratio()
        if d==1:
            dist = [w, d]
            return dist
        if d>dist[1]:
            dist = [w, d]
    return dist

def concatenate_words(s, i, stopwords, words1, excess, word_size=9):
    '''
    search and concatenating the space delimited words 
    fell ow  -> fellow
    coordin ate  -> coordinate
    
    
    '''
    te = []
    for j in [0,-1]:
        search_w = []
        if j==-1 and i>0:
            search_w.append(s[i-1])
        threshold = 0
        for ind, w in enumerate(s[i:]):
            
            if threshold>word_size: break
            threshold+=len(w)
            search_w.append(w)
#             print(search_w)
#             print(''.join(search_w) in words1)
#             r = get_nearest_word(''.join(search_w), stopwords, words1, excess=excess)
            
#             if r[1]==1:
#                 return ind+1, r[0], j
#             te.append(r)
#             if ''.join(search_w) in words1: #r[1]==1:
            if trie_dict.search(''.join(search_w)):
                return ind+1, ''.join(search_w), j
            
    return None, '', None



def sequence_processing(s, stopwords, words1, excess, word_size=6):
    '''
    sequence processing from to add space delimited words in sequnce
    
    sequence :
    'wion is un iquely positioned as the globa l voice of ind ia , present ing its own perspective on 
    international issues of critical significance .'
    
    Out:
    'wion is uniquely positioned as the global of india , presenting its own perspective on international
    issues of critical significance .'
    
    '''
    res = []
    w = 0
    while w<len(s):
        # d = get_nearest_word(s[w], stopwords, words1, excess=excess)
        if trie_dict.search(s[w]):
            res.append(s[w])
        else:
            d1 = concatenate_words(s, w, stopwords, words1, excess, word_size)
            if d1[2]==-1:
                res.pop()
                res.append(d1[1])
            elif d1[2]==0:
                res.append(d1[1])
                w += d1[0]
            else:
                res.append(s[w])
        w +=1
    return ' '.join(res)


# Building a trie of english available words
import string
stopwords = nltk.corpus.stopwords.words('english')

# Selecting vocal library of english words 
words = nltk.corpus.brown.words()
words1 = []
for w in words:
    w = w.lower()
    w = re.sub('[0-9]','', w)
    words1.append(w)
words1 = list(set(words1))
words1.sort()

# adding all possible words to trie to make ease of search
trie_dict = Trie()
for word in words1:
    if '*' not in word and word not in list(string.ascii_lowercase+string.ascii_uppercase+string.digits):
        trie_dict.add(word)

ContentSplitters  = ['best regards', 'rgds ','b rgds', '\ngreetings', '\nthanks.', '\nthanks,', '\nthank you','\nthank you,', '\nthank you\n', 'sincerely', 'regard ',
                      'regards', 'kind regards', 'the information contained in this','forwarded', '\ntel:', '\nMobile:', '\nall the best,','\ncordially',
                      '[image: image.png]','thx','Tel:','Fax:','greeting', '\nproject manager ', 'from:', 'envoyé :', 
                      'the information contained in this email are confid', '------- forwarded message -----', 
                      'proprietary and confidential.','\nthanks a lot', 'tel. ','Please consider your environmental responsibility',
                      'The administrator of your personal data','the information transmitted in this e-mai',  'Deze email en de bijgevoegde', 'this e-mail is intended only for the person or entity']



[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\nagak\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [2]:
with open('regressor.pkl', 'rb') as f:
    reg = pickle.load(f)
    word_vectorizer = pickle.load(f)
    char_vectorizer = pickle.load(f)

In [3]:
email_chain_pairs = pickle.load(open('email_chains.pkl','rb'))
ec = email_chain_pairs
ml = pd.DataFrame([i[0] for i in ec])
rs = pd.DataFrame([i[1] for i in ec])

ml['time']= pd.to_datetime(ml['time'], utc=True)
rs['time'] = pd.to_datetime(rs['time'], utc=True)

df = ml.join(rs.add_suffix('_rsp'))
df['mail'] = np.where((df['time_rsp']-df['time'])<pd.to_timedelta('1s'), df['body_rsp'], df['body'])
df['text'] =df['mail'].apply(preprocess_mail)
df['text1'] = df.apply(lambda x: ''.join(sequence_processing(sent.split(' '), stopwords, words1, excess=30000, word_size=15) for sent in nltk.tokenize.sent_tokenize(x['text'])),1)

In [4]:
word_features = word_vectorizer.transform(df['text1'])
char_features = char_vectorizer.transform(df['text1'])



In [5]:
features = hstack([char_features, word_features])
df['target'] = reg.predict(features)
df = df.sort_values('target')

In [6]:

df['body']

931     Caroli na:\n(b) (5)\nBest regar ds,\nTony\nAnt...
470     The CDCtoday alerted Americans to begin to pre...
451     Hi Dr. Fauci:\n1 have been a long time admire ...
535     Att achments:                Task Force VP Bri...
1035    Dear Francis,\nCongratulat ions on your launch...
                              ...                        
268     Tony,\nSince we our last report we have increa...
22      Dear Tony ,\nI hope all is wel l. I had the gr...
23      Dear Tony ,\nI hope all is wel l. I had the gr...
24      Dear Tony ,\nI hope all is wel l. I had the gr...
25      Dear Tony ,\nI hope all is wel l. I had the gr...
Name: body, Length: 1168, dtype: object