**Import required Libraries**

*sklearn - a machine learning library*

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import linear_kernel

**Read the csv file**

In [None]:
# Load the raw email dump; a later cell reads its `message` column.
EMAIL_CSV_PATH = 'email.csv'
emails = pd.read_csv(EMAIL_CSV_PATH)

**Extract Sender, Receiver, Subject and Message Body for each raw email**  

email['from'] = 'sender1@hcl.com'  
email['to'] = 'receiver1@hcl.com'  
email['subject'] = 'subject1'  
email['body'] = 'first message body'

In [None]:
def parse_raw_email(raw_email):
    """Parse one raw email string into sender, receiver, subject and body.

    Every line that contains a ':' is treated as a ``key: value`` header and
    is kept only when its lower-cased key is 'from', 'to' or 'subject'.
    Every line without a ':' is treated as part of the message body.

    Parameters
    ----------
    raw_email : str
        Raw email text; lines are separated by newline characters.

    Returns
    -------
    dict
        Contains whichever of 'from', 'to' and 'subject' were found, plus
        'body' when at least one line without a ':' exists. Body lines are
        stripped and joined with a single space.
    """
    keys_to_extract = ('from', 'to', 'subject')
    email = {}
    body_lines = []
    has_body = False

    for line in raw_email.split('\n'):
        if ':' not in line:
            # Blank lines still mark the email as having a body, but they
            # must not add stray separators to it.
            has_body = True
            text = line.strip()
            if text:
                body_lines.append(text)
            continue

        key, val = line.split(':', 1)
        key = key.lower()
        # First occurrence wins, so a quoted/forwarded "From:" line further
        # down the message cannot overwrite the real header.
        if key in keys_to_extract and key not in email:
            email[key] = val.strip()

    if has_body:
        # Join with a space: plain concatenation would fuse the last word of
        # one line with the first word of the next and corrupt tokenization.
        email['body'] = ' '.join(body_lines)
    return email

**Create an array of data for each extracted fields (from, to, subject, body)**  

results = ['sender1@hcl.com', 'sender2@hcl.com', 'sender3@hcl.com']  

In [None]:
def map_to_list(emails, key):
    """Collect the value stored under `key` from every parsed email.

    Emails that lack `key` contribute an empty string, so the returned list
    has one entry per email, in the original order.
    """
    return [parsed[key] if key in parsed else '' for parsed in emails]

**Create the array suitable for Pandas Dataframe**  

{  
    'from': [sender1@hcl.com, sender2@hcl.com, sender3@hcl.com],  
    'to': [receiver1@hcl.com, receiver2@hcl.com, receiver3@hcl.com],  
    'subject': [subject1, subject2, subject3],  
    'body': [message1, message2, message3]    
}

In [None]:
def map_extracted_fields(messages):
    """Parse every raw message and group the extracted values by field.

    Returns a dict with the keys 'from', 'to', 'subject' and 'body'; each
    value is the list produced by `map_to_list` for that field.
    """
    parsed_emails = list(map(parse_raw_email, messages))
    fields = ('from', 'to', 'subject', 'body')
    return {field: map_to_list(parsed_emails, field) for field in fields}

**Create the Dataframe using Pandas**  

d = {'col1': [1, 2], 'col2': [3, 4]}  
df = pd.DataFrame(data=d)  
df

|   | col1 | col2 |
|---|------|------|
| 0 | 1    | 3    |
| 1 | 2    | 4    |

In [None]:
# Parse the raw `message` column and assemble the per-field lists into a frame.
extracted_fields = map_extracted_fields(emails['message'])
email_df = pd.DataFrame(data=extracted_fields)
email_df

**Instantiate TF-IDF Vectorizer**

*max_df - ignore terms that appear in more than 50% of the documents*  
*min_df - ignore terms that appear in fewer than 2 documents*  

*countvectorizer - this vectorizer will be used for the term frequency*  
*tfvectorizer - this vectorizer will be used for the weight of a term in a document*  
*tfidfvectorizer - this vectorizer will be used for the weight of a term in the entire dataset*  

In [None]:
# Raw term counts per document.
countvectorizer = CountVectorizer(stop_words='english')

# Term-frequency weights only: IDF re-weighting is switched off.
tfvectorizer = TfidfVectorizer(use_idf=False, stop_words='english')

# Full TF-IDF, restricted by document frequency through max_df / min_df.
tfidfvectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.50,
)

**Convert email body to matrix**  

Result will be a sparse matrix
(A, B) C  

A: Document index  
B: Specific word-vector index  

C: (Term Frequency) TF of word B in document A (count_matrix)  
C: TF weight for word B in document A (tf_matrix)  
C: TFIDF score for word B in document A in the entire data set (tfidf_matrix)  

In [None]:
# Fit each vectorizer on the email bodies and keep the resulting matrices.
email_bodies = email_df['body']

count_matrix = countvectorizer.fit_transform(email_bodies)
tf_matrix = tfvectorizer.fit_transform(email_bodies)
tfidf_matrix = tfidfvectorizer.fit_transform(email_bodies)

print(tfidf_matrix)

**Retrieve the terms found in the dataset**

In [None]:
def _feature_names(vectorizer):
    """Return the terms learned by a fitted vectorizer as a list.

    Uses `get_feature_names_out()` when the vectorizer has it and falls back
    to `get_feature_names()` otherwise. The latter was deprecated in
    scikit-learn 1.0 and removed in 1.2, so calling it directly raises
    AttributeError on current releases.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    # Normalise to a list so the result type is the same on both code paths.
    return list(getter())


count_feats = _feature_names(countvectorizer)
tf_feats = _feature_names(tfvectorizer)
tfidf_feats = _feature_names(tfidfvectorizer)

**Display Vectorizer output in a Dataframe**

*This will display the top 10 words commonly used in an individual email*

*Count - number of times the word appeared in an email*  
*TF - weight score of the word in an email*

In [None]:
count = 10        # number of top terms to show
email_index = 1   # row of the email to inspect


def _ranked_terms(matrix, row, terms, column):
    """Build a one-column frame of per-term values for one row, largest first."""
    values = matrix[row].T.todense()
    frame = pd.DataFrame(values, index=terms, columns=[column])
    return frame.sort_values(column, ascending=False)


print("\n\nEmail Message\n")
print(email_df.body[email_index])

print("\n\nCount Vectorizer\n")
count_df = _ranked_terms(count_matrix, email_index, count_feats, "Count")
print(count_df.head(count))

print("\n\nTF Vectorizer\n")
tf_df = _ranked_terms(tf_matrix, email_index, tf_feats, "TF")
print(tf_df.head(count))

**This will display the top 10 words for the entire dataset with their TF-IDF score**

In [None]:
# Mean TF-IDF score of every term across all emails.
# The mean is taken on the sparse matrix directly: calling .toarray() first
# would allocate a dense (documents x vocabulary) array just to average it.
# np.asarray(...).ravel() flattens the 1 x n_terms result into a 1-D array.
tfidf_means = np.asarray(tfidf_matrix.mean(axis=0)).ravel()

# Indices of the `count` highest-scoring terms, best first.
top_ids = np.argsort(tfidf_means)[::-1][:count]
top_feats = [(tfidf_feats[i], tfidf_means[i]) for i in top_ids]
df_top_feats = pd.DataFrame(top_feats, columns=['words', 'score'])

df_top_feats

**Finding emails by query** 

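*Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them. The cosine of 0° is 1, and it is less than 1 for any other angle. Because `TfidfVectorizer` L2-normalizes every document vector by default, the plain dot product computed by `linear_kernel` is equal to the cosine similarity.*  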

*tfidf_matrix[0:1] - The vector of the first email.*

In [None]:
# Score the first email's TF-IDF vector against every email in the matrix.
first_email_vec = tfidf_matrix[0:1]
cosine_sim = linear_kernel(first_email_vec, tfidf_matrix).flatten()
print(cosine_sim)

**Return the top 10 email messages that contain the word you want to query**

In [None]:
query = "pipeline"
top_n = 10  # maximum number of emails to return

# Project the query into the same TF-IDF space as the fitted corpus.
vec_query = tfidfvectorizer.transform([query])

cosine_sim = linear_kernel(vec_query, tfidf_matrix).flatten()

# Best-scoring emails first. The previous slice [:-10:-1] stopped one short
# and returned only 9 indices.
ranked_indices = cosine_sim.argsort()[::-1][:top_n]

# A score of 0 means the email shares no term with the query, so drop those
# rows instead of padding the result with unrelated emails.
related_email_indices = ranked_indices[cosine_sim[ranked_indices] > 0]
print(related_email_indices)

In [None]:
# Show each related email: its row index first, then the body text.
for email_idx in related_email_indices:
    print(email_idx)
    print(email_df.body[email_idx] + "\n\n")