**Import required Libraries**

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

**Read the csv file**

In [None]:
# Load the raw email dump from the working directory. The `message` column of
# this frame is what gets parsed later when email_df is built.
emails = pd.read_csv('email.csv')

**Extract Sender, Receiver, Subject and Message Body for each raw email**  

email['from'] = 'sender1@hcl.com'  
email['to'] = 'receiver1@hcl.com'  
email['subject'] = 'subject1'  
email['body'] = 'first message body'

In [None]:
def parse_raw_email(raw_email):
    """Extract the sender, receiver, subject and body from one raw email.

    Header lines have the form ``Key: value``; only ``from``, ``to`` and
    ``subject`` (matched case-insensitively) are kept. The first blank line
    after a header line ends the header section, and every later line is body
    text even when it contains a colon. Lines without a colon that appear
    before that blank line are also treated as body text.

    Parameters
    ----------
    raw_email : str
        Full text of a single email. Any non-string value (for example a
        missing CSV cell read as NaN) produces an empty result.

    Returns
    -------
    dict
        Holds whichever of 'from', 'to' and 'subject' were found, plus 'body'
        when at least one body line (possibly blank) was seen. Body lines are
        stripped and joined with single spaces.
    """
    email = {}
    if not isinstance(raw_email, str):
        return email

    keys_to_extract = ('from', 'to', 'subject')
    body_parts = []
    seen_header = False  # becomes True once a "Key: value" line is parsed
    in_body = False      # becomes True at the blank line that ends the headers
    has_body = False     # tracks whether the 'body' key should be emitted

    for line in raw_email.split('\n'):
        stripped = line.strip()

        if in_body or ':' not in line:
            # A blank line only closes the header section once a header has
            # been seen, so leading blank lines do not hide the headers.
            if seen_header and not stripped:
                in_body = True
            has_body = True
            if stripped:
                body_parts.append(stripped)
            continue

        seen_header = True
        key, val = line.split(':', 1)
        key = key.lower()
        if key in keys_to_extract:
            email[key] = val.strip()

    if has_body:
        # Join with a space so words on adjacent lines are not fused together.
        email['body'] = ' '.join(body_parts)
    return email

**Create a list of values for each extracted field (from, to, subject, body)**  

results = ['sender1@hcl.com', 'sender2@hcl.com', 'sender3@hcl.com']  

In [None]:
def map_to_list(emails, key):
    """Collect the value stored under `key` from every parsed email.

    Emails that do not contain `key` contribute an empty string, so the
    returned list always has one entry per email, in the original order.
    """
    return [parsed[key] if key in parsed else '' for parsed in emails]

**Create the dictionary of columns suitable for a Pandas DataFrame**  

{  
    'from': [sender1@hcl.com, sender2@hcl.com, sender3@hcl.com],  
    'to': [receiver1@hcl.com, receiver2@hcl.com, receiver3@hcl.com],  
    'subject': [subject1, subject2, subject3],  
    'body': [message1, message2, message3]    
}

In [None]:
def map_extracted_fields(messages):
    """Parse every raw message and group the results by field.

    Returns a dict with the keys 'from', 'to', 'subject' and 'body', each
    mapped to a list holding that field's value for every message, in order.
    """
    parsed = [parse_raw_email(raw) for raw in messages]
    fields = ('from', 'to', 'subject', 'body')
    return {field: map_to_list(parsed, field) for field in fields}

**Create the Dataframe using Pandas**  

d = {'col1': [1, 2], 'col2': [3, 4]}  
df = pd.DataFrame(data=d)  
df

|   | col1 | col2 |
|---|------|------|
| 0 | 1    | 3    |
| 1 | 2    | 4    |

In [None]:
# Parse each raw message and assemble the per-field lists into one DataFrame
# with the columns from, to, subject and body.
email_df = pd.DataFrame(data=map_extracted_fields(emails['message']))
email_df

**Instantiate the Count and TF-IDF Vectorizers**

*ignore terms that appear in more than 50% of the documents*  
*ignore terms that appear in fewer than 2 documents*

In [None]:
# Both vectorizers use the same vocabulary filters: English stop words are
# removed, max_df=0.50 drops terms found in more than half of the documents,
# and min_df=2 drops terms found in fewer than two documents.
countvectorizer = CountVectorizer(
    stop_words='english',
    max_df=0.50,
    min_df=2,
)
tfidfvectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.50,
    min_df=2,
)

**Convert email body to matrix**

In [None]:
# Learn each vectorizer's vocabulary from the email bodies and build its
# document-term matrix (one row per email, one column per kept term).
count_matrix = countvectorizer.fit_transform(email_df['body'])
tfidf_matrix = tfidfvectorizer.fit_transform(email_df['body'])

**Retrieve the terms found in corpus**

In [None]:
def _vocabulary_terms(vectorizer):
    """Return a fitted vectorizer's terms as a list, ordered by column index.

    get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
    in favour of get_feature_names_out(); the old method is used only when the
    new one is unavailable, so the notebook runs on either version.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return list(vectorizer.get_feature_names())


count_feats = _vocabulary_terms(countvectorizer)
tfidf_feats = _vocabulary_terms(tfidfvectorizer)

**Display Vectorizer output in a Dataframe**

*This result is for a single email: the one at row index 1, i.e. the second email in the DataFrame*

In [None]:
# Show the highest-scoring terms of one email under each vectorizer.
DOC_INDEX = 1  # matrix row to inspect (the second email body that was vectorized)
TOP_N = 25     # number of terms printed per vectorizer


def _term_scores(matrix, terms, doc_index, column):
    """Return one document's term scores, sorted from highest to lowest.

    matrix    -- sparse document-term matrix returned by fit_transform
    terms     -- vocabulary terms, aligned with the matrix columns
    doc_index -- row of the matrix (which document) to extract
    column    -- name of the single score column in the result
    """
    # toarray() gives a plain ndarray; todense() returns the legacy np.matrix.
    scores = matrix[doc_index].toarray().ravel()
    frame = pd.DataFrame({column: scores}, index=terms)
    return frame.sort_values(column, ascending=False)


print("Count Vectorizer\n")
count_df = _term_scores(count_matrix, count_feats, DOC_INDEX, "Count")
print(count_df.head(TOP_N))

print("\n\nTF-IDF Vectorizer\n")
tfidf_df = _term_scores(tfidf_matrix, tfidf_feats, DOC_INDEX, "TF-IDF")
print(tfidf_df.head(TOP_N))