In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize.regexp import RegexpTokenizer
# from nltk.corpus import stopwords

In [2]:
import glob,os,codecs,sys,re,json
from nltk.corpus import stopwords
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [3]:
from string import punctuation
# Stop-word collection used by the text clean-up step: NLTK's English stop
# words plus every single punctuation character.
my_stop = set(stopwords.words('english')).union(punctuation)

In [4]:
# Load the e-mail CSV into a DataFrame; the path is relative, so it resolves
# against the directory the kernel is running in.
data = pd.read_csv('Enron_data/Enron_clean_email_100.csv')

In [5]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0.1,Unnamed: 0,email
0,0,Here is our forecast
1,1,Traveling to have a business meeting takes the...
2,2,test successful. way to go!!!
3,3,Can you send me a schedule of the salary and ...
4,4,Let's shoot for Tuesday at 11:45.


In [6]:
def preprocess(sentence):
    """Normalise one text for vectorisation.

    The text is lower-cased, split into runs of word characters, and every
    token found in the module-level ``my_stop`` collection is discarded.

    Parameters
    ----------
    sentence : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    word_splitter = RegexpTokenizer(r'\w+')
    kept = []
    for token in word_splitter.tokenize(sentence.lower()):
        if token in my_stop:
            continue
        kept.append(token)
    return " ".join(kept)

In [7]:
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    """Print each topic's strongest terms followed by its strongest documents.

    Parameters
    ----------
    H : array
        Iterated row by row; each row holds one topic's weights, positioned
        like ``feature_names``.
    W : 2-D array
        Column ``k`` holds the weight of every document for topic ``k``;
        rows are positioned like ``documents``.
    feature_names : sequence of str
        Term for each position in a row of ``H``.
    documents : sequence
        Documents, indexed by position.
    no_top_words : int
        How many terms to print per topic.
    no_top_documents : int
        How many documents to print per topic.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    for topic_idx, weights in enumerate(H):
        print("Topic %d:" % (topic_idx))

        # Highest-weighted terms first.
        best_terms = weights.argsort()[::-1][:no_top_words]
        print(" ".join(feature_names[term] for term in best_terms))

        # Highest-weighted documents first.
        doc_ranking = np.argsort(W[:, topic_idx])[::-1]
        for doc_pos in doc_ranking[:no_top_documents]:
            print(documents[doc_pos])
            

In [8]:
# Turn missing values into empty strings so every cell can go through the
# string clean-up later. fillna is the direct way to do this; the previous
# replace(..., regex=True) call passed a non-string pattern while asking for
# regex matching.
data = data.fillna('')

In [9]:
# Remove the 'Unnamed: 0' column. Reassigning (rather than inplace=True) with
# errors='ignore' keeps this cell safe to re-run: a second execution no longer
# raises KeyError because the column is already gone. The redundant axis=1 is
# dropped since columns= already names the axis.
# NOTE(review): the preview above also shows an 'Unnamed: 0.1' column, which
# is left in place here — confirm whether it should be removed as well.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [10]:
# Keep only the 'email' column; this is the text that gets cleaned and vectorised.
data_email = data['email']

In [11]:
# Run every e-mail through preprocess(); data_email becomes a plain list of
# cleaned strings from here on.
data_email = list(map(preprocess, data_email))

In [12]:
# Build the TF-IDF document-term matrix from the cleaned e-mails.
# max_df=0.65 and min_df=5 prune terms by document frequency (see the notes
# on max_df / min_df below).
tfidf_vectorizer = TfidfVectorizer(max_df=0.65, min_df=5, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(data_email)

# Vocabulary terms, positioned like the columns of `tfidf`.
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# when the installed release has it and fall back to the old name otherwise.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

### max_df

- max_df is used for removing terms that appear too frequently, also known as "corpus-specific stop words". For example:

- max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
- max_df = 25 means "ignore terms that appear in more than 25 documents".
- The default max_df is 1.0, which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms.

### min_df

- min_df is used for removing terms that appear too infrequently. For example:

- min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".
- min_df = 5 means "ignore terms that appear in fewer than 5 documents".
- The default min_df is 1, which means "ignore terms that appear in fewer than 1 document". Thus, the default setting does not ignore any terms.

In [13]:
# Number of topics to extract; shared by the NMF and LDA models below.
no_topics = 3

In [14]:
# Fit an NMF model with `no_topics` components on the TF-IDF matrix.
# random_state=1 fixes the seed so repeated runs give the same factorisation.
# NOTE(review): the `alpha` keyword appears to be scikit-learn-version
# dependent — confirm it is accepted by the installed release.
nmf_model = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
# Per-document topic weights: display_topics reads one column per topic.
nmf_W = nmf_model.transform(tfidf)
# Per-topic term weights: display_topics iterates one row per topic.
nmf_H = nmf_model.components_

In [15]:
# How many terms and how many example documents to print for each topic.
no_top_words = 5
no_top_documents = 2
# Print the NMF topics together with their strongest (cleaned) e-mails.
display_topics(nmf_H, nmf_W, tfidf_feature_names, data_email, no_top_words, no_top_documents)

Topic 0:
project need let know week
please respond cbpres austin rr com enclosed preliminary proforma westgate property austinthat told tell proforma projectshould produce truly exceptional return 40 per year 3 years especially attractive project market strong ashave uncovered date austin market smart growth corridor area designated city austinfor preferred development fast tracked completewater treatment ordinances waived estimated lotimprovement costs based 28 lot development investigated northwidening even though property likely require streetwidening less detention retention filtration pondrequirement used data cautious expected impact sales significantly projects quiteis uneven included fence around entire property mayonly put westgate cameron loop gated communities farpreferred good idea screening current buyeran extended escrow enable us probably obtain approved siteplan closing contract mean close intoprofits project discussed san marcos also discusshaving invest lots sell lots

### Using LDA for topic modeling

In [16]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [17]:
# Fit LDA with the same number of topics as the NMF model above.
# `n_components` replaces the old `n_topics` keyword, which was renamed in
# scikit-learn 0.19 and is rejected by later releases.
# random_state=0 fixes the seed so the fitted topics are repeatable.
# NOTE(review): the model is fit on the TF-IDF matrix; LDA is commonly fit on
# raw term counts — confirm this choice is intentional.
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0).fit(tfidf)




In [18]:
def display_topics_lda(model, feature_names, no_top_words):
    """Summarise each topic of a fitted model by its strongest terms.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, iterated row by row with one row
        of term weights per topic.
    feature_names : sequence of str
        Term for each position in a row of ``components_``.
    no_top_words : int
        How many terms to keep per topic.

    Returns
    -------
    dict
        Maps ``'Topic <index>'`` to a space-separated string of that topic's
        terms, highest weight first.
    """
    def _top_terms(weights):
        # Positions of the largest weights, in descending order of weight.
        ranked = weights.argsort()[::-1][:no_top_words]
        return " ".join(feature_names[pos] for pos in ranked)

    return {
        'Topic ' + str(topic_idx): _top_terms(weights)
        for topic_idx, weights in enumerate(model.components_)
    }

In [19]:
# Show the top terms of each LDA topic as a {topic label: terms} mapping.
display_topics_lda(lda, tfidf_feature_names, no_top_words)

{'Topic 0': 'email want think information gas',
 'Topic 1': 'let receive need know units',
 'Topic 2': 'meeting spreadsheet socal date address'}