In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and later removed;
# prefer the renamed 'seaborn-v0_8' sheet and fall back to the old name on older installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
%matplotlib inline

# read csv into a dataframe
df_idf = pd.read_csv("output.csv")

# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly or it is silently discarded.
display(df_idf.head())
df_idf.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497 entries, 0 to 496
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  497 non-null    int64 
 1   text        497 non-null    object
dtypes: int64(1), object(1)
memory usage: 7.9+ KB


In [None]:
import re
def pre_process(text):
    """Normalise a raw string for vectorisation.

    The text is lowercased, markup tags are blanked out, and every run of
    digits / non-word characters is collapsed into a single space.
    """
    lowered = text.lower()

    # tags become a " <> " placeholder, which the next substitution reduces to a space
    without_tags = re.sub("</?.*?>", " <> ", lowered)

    # remove special characters and digits
    return re.sub("(\\d|\\W)+", " ", without_tags)

# clean every document in the 'text' column
df_idf['text'] = df_idf['text'].apply(pre_process)

# show the cleaned text of the row labelled 2
df_idf.loc[2, 'text']

'youll love kindle ive mine months never looked back the new big one huge no need remorse'

## Creating the IDF

### CountVectorizer to create a vocabulary and generate word counts
The next step is to start the counting process. We can use the CountVectorizer to create a vocabulary from all the text in our `df_idf['text']` and generate counts for each row in `df_idf['text']`. The result of the last two lines is a sparse matrix representation of the counts, meaning each column represents a word in the vocabulary and each row represents the document in our dataset where the values are the word counts. Note that with this representation, counts of some words could be 0 if the word did not appear in the corresponding document.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import re

def get_stop_words(stop_file_path):
    """Read a UTF-8 stop-word file (one entry per line) into a frozenset.

    Each line is stripped of surrounding whitespace before being added.
    """
    with open(stop_file_path, 'r', encoding="utf-8") as handle:
        return frozenset({line.strip() for line in handle.readlines()})

#load a set of stop words
stopwords=get_stop_words("stopwords.txt")

#get the text column 
docs=df_idf['text'].tolist()

#create a vocabulary of words, 
#ignore words that appear in more than 85% of documents, 
#eliminate stop words
# NOTE: stop_words is handed over as a list -- recent scikit-learn releases
# validate this parameter and reject other collection types such as frozenset.
cv=CountVectorizer(max_df=0.85,stop_words=sorted(stopwords))
word_count_vector=cv.fit_transform(docs)

Now let's check the shape of the resulting vector. Notice that the shape below is `(497, 1951)` because we have 497 documents in our dataset (the rows) and the vocabulary size is `1951`, meaning we have `1951` unique words (the columns) in our dataset minus the stopwords.

In [None]:
# (number of documents, number of vocabulary terms)
word_count_vector.shape

(497, 1951)

Let's limit our vocabulary size to 10,000. (Our vocabulary is already smaller than that, so the shape below does not change.)

In [None]:
# Cap the vocabulary at the 10,000 most frequent terms.
# stop_words is passed as a list because recent scikit-learn releases reject a frozenset here.
cv=CountVectorizer(max_df=0.85,stop_words=sorted(stopwords),max_features=10000)
word_count_vector=cv.fit_transform(docs)
word_count_vector.shape

(497, 1951)

Now, let's look at 10 words from our vocabulary. 

In [None]:
# first 10 keys (terms) of the fitted vocabulary mapping
list(cv.vocabulary_.keys())[:10]

['reading',
 'kindle',
 'love',
 'lee',
 'childs',
 'good',
 'read',
 'ok',
 'first',
 'assesment']

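We can also get the vocabulary from the vectorizer's feature-name accessor: `get_feature_names_out()` in scikit-learn 1.0 and later, or `get_feature_names()` in older releases.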

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available.
vocab_terms = (
    cv.get_feature_names_out().tolist()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)

# A fixed slice such as [2000:2015] is empty whenever the vocabulary is smaller
# than 2000 terms, so show the last 15 terms instead.
vocab_terms[-15:]



[]

### TfidfTransformer to Compute Inverse Document Frequency (IDF) 
In the code below, we are essentially taking the sparse matrix from CountVectorizer to generate the IDF when you invoke `fit`. 

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# learn the (smoothed) IDF weights from the term-count matrix
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

Let's look at some of the IDF values:

In [None]:
# IDF weights learned by fit(); presumably one entry per vocabulary term -- confirm against the vectorizer's vocabulary size
tfidf_transformer.idf_

array([6.5174529 , 6.5174529 , 6.11198779, ..., 6.5174529 , 6.5174529 ,
       6.5174529 ])

## Computing TF-IDF and Extracting Keywords

Once we have our IDF computed, we are now ready to compute TF-IDF and extract the top keywords.

In [None]:
# read the documents to extract keywords from (the same file the IDF was fitted on)
# and clean their text
df_test = pd.read_csv("output.csv")
df_test['text'] = df_test['text'].apply(pre_process)

# get test docs into a list
docs_test = df_test['text'].tolist()


In [None]:
def sort_coo(coo_matrix):
    """Return the (column index, value) pairs of a COO matrix, highest value first.

    Ties on the value are broken by the larger column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Get the feature names and tf-idf scores of the top n items.

    Args:
        feature_names: sequence that maps a column index to its term.
        sorted_items: sequence of (column index, score) pairs, already
            ordered best-first.
        topn: maximum number of leading pairs to keep.

    Returns:
        dict mapping each kept term to its score rounded to 3 decimals,
        in the same order as ``sorted_items``.
    """
    # use only the topn items from the vector; a single comprehension replaces
    # the former parallel lists and the unused intermediate local
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }

The next step is to compute the tf-idf value for a given document in our test set by invoking `tfidf_transformer.transform(...)`. This generates a vector of tf-idf scores. Next, we sort the words in the vector in descending order of tf-idf values and then iterate over them to extract the top-n items with the corresponding feature names. In the example below, we are extracting keywords for the first document in our test set. 

The `sort_coo(...)` method essentially sorts the values in the vector while preserving the column index. Once you have the column index, it's really easy to look up the corresponding word, as you can see in `extract_topn_from_vector(...)`, where each word is fetched with `feature_names[idx]`.

In [None]:
# you only need to do this once
# (get_feature_names() was removed in scikit-learn 1.2; use its replacement when available)
feature_names = (
    cv.get_feature_names_out().tolist()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)

# get the document that we want to extract keywords from
doc=docs_test[0]

#generate tf-idf for the given document
tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))

#sort the tf-idf vectors by descending order of scores
sorted_items=sort_coo(tf_idf_vector.tocoo())

#extract only the top n; n here is 10
keywords=extract_topn_from_vector(feature_names,sorted_items,10)

# now print the results
print("\n=====Title=====")

print(doc)
print("\n===Keywords===")
for k in keywords:
    print(k,keywords[k])


=====Title=====
reading kindle love lee childs good read

===Keywords===
lee 0.478
childs 0.478
read 0.386
reading 0.376
kindle 0.309
love 0.29
good 0.272




In [None]:
# put the common code into several methods
def get_keywords(idx, topn=10):
    """Extract the top tf-idf keywords for one document of ``docs_test``.

    Relies on the notebook-level ``cv``, ``tfidf_transformer`` and
    ``feature_names`` objects created in earlier cells.

    Args:
        idx: position of the document inside ``docs_test``.
        topn: maximum number of keywords to return (10 by default).

    Returns:
        dict mapping each keyword to its rounded tf-idf score, best first.
    """
    #generate tf-idf for the given document
    tf_idf_vector = tfidf_transformer.transform(cv.transform([docs_test[idx]]))

    #sort the tf-idf vectors by descending order of scores
    sorted_items = sort_coo(tf_idf_vector.tocoo())

    #extract only the top n
    return extract_topn_from_vector(feature_names, sorted_items, topn)

def print_results(idx,keywords):
    """Print a keyword -> score mapping under a '===Keywords===' banner.

    ``idx`` is accepted but not used by the body.
    """
    print("\n===Keywords===")
    for term, score in keywords.items():
        print(term, score)



Now let's look at keywords generated for a much longer document: 


In [None]:
# extract and print the keywords for the document at position 120 of docs_test
idx=120
keywords=get_keywords(idx)
print_results(idx,keywords)


===Keywords===
marketing 0.47
sitcom 0.291
nbccom 0.291
market 0.291
judd 0.291
fake 0.291
creates 0.291
apatow 0.291
viral 0.242
movie 0.242


## Generate keywords for a batch of documents

In [None]:
#generate tf-idf for all documents in docs_test
tf_idf_vector=tfidf_transformer.transform(cv.transform(docs_test))

results=[]
for i in range(tf_idf_vector.shape[0]):

    # get vector for a single document
    curr_vector=tf_idf_vector[i]

    #sort the tf-idf vector by descending order of scores
    sorted_items=sort_coo(curr_vector.tocoo())

    #extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)

    results.append(keywords)

# Pair every document with its own keywords: the scores were computed from
# docs_test, so that list (not the training list `docs`) supplies the 'doc' column.
df=pd.DataFrame(list(zip(docs_test,results)),columns=['doc','keywords'])
df

Unnamed: 0,doc,keywords
0,reading kindle love lee childs good read,"{'lee': 0.478, 'childs': 0.478, 'read': 0.386,..."
1,ok first assesment kindle fucking rocks,"{'assesment': 0.468, 'rocks': 0.439, 'first': ..."
2,youll love kindle ive mine months never looked...,"{'youll': 0.295, 'remorse': 0.295, 'months': 0..."
3,fair enough but kindle i think perfect,"{'fair': 0.457, 'perfect': 0.428, 'enough': 0...."
4,big im quite happy kindle,"{'quite': 0.524, 'big': 0.488, 'happy': 0.462,..."
...,...,...
492,ask programming latex indesign submitted calci...,"{'submitted': 0.366, 'programming': 0.366, 'in..."
493,on note i hate word i hate pages i hate latex ...,"{'hate': 0.616, 'latex': 0.345, 'texn': 0.229,..."
494,ahhh back real text editing environment i lt l...,"{'environment': 0.392, 'editing': 0.392, 'ahhh..."
495,trouble iran i see hmm iran iran far away floc...,"{'iran': 0.747, 'trouble': 0.29, 'hmm': 0.29, ..."


**n-gram Implementation**

In [None]:
# write the doc/keywords table to disk; to_csv returns None when given a path,
# so `save` carries no data afterwards
save = df.to_csv("result.csv")

In [None]:
# reload the saved file to check the write; the index column written by to_csv
# comes back as "Unnamed: 0"
save1= pd.read_csv('result.csv')
print(save1.head())

   Unnamed: 0                                                doc  \
0           0           reading kindle love lee childs good read   
1           1            ok first assesment kindle fucking rocks   
2           2  youll love kindle ive mine months never looked...   
3           3             fair enough but kindle i think perfect   
4           4                         big im quite happy kindle    

                                            keywords  
0  {'lee': 0.478, 'childs': 0.478, 'read': 0.386,...  
1  {'assesment': 0.468, 'rocks': 0.439, 'first': ...  
2  {'youll': 0.295, 'remorse': 0.295, 'months': 0...  
3  {'fair': 0.457, 'perfect': 0.428, 'enough': 0....  
4  {'quite': 0.524, 'big': 0.488, 'happy': 0.462,...  
