# In [None]:
import nltk        
import string
import re
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk.corpus import stopwords


# English stop-word list loaded from the NLTK stopwords corpus.
stop_words = stopwords.words('english')

# Module-level WordNet lemmatizer instance.
wordnet_lemmatizer = WordNetLemmatizer()
# Empty module-level lists.
# NOTE(review): no function in the visible code reads or fills these;
# presumably leftovers from earlier experiments -- confirm before removing.
lemmaAdj = []
lemmaVerb = []
tagged_tokens = []
token = []

# The POS tagger and WordNet use different tag systems; this helper
# translates a tagger tag into the matching WordNet POS constant.
def get_wordnet_pos(pos_tag):
    """Map a POS tag to the corresponding WordNet part-of-speech constant.

    Only the first letter of the tag matters:
    ``J`` -> adjective, ``V`` -> verb, ``R`` -> adverb.
    Everything else (including tags starting with ``N``) maps to noun.

    Args:
        pos_tag: POS tag string, e.g. ``"JJ"`` or ``"VBD"``.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.ADV`` or
        ``wordnet.NOUN``.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_tag in prefix_table:
        if pos_tag.startswith(prefix):
            return wordnet_tag

    # Nouns and every unrecognised tag share the same fallback.
    return wordnet.NOUN
    
def tokenize(text):
    """Convert one document into a frequency distribution of lemmas.

    The document is lower-cased, split into tokens, POS-tagged, stripped of
    English stop words, and each remaining token is lemmatized using the
    WordNet part of speech derived from its tag.

    Args:
        text: The document. Non-string values are coerced with ``str()``
            before processing instead of raising.
            NOTE(review): a missing value such as NaN is therefore
            tokenized from its literal text ("nan"); callers may want to
            blank missing values first.

    Returns:
        nltk.FreqDist mapping each lemma to its number of occurrences.
    """
    # Coerce once, then lower-case in a single pass. The previous
    # find-and-replace loop called .replace() on the raw argument and so
    # crashed on non-string input such as None.
    lowered = str(text).lower()

    # A token starts and ends with a letter/digit and may contain
    # '.', '_', '@' or '-' in between, so it is always >= 2 characters and
    # never pure punctuation (no separate punctuation filter is needed).
    candidates = re.findall(r'[a-z0-9][a-z0-9._@-]*[a-z0-9]', lowered)

    # Reuse the module-level stop-word list and lemmatizer rather than
    # reloading them per document; a set gives O(1) membership tests.
    stop_set = set(stop_words)

    lemmatized_words = [
        wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in nltk.pos_tag(candidates)
        if word not in stop_set
    ]
    return nltk.FreqDist(lemmatized_words)
    
    
    


def get_tf_idf(text):
    """Compute a smoothed TF-IDF matrix for a collection of documents.

    Term frequency is the term count divided by the document's total token
    count. Inverse document frequency is smoothed as
    ``log((N + 1) / (df + 1)) + 1`` where ``N`` is the number of documents
    and ``df`` the number of documents containing the term.

    Args:
        text: Iterable of documents; each one is passed to ``tokenize``.

    Returns:
        Tuple ``(smoothed_tf_idf, docs_tokens)``:
            smoothed_tf_idf: 2-D NumPy array with one row per input
                document (in input order) and one column per distinct term.
                Documents that produce no tokens get an all-zero row.
            docs_tokens: dict mapping each document's position to the
                frequency distribution returned by ``tokenize``.
    """
    # Materialise so one-shot iterables work and the count is well defined.
    docs = list(text)
    n_docs = len(docs)

    docs_tokens = {idx: tokenize(doc) for idx, doc in enumerate(docs)}

    dtm = pd.DataFrame.from_dict(docs_tokens, orient="index")
    # from_dict builds rows from the inner dicts' entries, so a document
    # with no tokens would get no row at all. Reindexing restores one row
    # per document, in document order.
    dtm = dtm.reindex(range(n_docs)).fillna(0)

    counts = dtm.values.astype(float)

    # Normalise each row by its length; rows of length 0 stay all-zero
    # instead of producing NaN from 0/0.
    doc_len = counts.sum(axis=1, keepdims=True)
    tf = np.divide(counts, doc_len, out=np.zeros_like(counts),
                   where=doc_len > 0)

    # Number of documents in which each term occurs.
    doc_freq = (counts > 0).sum(axis=0)

    smoothed_idf = np.log((n_docs + 1) / (doc_freq + 1)) + 1
    smoothed_tf_idf = tf * smoothed_idf

    return smoothed_tf_idf, docs_tokens



if __name__ == "__main__":
    # Read the book list and run TF-IDF over its "Description" column.
    data = pd.read_csv("entireBookList8700.csv", header=0)
    descriptions = data["Description"].values.tolist()

    tf_idf, freq_dist = get_tf_idf(descriptions)

    print("TF_IDF Matrix for description of all 8700 books:\n", tf_idf)
    print("\nFrequency distribution\n", freq_dist)