In [1]:
#based on https://towardsdatascience.com/introduction-to-nlp-part-3-tf-idf-explained-cedb1fc1f7dc
import nltk
# Fetch the NLTK datasets this notebook relies on: the 'stopwords' corpus is
# read by stopwords.words('english') and 'wordnet' is the lexical database
# used by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saul1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saul1\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Text preprocessing

In [2]:
# Toy corpus: two short documents that share part of their vocabulary
d1, d2 = (
    'I thought, I thought of thinking of thanking you for the gift',
    'She was thinking of going to go and get you a GIFT!',
)
print("d1: ", d1)
print("d2: ", d2)

def preprocess_text(document):
    """
    Turn a raw document into a list of keyword lemmas.

    The document is split into runs of word characters (which drops
    punctuation), every token is lowercased and lemmatised as a verb, and
    English stopwords are removed from the result.

    Parameters
    ----------
    document : str
        Raw text of a single document.

    Returns
    -------
    list of str
        Lemmas in their original order, duplicates kept, stopwords removed.
    """
    # Tokenise words while ignoring punctuation
    tokeniser = RegexpTokenizer(r'\w+')
    tokens = tokeniser.tokenize(document)

    # Lowercase and lemmatise (pos='v' treats every token as a verb)
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]

    # Load the stopword list once into a set. Calling stopwords.words()
    # inside the comprehension condition fetched the whole list again for
    # every lemma and then scanned it linearly.
    english_stopwords = set(stopwords.words('english'))
    keywords = [lemma for lemma in lemmas if lemma not in english_stopwords]
    return keywords

# Run both documents through the preprocessing pipeline and show the keywords
d1_preprocessed, d2_preprocessed = (preprocess_text(doc) for doc in (d1, d2))
print("d1 preprocessed: ", d1_preprocessed)
print("d2 preprocessed: ", d2_preprocessed)



d1:  I thought, I thought of thinking of thanking you for the gift
d2:  She was thinking of going to go and get you a GIFT!
d1 preprocessed:  ['think', 'think', 'think', 'thank', 'gift']
d2 preprocessed:  ['think', 'go', 'go', 'get', 'gift']


# TF-IDF transformation

In [9]:

def display_tfidfs(X_train_vectorised, feature_names=None):
    """
    Print a TF-IDF sparse matrix as a table.

    Parameters
    ----------
    X_train_vectorised : scipy sparse matrix
        Matrix to show, one row per document and one column per term.
    feature_names : sequence of str, optional
        Column labels, one per matrix column. The term vocabulary is held by
        the fitted vectoriser rather than by the matrix, so the caller has to
        supply it (e.g. the vectoriser's ``get_feature_names_out()``).
        When omitted, columns keep their positional integer labels.

    Returns
    -------
    None
        The table is only printed.
    """
    # Convert sparse matrix to dataframe; columns=None keeps pandas' default
    # integer labels, so calls without feature_names print exactly as before.
    X_train = pd.DataFrame.sparse.from_spmatrix(
        X_train_vectorised, columns=feature_names
    )
    print("sparse X_train")
    print(X_train)
    


# Build the TF-IDF vectoriser; handing it preprocess_text as the analyzer
# makes the vectoriser run our preprocessing on each raw document.
tfidf_vectoriser = TfidfVectorizer(analyzer=preprocess_text)

# Single-column dataframe holding the raw documents fed to the vectoriser
X_train = pd.DataFrame([d1, d2], columns=['corpus'])

# Fit on the corpus and transform it in one step.
# The result is a sparse matrix (i.e. zero values are not stored).
X_train_vectorised = tfidf_vectoriser.fit_transform(X_train['corpus'])
print(X_train_vectorised)
display_tfidfs(X_train_vectorised)




  (0, 1)	0.2889723007058112
  (0, 3)	0.40614048585175794
  (0, 4)	0.8669169021174337
  (1, 0)	0.4078241041497786
  (1, 2)	0.8156482082995572
  (1, 1)	0.29017020899133733
  (1, 4)	0.29017020899133733
sparse X_train
          0         1         2        3         4
0  0.000000  0.288972  0.000000  0.40614  0.866917
1  0.407824  0.290170  0.815648  0.00000  0.290170
