In [1]:
#Prepare data for use in this exercise

import nltk
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#Download the NLTK resources used below, in the order they are needed:
#punkt (tokenizer), stopwords (stop-word lists), wordnet (lemmatizer)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

#Read the base file; the with-block closes the handle even if read() fails
with open(os.path.join(os.getcwd(), "data_science.txt"), 'rt') as base_file:
    raw_text = base_file.read()

#Execute the same pre-processing done in module 3
#Step 1: split the raw text into tokens
token_list = nltk.word_tokenize(raw_text)

#Step 2: drop the tokens that are punctuation
token_list2 = [token for token in token_list
               if nltk.tokenize.punkt.PunktToken(token).is_non_punct]

#Step 3: convert every token to lower case
token_list3 = [word.lower() for word in token_list2]

#Step 4: remove English stop words. The stop-word list is loaded once into
#a set instead of being re-read from the corpus for every token
english_stopwords = set(stopwords.words('english'))
token_list4 = [token for token in token_list3 if token not in english_stopwords]

#Step 5: lemmatize each remaining token
lemmatizer = WordNetLemmatizer()
token_list6 = [lemmatizer.lemmatize(word) for word in token_list4]

print("\n Total Tokens : ",len(token_list6))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



 Total Tokens :  79


## 04_01 Build ngrams

In [2]:
from nltk.util import ngrams
from collections import Counter

In [3]:
#Count every pair of adjacent tokens and show the 5 most frequent pairs
bigrams = ngrams(token_list6, 2)
bigram_counts = Counter(bigrams)
print("Most common bigrams : ")
print(bigram_counts.most_common(5))

Most common bigrams : 
[(('data', 'science'), 2), (('science', 'study'), 1), (('study', 'data'), 1), (('data', 'extract'), 1), (('extract', 'meaningful'), 1)]


In [4]:
#Count every run of three adjacent tokens and show the 5 most frequent runs
trigrams = ngrams(token_list6, 3)
trigram_counts = Counter(trigrams)
print(" \n Most common trigrams : " )
print(trigram_counts.most_common(5))

 
 Most common trigrams : 
[(('data', 'science', 'study'), 1), (('science', 'study', 'data'), 1), (('study', 'data', 'extract'), 1), (('data', 'extract', 'meaningful'), 1), (('extract', 'meaningful', 'insight'), 1)]


## 04_02 Parts-of-Speech Tagging

Some examples of Parts-of-Speech tag abbreviations (Penn Treebank tag set):

- NN : noun, singular or mass
- NNS : noun, plural
- VBP : verb, non-3rd-person singular present

In [5]:
#Download the 'averaged_perceptron_tagger' resource with the NLTK downloader.
#NOTE(review): presumably this is the model nltk.pos_tag relies on; confirm
#the resource name against the installed NLTK version.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [6]:
#Run the part-of-speech tagger over the tokens, then display the first
#10 (token, tag) pairs as the cell output
tagged_tokens = nltk.pos_tag(token_list4)
tagged_tokens[:10]

[('data', 'NNS'),
 ('science', 'NN'),
 ('study', 'NN'),
 ('data', 'NNS'),
 ('extract', 'VBP'),
 ('meaningful', 'JJ'),
 ('insights', 'NNS'),
 ('business', 'NN'),
 ('multidisciplinary', 'JJ'),
 ('approach', 'NN')]

## 04_04 Building TF-IDF matrix

In [7]:
#Use scikit-learn library
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

#Use a small corpus for easy visualization
vector_corpus = [
    'NBA is a Basketball league',
    'Basketball is popular in America.',
    'TV in America telecast BasketBall.',
]

#Create a vectorizer that filters out English stop words
vectorizer = TfidfVectorizer(stop_words='english')

#Learn the vocabulary from the corpus and build the TF-IDF matrix
tfidf=vectorizer.fit_transform(vector_corpus)

#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
#Prefer get_feature_names_out(); .tolist() turns the returned array into
#plain Python strings so the printed list looks the same as before.
#Fall back to the old method on releases that predate the new one.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

print("Tokens used as features are : ")
print(feature_names)

print("\n Size of array. Each row represents a document. Each column represents a feature/token")
print(tfidf.shape)

print("\n Actual TF-IDF array")
#Last expression of the cell: the dense TF-IDF array is shown as the output
tfidf.toarray()


Tokens used as features are : 
['america', 'basketball', 'league', 'nba', 'popular', 'telecast', 'tv']

 Size of array. Each row represents a document. Each column represents a feature/token
(3, 7)

 Actual TF-IDF array




array([[0.        , 0.38537163, 0.65249088, 0.65249088, 0.        ,
        0.        , 0.        ],
       [0.54783215, 0.42544054, 0.        , 0.        , 0.72033345,
        0.        , 0.        ],
       [0.44451431, 0.34520502, 0.        , 0.        , 0.        ,
        0.5844829 , 0.5844829 ]])