In [1]:
import gensim
import pandas as pd
import nltk as nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora


In [2]:
# NOTE(review): rs is presumably a random-state seed; it is not referenced in
# any of the visible cells -- confirm it is used further down or remove it.
rs = 123
# Fetch the NLTK resources used by this notebook: tokenizer models, the
# stopword corpus and the POS tagger. The last call is also the cell's final
# expression, so its return value is what the cell displays.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
# BoW features: two toy course descriptions used as the example corpus
courses = [
    "this is an introduction data science course which introduces data science to beginners",
    "machine learning for beginners",
]
# Keep the individual descriptions available under their own names as well
course1, course2 = courses
courses

['this is an introduction data science course which introduces data science to beginners',
 'machine learning for beginners']

In [4]:
# Split every course description into a list of word tokens
tokenized_courses = list(map(word_tokenize, courses))
tokenized_courses

[['this',
  'is',
  'an',
  'introduction',
  'data',
  'science',
  'course',
  'which',
  'introduces',
  'data',
  'science',
  'to',
  'beginners'],
 ['machine', 'learning', 'for', 'beginners']]

In [5]:
# Build a token dictionary (token -> integer id) covering both courses
tokens_dict = corpora.Dictionary(documents=tokenized_courses)
print(tokens_dict.token2id)

{'an': 0, 'beginners': 1, 'course': 2, 'data': 3, 'introduces': 4, 'introduction': 5, 'is': 6, 'science': 7, 'this': 8, 'to': 9, 'which': 10, 'for': 11, 'learning': 12, 'machine': 13}


In [7]:
# Turn each tokenized course into its bag-of-words vector:
# a list of (token id, count) pairs
courses_bow = list(map(tokens_dict.doc2bow, tokenized_courses))
courses_bow

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 2),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 1),
  (9, 1),
  (10, 1)],
 [(1, 1), (11, 1), (12, 1), (13, 1)]]

In [10]:
# Print each token and its count, one course at a time
for course_idx, course_bow in enumerate(courses_bow):
    print(f"Bag of words for course {course_idx}:")
    # Each BoW entry is a (token index, word count) pair
    for token_index, token_bow in course_bow:
        # Map the integer index back to its token string via the dictionary
        token = tokens_dict.get(token_index)
        print(f"--Token: '{token}', Count:{token_bow}")

Bag of words for course 0:
--Token: 'an', Count:1
--Token: 'beginners', Count:1
--Token: 'course', Count:1
--Token: 'data', Count:2
--Token: 'introduces', Count:1
--Token: 'introduction', Count:1
--Token: 'is', Count:1
--Token: 'science', Count:2
--Token: 'this', Count:1
--Token: 'to', Count:1
--Token: 'which', Count:1
Bag of words for course 1:
--Token: 'beginners', Count:1
--Token: 'for', Count:1
--Token: 'learning', Count:1
--Token: 'machine', Count:1


In [12]:
# BoW dimensionality reduction: collect NLTK's English stopwords into a set
# so membership checks against it are cheap
stop_words = set()
stop_words.update(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [13]:
# Tokens of the first course (index 0), displayed as the cell's output
tokenized_courses[0]

['this',
 'is',
 'an',
 'introduction',
 'data',
 'science',
 'course',
 'which',
 'introduces',
 'data',
 'science',
 'to',
 'beginners']

In [15]:
# Remove English stopwords from the first course's tokens. Each token is
# lower-cased only for the membership test; kept tokens retain their original
# casing and order, and repeated tokens are preserved.
processed_tokens = [w for w in tokenized_courses[0] if w.lower() not in stop_words]
processed_tokens

['introduction',
 'data',
 'science',
 'course',
 'introduces',
 'data',
 'science',
 'beginners']

In [16]:
# Part-of-speech tag every token of the first course (unfiltered token list);
# the result is a list of (token, tag) pairs
tags = nltk.pos_tag(tokens=tokenized_courses[0])
tags

[('this', 'DT'),
 ('is', 'VBZ'),
 ('an', 'DT'),
 ('introduction', 'NN'),
 ('data', 'NNS'),
 ('science', 'NN'),
 ('course', 'NN'),
 ('which', 'WDT'),
 ('introduces', 'VBZ'),
 ('data', 'NNS'),
 ('science', 'NN'),
 ('to', 'TO'),
 ('beginners', 'NNS')]