In [2]:
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [4]:
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [31]:
import re

In [3]:
# Fetch the NLTK resources this notebook relies on.
nltk.download('punkt')      # tokenizer models used by nltk.word_tokenize
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords
nltk.download('wordnet')    # WordNet data backing WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
# One shared lemmatizer instance; tokenize() calls it for every token.
lemmatizer = WordNetLemmatizer()

In [6]:
# Peek at the first lines of the raw input file (one book title per line).
!head all_book_titles.txt

Philosophy of Sex and Love A Reader
Readings in Judaism, Christianity, and Islam
Microprocessors Principles and Applications
Bernhard Edouard Fernow: Story of North American Forestry
Encyclopedia of Buddhism
Motorola Microprocessor Family: 68000, 68008, 68010, 68020, 68030, and 68040, Programming and Interfacing with Applications
American Anthem: Student Edition Modern Era 2007
How to Read Literature Like a Professor A Lively and Entertaining Guide to Reading Between the Lines
Men Are from Mars, Women Are from Venus Secrets of Great Sex, Improving Communication, Lasting Intimacy and Fulfillment, Giving and Receiving Love, Secrets of Passion, Understanding Martian
Religious Traditions of the World A Journey Through Africa, Mesoamerica, North America, Judaism, Christianity, Islam, Hinduism, Buddhism, China, an


In [38]:
# Open the titles file in text mode for reading.
# NOTE(review): nothing in this cell closes the handle.
files = open('all_book_titles.txt', 'r')

In [39]:
# Load one title per line. The file is (re)opened here under a context manager
# so the handle is always closed and the cell can be re-run safely: iterating an
# already-consumed handle would silently produce an empty list.
with open('all_book_titles.txt', 'r') as files:
    # Strip only the line terminator; any other whitespace is left untouched.
    books = [line.rstrip('\n') for line in files]
books

['Philosophy of Sex and Love A Reader',
 'Readings in Judaism, Christianity, and Islam',
 'Microprocessors Principles and Applications',
 'Bernhard Edouard Fernow: Story of North American Forestry',
 'Encyclopedia of Buddhism',
 'Motorola Microprocessor Family: 68000, 68008, 68010, 68020, 68030, and 68040, Programming and Interfacing with Applications',
 'American Anthem: Student Edition Modern Era 2007',
 'How to Read Literature Like a Professor A Lively and Entertaining Guide to Reading Between the Lines',
 'Men Are from Mars, Women Are from Venus Secrets of Great Sex, Improving Communication, Lasting Intimacy and Fulfillment, Giving and Receiving Love, Secrets of Passion, Understanding Martian',
 'Religious Traditions of the World A Journey Through Africa, Mesoamerica, North America, Judaism, Christianity, Islam, Hinduism, Buddhism, China, an',
 "World's Wisdom Sacred Texts of the World's Religions",
 "Illustrated World's Religions A Guide to Our Wisdom Traditions",
 'Soul of Sex Cu

In [17]:
# Base stop-word set: NLTK's English list, held as a set for membership tests.
stops = set(stopwords.words('english'))

In [20]:
# Extend the base stop words with terms that are noise for this corpus.
# tokenize() checks tokens against this set after lemmatizing them.
stops = stops | {
    'access', 'application', 'approach', 'brief', 'card',
    'edition', 'entertaining', 'essential', 'etext',
    'fourth', 'fundamental', 'guide', 'introduction',
    'literature', 'package', 'philosophy', 'plus', 'printed',
    'second', 'series', 'third', 'vol', 'volume',
}

In [40]:
def tokenize(line, drop_numeric=False):
    """Turn one book title into a list of normalized, lemmatized tokens.

    Steps: lowercase and strip the title, split it with
    ``nltk.word_tokenize``, drop tokens of two characters or fewer, remove
    stop words, lemmatize, then remove stop words again.

    Stop words are filtered both before and after lemmatization. The
    lemmatizer can rewrite a stop word into a string that is no longer in
    ``stops`` (e.g. 'was' -> 'wa', 'has' -> 'ha'), so filtering only
    afterwards lets those fragments into the vocabulary. The second pass is
    still needed because a word's lemma may itself be a stop word
    (e.g. 'applications' -> 'application').

    Args:
        line: Raw title text.
        drop_numeric: When True, also discard every token that contains a
            digit. Defaults to False, which keeps such tokens.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    tokens = nltk.word_tokenize(line.lower().strip())
    # Length and stop-word checks run on the surface form first.
    tokens = [
        token for token in tokens
        if len(token) > 2 and token not in stops
    ]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Second pass: catch tokens whose lemma is a stop word.
    tokens = [token for token in tokens if token not in stops]
    if drop_numeric:
        tokens = [
            token for token in tokens
            if not any(char.isdigit() for char in token)
        ]
    return tokens

In [41]:
# Count-based bag of words using the custom tokenizer. token_pattern is set to
# None because it is ignored whenever a tokenizer is supplied; leaving it at its
# default makes scikit-learn warn about the unused parameter when fitting.
vectorizer = CountVectorizer(tokenizer=tokenize, token_pattern=None)

In [42]:
# Learn the vocabulary from the titles and build the sparse count matrix
# (one row per title, one column per vocabulary term).
X = vectorizer.fit_transform(books)



In [44]:
# Term-by-title matrix: rows are vocabulary terms, columns are titles.
# It is derived from the fitted vectorizer rather than from X itself, because
# `X = X.T` flips the orientation back every time the cell is re-run.
X = vectorizer.transform(books).T

In [45]:
# Show the matrix summary; after the transpose the shape is (terms, titles).
X

<2233x2373 sparse matrix of type '<class 'numpy.int64'>'
	with 10514 stored elements in Compressed Sparse Column format>

In [46]:
# Index -> term lookup: entry i is the term for feature i of the vectorizer.
idx_word_map = vectorizer.get_feature_names_out()

In [50]:
# Preview the learned vocabulary.
idx_word_map

array(["'90", "'the", '...', ..., 'zen', 'zionism', 'zurich'],
      dtype=object)

In [51]:
# Two latent dimensions, matching the 2-D scatter plot drawn from the result.
# n_components=2 is the library default, spelled out here for clarity.
# The default solver is randomized, so random_state is pinned to make the
# embedding (and any reading of the plot) reproducible across runs.
svd = TruncatedSVD(n_components=2, random_state=0)

In [52]:
# Fit the SVD on X and project it: one low-dimensional row per row of X.
Y = svd.fit_transform(X)

In [57]:
# Preview the projected coordinates.
Y

array([[ 0.01705822, -0.00351939],
       [ 0.0006342 ,  0.0015203 ],
       [ 0.0265129 , -0.01289354],
       ...,
       [ 0.00295964,  0.00410861],
       [ 0.00766142,  0.00341168],
       [ 0.00114753,  0.00237012]])

In [59]:
import plotly.express as px

In [63]:
# Scatter the terms in the 2-D SVD space, labelling each point with its term.
# A title and axis labels are set so the figure can be read on its own.
fig = px.scatter(
    x=Y[:, 0],
    y=Y[:, 1],
    text=idx_word_map,
    labels={'x': 'SVD component 1', 'y': 'SVD component 2'},
    title='Book-title vocabulary projected onto two SVD components',
)
fig.update_traces(textposition='top center')
fig.show()
# Observations from this run:
# - words related to science and tech sit on the leftmost side
# - words related to arts and history sit towards the top

In [None]:
#