In [1]:
# Uncomment to install the dependencies into this kernel's environment:
# %pip install gensim
# %pip install pyLDAvis

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\MORNING
[nltk_data]     SHIFT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import numpy as np
import json
import glob

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Spacy
import spacy
from nltk.corpus import stopwords

# Visualization
import pyLDAvis
import pyLDAvis.gensim

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [4]:
def load_data(file):
    """Read a UTF-8 text file and return its contents as a list of lines.

    Args:
        file: Path of the text file to open.

    Returns:
        A list of strings, one per line of the file, with line terminators
        removed. Blank lines in the file appear as empty strings.
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    lines = contents.splitlines()
    return lines


In [5]:
# Import the corpus reader under an alias so that binding the word list to
# `stopwords` below does not destroy the object this cell needs: without the
# alias, a second run of this cell would call `.words` on a list and fail.
from nltk.corpus import stopwords as nltk_stopwords

# English stop-word list; kept under the name `stopwords` for later cells.
stopwords = nltk_stopwords.words("english")
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
# Location of the raw sample text (absolute path on the author's machine).
sample_path = r"C:\Users\MORNING SHIFT\Downloads\sample.txt"

# One list entry per line of the file; preview at most the first 50 entries.
data = load_data(sample_path)
print(data[:50])

['Machine learning is a subfield of artificial intelligence, which is broadly defined as the capability of a machine to imitate intelligent human behavior. Artificial intelligence systems are used to perform complex tasks in a way that is similar to how humans solve problems.', '', 'A carpenter is responsible for working with wood to build and repair structures and other items and they also may have to select and source appropriate materials depending on project needs. Their duties include calculating quotes, meeting with clients and creating design plans for remodeling projects.', '', 'Artificial developers design, Machine learning is a subfield of artificial intelligence, which is broadly defined as the capability of a machine to imitate intelligent human behavior. Artificial intelligence systems are used to perform complex tasks in a way that is similar to how humans solve problems. They also help build software systems that power networks and devices and ensure that those systems r

In [7]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Reduce each text to the lemmas of tokens with an allowed part of speech.

    The spaCy pipeline "en_core_web_sm" is loaded on every call, with the
    "parser" and "ner" components disabled.

    Args:
        texts: Iterable of strings to process.
        allowed_postags: Coarse part-of-speech tags (compared against
            ``token.pos_``) whose tokens are kept. The default list is only
            read, never modified.

    Returns:
        A list with one string per input text: the kept lemmas joined by a
        single space. A text with no kept tokens yields an empty string.
    """
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    def _kept_lemmas(text):
        # Lemmas of the tokens that pass the part-of-speech filter, in order.
        return " ".join(
            token.lemma_ for token in nlp(text) if token.pos_ in allowed_postags
        )

    return [_kept_lemmas(text) for text in texts]

# Run the lemmatizer over every line loaded from the input file
lemmatized_texts = lemmatization(data)

# Short preview of the first document, followed by the full result
first_doc_preview = lemmatized_texts[0][:90]
print(first_doc_preview)
print(lemmatized_texts)

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


machine learning subfield artificial intelligence broadly define capability machine imitat
['machine learning subfield artificial intelligence broadly define capability machine imitate intelligent human behavior artificial intelligence system use perform complex task way similar human solve problem', '', 'carpenter responsible work wood build repair structure other item also select source appropriate material depend project need duty include calculate quote meet client create design plan remodeling project', '', 'artificial developer design learning subfield artificial intelligence broadly define capability machine imitate intelligent human behavior artificial intelligence system use perform complex task way similar human solve problem also help build software system power network device ensure system remain functional', '', 'simple way understand relate other broad concept enable machine system sense reason act adapt human application allow machine extract knowledge datum learn autono

In [8]:
def gen_words(texts):
    """Tokenize each text with gensim's ``simple_preprocess``.

    Args:
        texts: Iterable of strings to tokenize.

    Returns:
        A list with one token list per input text, in input order.
        ``deacc=True`` is passed to ``simple_preprocess`` for every text.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

# Tokenize every lemmatized document into a list of words
data_words = gen_words(lemmatized_texts)
# Shorter preview alternative (first 20 tokens of the first document):
#print(data_words[0][:20])
# Prints the token lists for all documents
print(data_words)

[['machine', 'learning', 'subfield', 'artificial', 'intelligence', 'broadly', 'define', 'capability', 'machine', 'imitate', 'intelligent', 'human', 'behavior', 'artificial', 'intelligence', 'system', 'use', 'perform', 'complex', 'task', 'way', 'similar', 'human', 'solve', 'problem'], [], ['carpenter', 'responsible', 'work', 'wood', 'build', 'repair', 'structure', 'other', 'item', 'also', 'select', 'source', 'appropriate', 'material', 'depend', 'project', 'need', 'duty', 'include', 'calculate', 'quote', 'meet', 'client', 'create', 'design', 'plan', 'remodeling', 'project'], [], ['artificial', 'developer', 'design', 'learning', 'subfield', 'artificial', 'intelligence', 'broadly', 'define', 'capability', 'machine', 'imitate', 'intelligent', 'human', 'behavior', 'artificial', 'intelligence', 'system', 'use', 'perform', 'complex', 'task', 'way', 'similar', 'human', 'solve', 'problem', 'also', 'help', 'build', 'software', 'system', 'power', 'network', 'device', 'ensure', 'system', 'remain', 

In [9]:
# Map every unique token in the tokenized documents to an integer id
id2word = corpora.Dictionary(data_words)

# Bag-of-words form of each document: a list of (token_id, count) pairs
corpus = [id2word.doc2bow(tokens) for tokens in data_words]
print(corpus)

# Sanity check: show the token stored under id 0
word = id2word[0]
print(word)

[[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 2), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1)], [], [(21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 2), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1)], [], [(0, 3), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 3), (18, 1), (19, 1), (20, 1), (21, 1), (23, 1), (29, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1)], [], [(6, 1), (11, 2), (17, 1), (20, 1), (36, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1)], [], [(21, 1), (74, 1), (75, 1), (76, 1), (77, 2), (78, 2), (79, 1), (80, 

In [10]:
# Training settings for the LDA model, gathered in one place
lda_params = {
    "num_topics": 2,
    "random_state": 100,  # fixed seed
    "update_every": 1,
    "chunksize": 100,
    "passes": 10,
    "alpha": "auto",
}

# Fit the topic model on the bag-of-words corpus
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    **lda_params,
)

In [11]:
# Render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()

# Prepare the interactive topic view from the fitted model
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
    mds="mmds",
    R=50,
)
vis