In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import json
import langid


In [2]:
# imports for analysis
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim
from IPython.core.display import HTML

# NOTES !
# You might need to run these for spaCy to work!
# !pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
# ensure you have the latest version of PANDAS!


In [3]:
# 1. Data Collection --------------------------------------------------------------------------------

# Loading json
with open('Science_Technology_News.json', encoding='utf-8') as f:
    data = json.load(f)

# Main dataframe
df = pd.json_normalize(data)

# Printing Main dataframe structure
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5437 entries, 0 to 5436
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   title             5437 non-null   object
 1   link              5437 non-null   object
 2   keywords          2502 non-null   object
 3   creator           2610 non-null   object
 4   video_url         0 non-null      object
 5   description       4936 non-null   object
 6   content           1627 non-null   object
 7   pubDate           5437 non-null   object
 8   full_description  2670 non-null   object
 9   image_url         2169 non-null   object
 10  source_id         5437 non-null   object
dtypes: object(11)
memory usage: 467.4+ KB
None


In [4]:
# 2. Information Extraction -----------------------------------------------------------------

# Extracting relevant fields
df = df[['title', 'pubDate', 'creator', 'content']].copy()

# Filtering out non-English articles ------------------------------------------
def englishfilter(text):
    if pd.isna(text):
        return "other"
    else:
        return langid.classify(text)[0]
df['language'] = df['content'].apply(englishfilter)
df = df[df['language'] == 'en']
df.drop('language', axis=1, inplace=True)
#-----------------------------------------------------------------------------

# Preprocessing
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    if text:
        tokens = tokenizer.tokenize(text.lower())
        filtered_tokens = [word for word in tokens if word not in stop_words]
        stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
        return " ".join(stemmed_tokens)
    return ""

def preprocess_lemmatize(text):
    if text:
        tokens = tokenizer.tokenize(text.lower())
        filtered_tokens = [word for word in tokens if word not in stop_words]
        lemmed_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
        return " ".join(lemmed_tokens)
    return ""

# Applying preprocessing to content field
df['processed_content_stemmed'] = df['content'].apply(preprocess)

# Applying preprocessing to content field
df['processed_content_lemmed'] = df['content'].apply(preprocess_lemmatize)


In [5]:
# Printing Main dataframe structure
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 565 entries, 19 to 5423
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   title                      565 non-null    object
 1   pubDate                    565 non-null    object
 2   creator                    486 non-null    object
 3   content                    565 non-null    object
 4   processed_content_stemmed  565 non-null    object
 5   processed_content_lemmed   565 non-null    object
dtypes: object(6)
memory usage: 30.9+ KB
None


In [6]:
# Printing preprocessed content
print(df['processed_content_stemmed'].head())
print(df['processed_content_lemmed'].head())

19    african automot compani autochek secur 13 1 mi...
25    right schedul spacex roll new crew dragon spac...
26    jeff bezo offici announc plan space ventur blu...
68    hasbro offici partner worldwid asset exchang w...
80    new seri cyclingnew delv back catalogu profess...
Name: processed_content_stemmed, dtype: object
19    african automotive company autochek secured 13...
25    right schedule spacex rolled new crew dragon s...
26    jeff bezos officially announced plan space ven...
68    hasbro officially partnered worldwide asset ex...
80    new series cyclingnews delving back catalogue ...
Name: processed_content_lemmed, dtype: object


In [7]:
## Topic Modeling ----------------------------------------------------------------------------------------------------------

# Restructuring of the data is nessecary 
# will be using the lammetized data (better for topic modeling)
# will be dropping all Nan values for data uniformity
# will be combining titles, dates, etc for training

unframed_data = df.drop(["content", "processed_content_stemmed"], axis=1).dropna().values.tolist()

# set up text_corpus
text_corpus = []

for article in unframed_data:
    new_article = []
    new_article.append(article[0].split(" "))
    new_article = new_article + article[1].split(" ")
    new_article = new_article + article[2][0].split(" ")
    new_article = new_article[0] + article[3].split(" ")
    text_corpus.append(new_article)

# suppressed, for debugging purposes only
# print(text_corpus[0])

In [8]:
# Perform mapping of ID to words
id2word = corpora.Dictionary(text_corpus)

corpus = []
for article in text_corpus:
    new = id2word.doc2bow(article)
    corpus.append(new)

print (corpus[0][0:20])

word = id2word[[0][:1][0]]
print (word)

[(0, 1), (1, 2), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 2), (11, 1), (12, 1), (13, 2), (14, 1), (15, 1), (16, 1), (17, 2), (18, 1), (19, 2)]
$13.1M


In [9]:
# Actual Topic Model using LDA

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=30,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

In [10]:
## Topic Visualization
pyLDAvis.enable_notebook(local=True)
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

  head(R).drop('saliency', 1)
