# Imports
___

In [1]:
!pip install pyLDAvis



In [2]:
# `from __future__` imports belong at the very top of the module/cell
# (it is a no-op on Python 3 but is kept for backwards compatibility).
from __future__ import print_function

# Standard library
import collections
import string
from importlib import reload  # `imp` is deprecated and removed in Python 3.12
from pprint import pprint
from warnings import simplefilter

# Ignore all FutureWarnings; set before the third-party imports below so that
# import-time warnings are silenced as well.
simplefilter(action='ignore', category=FutureWarning)

# Data handling
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap, TwoSlopeNorm
import seaborn as sns

# scikit-learn
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.cluster import DBSCAN, KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# NLTK
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords

# Topic-model visualisation
import pyLDAvis
import pyLDAvis.sklearn


Warnings Code:
https://machinelearningmastery.com/how-to-fix-futurewarning-messages-in-scikit-learn/

# Bring in Cleaned Dataset
___

In [3]:
# Load the cleaned articles dataset (path is relative to the notebook's working directory).
articles = pd.read_csv('Datasets/articles_clean.csv')

In [4]:
# Preview the first rows to sanity-check the load.
articles.head()

Unnamed: 0,abstract,snippet,lead_paragraph,headline,keywords,pub_date,news_desk,section_name,type_of_material,word_count,uri,all_text,overall_sentiment
0,"The United States, supported by Britain, today...","The United States, supported by Britain, today...","The United States, supported by Britain, toda...","""U.S. and Britain Clearing Way for ""Relentless...",The United States Taliban Rhode Television Te...,2001-10-07 05:00:00+00:00,International,World,News,1333,nyt://article/02612e18-76c0-5447-bace-71473568...,"The United States, supported by Britain, today...",-0.9648
1,"Government can be expected to try to control, ...","Government can be expected to try to control, ...","IT will be, Americans have already been warned...","""The World",World Trade Center (NYC) Airlines and Airplane...,2001-10-07 05:00:00+00:00,Week in Review Desk,Week in Review,News,1322,nyt://article/0abd157f-d390-5787-bf89-4e87dad4...,"Government can be expected to try to control, ...",-0.9761
2,Richard B Woodward interviews photographer Edw...,Richard B Woodward interviews photographer Edw...,THE photographer Edward Grazda first visited A...,"""Art/Architecture; Images of Afghanistan, Befo...",Afghanistan Grazda Woodward Photography,2001-10-07 05:00:00+00:00,Arts and Leisure Desk,Arts,Interview,1638,nyt://article/15581e37-54ea-5aa4-bc80-34588ab1...,Richard B Woodward interviews photographer Edw...,-0.6815
3,"Violence Policy Center, Washington-based group...","Violence Policy Center, Washington-based group...","In the late 1980's, an American-based agent fo...","""In 80""s, Afghan Militias Used U.S. Rifles""",New York City Russia Washington (DC) Afghanist...,2001-10-07 05:00:00+00:00,National Desk,U.S.,News,603,nyt://article/17f9cbdb-ff36-5df1-8a37-5c02edbb...,"Violence Policy Center, Washington-based group...",-0.9661
4,Sending bright flashes of light and loud concu...,Sending bright flashes of light and loud concu...,"TOPDARA, Afghanistan, Oct. 7 — Sending bright...","""Thunderous Attack Heard in Kabul""",Afghanistan Taliban Dostum United States Polit...,2001-10-07 05:00:00+00:00,International,World,News,611,nyt://article/1a2a8209-9dd4-501b-a988-0c2adfc9...,Sending bright flashes of light and loud concu...,0.9022


In [5]:
# (rows, columns) of the loaded dataset.
articles.shape

(37398, 13)

# Topic Clustering with LDA
___

### Add overemphasized words to stopwords

In [8]:
# Start from NLTK's English stop words, then add terms that showed up as
# overemphasized in this corpus so they do not dominate the topics.
new_stop_words = stopwords.words('english')
new_stop_words.extend((
    'administration',
    'says',
    'said',
    'photo',
    'one',
    'two',
))

### Create Class for Easy LDA Modeling

In [9]:
class lda_model:
    """Convenience wrapper: TF-IDF vectorisation followed by LDA topic modelling.

    Parameters
    ----------
    corpus : iterable of str
        The documents to model; passed unchanged to the vectorizer in ``fit``.
    topic_count : int
        Number of LDA components (topics) to fit.

    Notes
    -----
    ``fit`` reads the module-level ``new_stop_words`` list, so that name must be
    defined before ``fit`` is called. Call ``fit`` before any other method.
    """

    def __init__(self, corpus, topic_count):
        self.text = corpus
        self.topic_count = topic_count

    def fit(self):
        """Vectorise the corpus with TF-IDF and fit the LDA model on it.

        Populates ``vectorizer``, ``tfidf_model`` (document-term matrix),
        ``lda``, ``lda_matrix`` (document-topic matrix), ``lda_components``
        (topic-term weights) and ``terms`` (vocabulary).
        """
        # Keep only terms appearing in at least 10% and at most 50% of documents.
        self.vectorizer = TfidfVectorizer(tokenizer=None,
                                          stop_words=new_stop_words,
                                          max_df=0.5,
                                          min_df=0.1,
                                          lowercase=True)

        # Document-term matrix.
        self.tfidf_model = self.vectorizer.fit_transform(self.text)

        # Fixed random_state keeps the fitted topics reproducible between runs.
        self.lda = LatentDirichletAllocation(n_components=self.topic_count,
                                             random_state=0)
        self.lda_matrix = self.lda.fit_transform(self.tfidf_model)
        self.lda_components = self.lda.components_
        self.terms = self.vectorizer.get_feature_names_out()

    def topics(self, n_terms=7):
        """Print the highest-weighted terms of every topic.

        Parameters
        ----------
        n_terms : int, default 7
            How many top terms to list per topic.

        Returns
        -------
        None
            The report is printed, not returned; do not wrap the call in
            ``print()``.
        """
        # Use the vocabulary stored by fit(): `get_feature_names()` was removed
        # from scikit-learn in 1.2, `get_feature_names_out()` is its replacement.
        report_lines = []
        for index, component in enumerate(self.lda_components):
            ranked = sorted(zip(self.terms, component),
                            key=lambda pair: pair[1], reverse=True)
            top_terms_list = [term for term, _ in ranked[:n_terms]]
            report_lines.append(f"Topic {index}: {top_terms_list}\n")
        print(''.join(report_lines))

    def viz(self):
        """Return ``(lda, document-term matrix, vectorizer)`` for visualisation."""
        return self.lda, self.tfidf_model, self.vectorizer

    def components(self):
        """Return the fitted topic-term weight matrix (``lda.components_``)."""
        return self.lda_components

    def columns(self):
        """Return, for each document, the index of its highest-weighted topic.

        Reuses the document-topic matrix computed in ``fit`` instead of running
        the LDA transform over the whole corpus a second time.
        """
        return list(self.lda_matrix.argmax(axis=1))
        

Reference that helped with returning topics: https://machinelearninggeek.com/latent-dirichlet-allocation-using-scikit-learn/

### Use for loop to determine the ideal number of topics

In [10]:
# Fit one model per candidate topic count (2-12) and print each model's top
# terms so the topic sets can be compared by eye.
for num in range(2, 13):
    print(f'Topics Count: {num}')
    model = lda_model(articles['all_text'], num)
    model.fit()
    # topics() prints its own report and returns None; wrapping it in print()
    # emitted a stray "None" after every model.
    model.topics()
    print()

Topics Count: 2
Topic 0: ['afghanistan', 'president', 'military', 'united', 'washington', 'taliban', 'states']
Topic 1: ['new', 'war', 'american', 'afghan', 'officials', 'government', 'iraq']

None 

Topics Count: 3
Topic 0: ['president', 'military', 'washington', 'taliban', 'afghanistan', 'american', 'afghan']
Topic 1: ['new', 'afghan', 'american', 'officials', 'government', 'afghanistan', 'taliban']
Topic 2: ['afghanistan', 'war', 'iraq', 'united', 'states', 'american', 'military']

None 

Topics Count: 4
Topic 0: ['president', 'military', 'washington', 'american', 'officials', 'afghanistan', 'taliban']
Topic 1: ['new', 'afghan', 'officials', 'american', 'government', 'taliban', 'afghanistan']
Topic 2: ['afghanistan', 'war', 'taliban', 'afghan', 'american', 'military', 'government']
Topic 3: ['iraq', 'united', 'states', 'american', 'war', 'washington', 'military']

None 

Topics Count: 5
Topic 0: ['american', 'afghan', 'taliban', 'afghanistan', 'military', 'officials', 'washington']


#### With 5 topics 
1. Seems to be US Military Control in Afghanistan
2. Seems to relate more to Government Centric Updates
3. Seems to address General Public Opinion
4. White House Press Releases
5. Military Press Releases

#### Questions to ask:
1. Who (groups or individuals) would the sentiment of each topic represent?
2. How does the sentiment of these groups change?
3. What does it mean if the sentiments stay the same, and how is the project still useful?

In [6]:
# Inspect the full text of row label 2, one of the articles containing 'one'.
contains_one = articles['all_text'].str.contains('one')
articles.loc[contains_one, 'all_text'][2]

'Richard B Woodward interviews photographer Edward Grazda, who comments on his work photographing recent history of Afghanistan; photo (M) - Richard B Woodward interviews photographer Edward Grazda, who comments on his work photographing recent history of Afghanistan; photo (M) - THE photographer Edward Grazda first visited Afghanistan in 1980, a year after the Russian entry into its civil war. During more than 15 subsequent trips over the next 20 years, he has amassed one of the most comprehensive, if personal, archives on the recent historical travails of the Afghan people. - "Art/Architecture; Images of Afghanistan, Before They Were Banned"'

In [7]:
# Inspect the full text of row label 37382, one of the articles containing 'new'.
contains_new = articles['all_text'].str.contains('new')
articles.loc[contains_new, 'all_text'][37382]

'As American troops rush to complete their withdrawal by President Biden’s Tuesday deadline, many Afghans are afraid that reprisals from the country’s new rulers will soon follow. -  - As American troops rush to complete their withdrawal by President Biden’s Tuesday deadline, many Afghans are afraid that reprisals from the country’s new rulers will soon follow. - "Some Afghans say that enemies of the Taliban have begun to disappear."'

### Visualize Topic Clusters

In [11]:
# Have pyLDAvis render its visualisations inline in the notebook.
pyLDAvis.enable_notebook()

Help adapting the code to scikit-learn came from: https://github.com/bmabey/pyLDAvis/blob/master/notebooks/sklearn.ipynb

In [12]:
# Fit the final model with the chosen topic count (5) and unpack the pieces
# needed for visualisation: fitted LDA, document-term matrix, and vectorizer.
model = lda_model(articles['all_text'], 5)
model.fit()
lda_tf, dtm_tf, tf_vectorizer = model.viz()

In [13]:
# Print the top terms for each of the 5 topics.
model.topics()

Topic 0: ['american', 'afghan', 'taliban', 'afghanistan', 'military', 'officials', 'washington']
Topic 1: ['officials', 'government', 'afghan', 'american', 'afghanistan', 'washington', 'new']
Topic 2: ['war', 'afghanistan', 'afghan', 'american', 'military', 'iraq', 'united']
Topic 3: ['new', 'united', 'states', 'washington', 'american', 'president', 'military']
Topic 4: ['president', 'iraq', 'military', 'washington', 'american', 'war', 'afghanistan']



In [14]:
%matplotlib inline

# Build the interactive pyLDAvis view from the fitted model, the document-term
# matrix and the vectorizer; the bare last expression displays it in the notebook.
model_viz = pyLDAvis.sklearn.prepare(lda_model =lda_tf, dtm=dtm_tf, vectorizer=tf_vectorizer)
model_viz

In [15]:
# Save the visualisation as an HTML file.
pyLDAvis.save_html(model_viz, 'Visualizations/lda.html')

## Adding LDA Topic clusters to DataFrame
___

In [16]:
# Label every article with its dominant LDA topic (0-based component index).
# Build the Series on articles.index: column assignment aligns on index labels,
# so a default RangeIndex would silently yield NaN/misaligned rows if `articles`
# were ever filtered or re-indexed before this cell.
articles['topic_cluster'] = pd.Series(model.columns(), index=articles.index)
articles.head()

Unnamed: 0,abstract,snippet,lead_paragraph,headline,keywords,pub_date,news_desk,section_name,type_of_material,word_count,uri,all_text,overall_sentiment,topic_cluster
0,"The United States, supported by Britain, today...","The United States, supported by Britain, today...","The United States, supported by Britain, toda...","""U.S. and Britain Clearing Way for ""Relentless...",The United States Taliban Rhode Television Te...,2001-10-07 05:00:00+00:00,International,World,News,1333,nyt://article/02612e18-76c0-5447-bace-71473568...,"The United States, supported by Britain, today...",-0.9648,3
1,"Government can be expected to try to control, ...","Government can be expected to try to control, ...","IT will be, Americans have already been warned...","""The World",World Trade Center (NYC) Airlines and Airplane...,2001-10-07 05:00:00+00:00,Week in Review Desk,Week in Review,News,1322,nyt://article/0abd157f-d390-5787-bf89-4e87dad4...,"Government can be expected to try to control, ...",-0.9761,2
2,Richard B Woodward interviews photographer Edw...,Richard B Woodward interviews photographer Edw...,THE photographer Edward Grazda first visited A...,"""Art/Architecture; Images of Afghanistan, Befo...",Afghanistan Grazda Woodward Photography,2001-10-07 05:00:00+00:00,Arts and Leisure Desk,Arts,Interview,1638,nyt://article/15581e37-54ea-5aa4-bc80-34588ab1...,Richard B Woodward interviews photographer Edw...,-0.6815,2
3,"Violence Policy Center, Washington-based group...","Violence Policy Center, Washington-based group...","In the late 1980's, an American-based agent fo...","""In 80""s, Afghan Militias Used U.S. Rifles""",New York City Russia Washington (DC) Afghanist...,2001-10-07 05:00:00+00:00,National Desk,U.S.,News,603,nyt://article/17f9cbdb-ff36-5df1-8a37-5c02edbb...,"Violence Policy Center, Washington-based group...",-0.9661,0
4,Sending bright flashes of light and loud concu...,Sending bright flashes of light and loud concu...,"TOPDARA, Afghanistan, Oct. 7 — Sending bright...","""Thunderous Attack Heard in Kabul""",Afghanistan Taliban Dostum United States Polit...,2001-10-07 05:00:00+00:00,International,World,News,611,nyt://article/1a2a8209-9dd4-501b-a988-0c2adfc9...,Sending bright flashes of light and loud concu...,0.9022,0


In [17]:
# Confirm the new column was added (one more column than after loading).
articles.shape

(37398, 14)

In [18]:
# Count missing values per column; topic_cluster should have none.
articles.isna().sum()

abstract               866
snippet               4534
lead_paragraph         957
headline                 0
keywords             10348
pub_date                 0
news_desk             2456
section_name            11
type_of_material      1494
word_count               0
uri                      0
all_text                 0
overall_sentiment        0
topic_cluster            0
dtype: int64

In [19]:
# Shift topic labels from 0-based to 1-based for presentation.
# `assign` returns a new DataFrame, so `articles` is left untouched and this
# cell is idempotent (a plain `articles_to_viz = articles` is only an alias,
# and re-running the cell would have incremented the labels again).
articles_to_viz = articles.assign(topic_cluster=articles['topic_cluster'] + 1)

In [20]:
# Preview the frame that will be exported for visualisation.
articles_to_viz.head()

Unnamed: 0,abstract,snippet,lead_paragraph,headline,keywords,pub_date,news_desk,section_name,type_of_material,word_count,uri,all_text,overall_sentiment,topic_cluster
0,"The United States, supported by Britain, today...","The United States, supported by Britain, today...","The United States, supported by Britain, toda...","""U.S. and Britain Clearing Way for ""Relentless...",The United States Taliban Rhode Television Te...,2001-10-07 05:00:00+00:00,International,World,News,1333,nyt://article/02612e18-76c0-5447-bace-71473568...,"The United States, supported by Britain, today...",-0.9648,4
1,"Government can be expected to try to control, ...","Government can be expected to try to control, ...","IT will be, Americans have already been warned...","""The World",World Trade Center (NYC) Airlines and Airplane...,2001-10-07 05:00:00+00:00,Week in Review Desk,Week in Review,News,1322,nyt://article/0abd157f-d390-5787-bf89-4e87dad4...,"Government can be expected to try to control, ...",-0.9761,3
2,Richard B Woodward interviews photographer Edw...,Richard B Woodward interviews photographer Edw...,THE photographer Edward Grazda first visited A...,"""Art/Architecture; Images of Afghanistan, Befo...",Afghanistan Grazda Woodward Photography,2001-10-07 05:00:00+00:00,Arts and Leisure Desk,Arts,Interview,1638,nyt://article/15581e37-54ea-5aa4-bc80-34588ab1...,Richard B Woodward interviews photographer Edw...,-0.6815,3
3,"Violence Policy Center, Washington-based group...","Violence Policy Center, Washington-based group...","In the late 1980's, an American-based agent fo...","""In 80""s, Afghan Militias Used U.S. Rifles""",New York City Russia Washington (DC) Afghanist...,2001-10-07 05:00:00+00:00,National Desk,U.S.,News,603,nyt://article/17f9cbdb-ff36-5df1-8a37-5c02edbb...,"Violence Policy Center, Washington-based group...",-0.9661,1
4,Sending bright flashes of light and loud concu...,Sending bright flashes of light and loud concu...,"TOPDARA, Afghanistan, Oct. 7 — Sending bright...","""Thunderous Attack Heard in Kabul""",Afghanistan Taliban Dostum United States Polit...,2001-10-07 05:00:00+00:00,International,World,News,611,nyt://article/1a2a8209-9dd4-501b-a988-0c2adfc9...,Sending bright flashes of light and loud concu...,0.9022,1


In [21]:
# Write next to the input data. The directory name matches the 'Datasets/'
# folder the input was read from; the previous lowercase 'datasets/' only
# resolves on case-insensitive filesystems.
# NOTE(review): the row index is written as an unnamed first column; pass
# index=False if downstream readers do not expect it.
articles_to_viz.to_csv('Datasets/articles_to_viz.csv')