In [78]:
from IPython.display import HTML

# Render an HTML/JavaScript snippet as this cell's output. The script defines
# code_toggle(), which hides or shows every `div.input` element, and registers
# it to run once when the document is ready; the trailing link calls it again
# on click. It relies on `$` (jQuery) being available in the page.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

In [77]:
import pandas as pd, numpy as np, glob, matplotlib.pyplot as plt, seaborn as sns, nltk, string, multiprocessing
from textblob import TextBlob
from collections import Counter
from multiprocessing import Pool
import itertools

In [2]:
%%file ta.py

import nltk 
from textblob import TextBlob
import re

class text_analysis:
    """Text helpers for cleaning a string, scoring polarity and counting words.

    Args:
        words: iterable of (already stemmed) words whose relative frequency
            `word_freq` reports. Only `word_freq` reads it.
    """

    def __init__(self, words):
        self.words = words

    def word_freq(self, commentBody):
        """Return the relative frequency of each tracked word in a text.

        The text is tokenized, stop words are dropped and the remaining tokens
        are stemmed. Each word in `self.words` is then counted among those
        stems and divided by the number of whitespace-separated words in the
        original text.

        Args:
            commentBody: the text to analyse.

        Returns:
            dict mapping ``"freq_<word>"`` to a float; ``0.0`` when the word
            does not occur or the text has no words.
        """
        n_words = len(commentBody.split())

        # Count every stem in a single pass instead of rescanning the token
        # list with list.count() for each tracked word.
        stem_counts = {}
        for t in nltk.word_tokenize(commentBody):
            if t not in stopwords:
                stem = stemmer.stem(t)
                stem_counts[stem] = stem_counts.get(stem, 0) + 1

        worddict = {}
        for word in self.words:
            hits = stem_counts.get(word, 0)
            if n_words == 0 or hits == 0:
                worddict[f"freq_{word}"] = float(0)
            else:
                worddict[f"freq_{word}"] = hits / n_words
        return worddict

    def preprocess(self, commentBody):
        """Lower-case a text, drop `<br/>` tags and punctuation, collapse whitespace.

        Args:
            commentBody: the raw text.

        Returns:
            The cleaned string. Leading/trailing whitespace is collapsed to a
            single space, not stripped.
        """
        commentBody = commentBody.lower()
        commentBody = commentBody.replace('<br/>', ' ')  # html line break -> space
        # remove every character that is neither a word character nor whitespace
        commentBody = re.sub(r'[^\w\s]', '', commentBody)
        # collapse each run of whitespace into a single space
        commentBody = re.sub(r'\s+', ' ', commentBody)
        return commentBody

    def polar(self, commentBody):
        """Return the TextBlob sentiment polarity of `commentBody`."""
        return TextBlob(commentBody).sentiment.polarity
    
def tok_stem(commentBody):
    """Tokenize `commentBody`, drop stop words and return the stemmed tokens.

    Uses the module-level `stemmer` and `stopwords` objects.
    """
    stems = []
    for token in nltk.word_tokenize(commentBody):
        if token in stopwords:
            continue  # stop words carry no signal, skip them
        stems.append(stemmer.stem(token))
    return stems
# Module-level objects shared by tok_stem and text_analysis.word_freq; they are
# created once, when ta.py is imported.
stemmer = nltk.stem.snowball.EnglishStemmer()
# English stop words; tokens found in this collection are discarded before stemming.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be installed - confirm.
stopwords = nltk.corpus.stopwords.words('english')

Overwriting ta.py


In [3]:
# Paths of every CSV in the working directory whose name starts with "Articles".
# NOTE(review): article_file is not read anywhere else in the visible cells.
article_file = glob.glob("Articles*.csv")
# Number of CPUs reported by the OS; used as the worker count for the Pools below.
cpu = multiprocessing.cpu_count()

In [4]:
# Load the April 2018 articles table. Later cells read its `headline` and
# `keywords` columns and add a `sentiment` column.
articles = pd.read_csv('ArticlesApril2018.csv')

In [85]:
# print(articles.keywords[200])
# print(articles.headline[200])
# print(articles.pubDate[200])

In [6]:
from ta import text_analysis

In [79]:
# `words` is only read by word_freq, which is not used in this notebook,
# so a placeholder value is passed.
TA = text_analysis(0)

# Clean every headline in parallel. The context manager shuts the worker
# processes down once map() has returned, instead of leaking the pool.
with Pool(processes=cpu) as pool:
    articles.headline = pool.map(TA.preprocess, articles.headline)

In [80]:
# Score the polarity of every headline in parallel; the pool is released as
# soon as the results are back rather than left running.
with Pool(processes=cpu) as pool:
    articles['sentiment'] = pool.map(TA.polar, articles.headline)

In [81]:
# The five articles with the highest headline sentiment score.
high_sent = articles.nlargest(n=5, columns='sentiment')
# high_sent['headline'].values

In [82]:
# The five articles with the lowest headline sentiment score.
low_sent = articles.nsmallest(n=5, columns='sentiment')
# low_sent['headline'].values

In [12]:
%%file sn.py

import itertools

def cln_kw(keywords):
    """Turn a stringified keyword list into every unordered pair of keywords.

    Square brackets and single quotes are removed, the remainder is split on
    commas and each piece is stripped of surrounding whitespace.

    Args:
        keywords: string such as "['a', 'b', 'c']".

    Returns:
        list of 2-tuples, one per combination of two keywords, in input order.
    """
    cleaned = keywords
    for unwanted in ('[', "'", ']'):
        cleaned = cleaned.replace(unwanted, '')
    terms = [term.strip() for term in cleaned.split(',')]
    return list(itertools.combinations(terms, 2))

Overwriting sn.py


In [13]:
from sn import cln_kw
import networkx as nx

In [14]:
# Keep only the two columns needed for the keyword network. Selecting them
# directly (instead of transposing a list of Series) keeps each column's own
# dtype, so `sentiment` stays numeric rather than being upcast to object.
sn = articles[['keywords', 'sentiment']].copy()

In [91]:
# Build the keyword pairs of every article in parallel (one list of pairs per
# article); the pool is closed once the results are collected.
with Pool(processes=cpu) as pool:
    pairs = pool.map(cln_kw, articles.keywords)

In [16]:
# Flatten the per-article lists into one list of (keyword, keyword) tuples.
# Only list elements are expanded, so re-running this cell on the already
# flattened result leaves the pair tuples intact instead of splitting them
# into single keywords.
pairs = [
    pair
    for group in pairs
    for pair in (group if isinstance(group, list) else [group])
]

In [17]:
# Undirected keyword graph: every keyword pair becomes an edge.
G = nx.Graph()
G.add_edges_from(pairs)
# Degree centrality of every keyword, as a {node: score} mapping.
cent = nx.degree_centrality(G)

In [83]:
import operator
# (keyword, centrality) tuples ordered from the most to the least central keyword.
sorted_cent = sorted(cent.items(), key=lambda node_score: node_score[1], reverse=True)
# sorted_cent

In [18]:
# For every keyword pair of every article, record the article's sentiment and a
# colour derived from its sign (red = negative, grey = neutral, green = positive).
# The keywords are split on commas exactly as cln_kw does, so one entry is
# produced per keyword combination.
sent, colours = [], []
for article in sn.itertuples():
    article_keywords = [kw.strip() for kw in article.keywords.split(',')]
    for _pair in itertools.combinations(article_keywords, 2):
        score = article.sentiment
        sent.append(score)
        colours.append(
            '#ffbdbd' if score < 0 else ('#afafaf' if score == 0 else '#e1f7d5')
        )

In [71]:
# print(len(sent))
# print(len(colours))
# print(len(pairs))

In [72]:
# One row per keyword pair: the two keywords, the colour and the sentiment of
# the article the pair came from. The columns are assigned individually rather
# than stacked into a single NumPy array, which would coerce every value
# (including the numeric sentiment) to a string.
df = pd.DataFrame(pairs, columns=['word', 'node'])
df['colours'] = colours
df['sentiment'] = sent
# df.head()

In [21]:
import networkx as nx
from bokeh.models import Range1d, Plot
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.models.graphs import from_networkx,NodesAndLinkedEdges
from bokeh.models import HoverTool, MultiLine,LassoSelectTool, WheelZoomTool, BoxSelectTool, Circle
import collections

In [22]:
# Configure Bokeh to render its output inline in this notebook.
output_notebook()

In [84]:
from random import sample
# Draw at most 1000 keyword pairs for plotting; capping at len(pairs) avoids the
# ValueError that random.sample raises when asked for more items than exist.
# NOTE(review): the sample is unseeded, so every run plots a different subset.
pairsample = sample(pairs, min(1000, len(pairs)))
#pairsample

In [50]:
def update_sampling_factor(x=1e-4):
    """Resample the pair table and push the current plot to the notebook.

    Args:
        x: fraction of the rows of the global `df` to sample (passed to
            `DataFrame.sample(frac=...)`).

    Reads the globals `df` and `plot_handle`; returns None.
    """
    # Imported here because the notebook-level bokeh imports do not include it.
    from bokeh.io import push_notebook

    sampleddf = df.sample(frac=x)
    # The second keyword of each pair is stored in the `node` column
    # (there is no `pair` column).
    sampleddf = sampleddf.assign(pairs=list(zip(sampleddf.word, sampleddf.node)))
    # NOTE(review): the sampled pairs are not yet written into the graph's data
    # sources, so the pushed plot is unchanged - confirm the intended update.
    push_notebook(handle=plot_handle)

In [51]:
# Graph of the sampled keyword pairs only (this rebinds G).
G = nx.Graph(pairsample)

# We could use figure here but don't want all the axes and titles
plot = figure(x_range=Range1d(-10000,10000), y_range=Range1d(-10000, 10000))

# Create a Bokeh graph from the NetworkX input using nx.spring_layout
graph = from_networkx(G, nx.spring_layout, scale=10000, center=(0,0))
plot.renderers.append(graph)

# Add a per-node colour column. `df` holds one row (and one colour) per keyword
# pair, so its columns cannot be assigned to the node data source directly: the
# lengths would not match the number of nodes. Instead each keyword gets the
# colour of the first pair it appears in (grey when it appears in none), and the
# node order produced by from_networkx is left untouched.
colour_by_keyword = {}
for first_kw, second_kw, pair_colour in zip(df['word'], df['node'], df['colours']):
    colour_by_keyword.setdefault(first_kw, pair_colour)
    colour_by_keyword.setdefault(second_kw, pair_colour)
node_names = list(graph.node_renderer.data_source.data['index'])
graph.node_renderer.data_source.data['colours'] = [
    colour_by_keyword.get(name, '#afafaf') for name in node_names
]

# Format node and lines
graph.node_renderer.glyph.update(size=10, fill_color="colours")
graph.edge_renderer.glyph = MultiLine(line_color="#cccccc", line_alpha=0.8, line_width=2)

# Format hover tool
graph.node_renderer.hover_glyph = Circle(size=2, fill_color='#f1cbff')
graph.edge_renderer.hover_glyph = MultiLine(line_color='#c9c9ff', line_width=4)

# Set edge properties
graph.edge_renderer.glyph.line_dash = [1,1]
graph.inspection_policy = NodesAndLinkedEdges()

#Add a hovertool
plot.add_tools(HoverTool(tooltips="@index"))

# NOTE(review): data_handle is a second, independently laid-out renderer that is
# never added to the plot; kept only so the name stays defined.
data_handle = from_networkx(G, nx.spring_layout, scale=10000, center=(0,0))
# notebook_handle=True makes show() return the handle that push_notebook needs.
plot_handle = show(plot, notebook_handle=True)




In [None]:
from bokeh.io import output_file, show
from bokeh.layouts import widgetbox
from bokeh.models.widgets import CheckboxGroup
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import warnings


# Numeric input for the sampling fraction, bounded to [1e-6, 1.0].
sampling_factor_box = widgets.BoundedFloatText(
    value=1e-6,
    min=1e-6,
    max=1.0,
    step=0.0001,
    description='Sampling Factor:',
)

# update_sampling_factor is called with the box's value when the user presses
# the button that interact_manual renders.
interact_manual(update_sampling_factor, x=sampling_factor_box)

In [87]:
# Group the nodes of the current graph G into communities with label
# propagation and materialise the result as a list (one item per community).
# NOTE(review): G is rebound to the sampled graph in the plotting cell, so which
# graph is analysed here depends on the order the cells were run in - confirm.
mods = list(nx.algorithms.community.label_propagation.label_propagation_communities(G))
# print(len(mods))

In [146]:
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [86]:
from bokeh.io import output_file, show
from bokeh.layouts import widgetbox
from bokeh.models.widgets import CheckboxGroup
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import warnings


# interact_manual(update_sampling_factor, 
#                 x=widgets.BoundedFloatText(value=1e-6,min=1e-6,max=1.0,step=0.0001,
#                                                   description='Sampling Factor:' ))