In [1]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
from gensim import corpora, models

[nltk_data] Downloading package wordnet to /Users/xijiahu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
ngnews = pd.read_csv('NationalGridNews.csv')

In [5]:
ngnews.dtypes

Index           int64
Date           object
NewsTitle      object
NewsContent    object
dtype: object

In [6]:
ngnews["content"] = ngnews["NewsTitle"].map(str) + ngnews["NewsContent"]

In [7]:
ngnews["content"] = ngnews["content"].astype(str)

In [8]:
ngrelease = pd.read_csv('ng_release2017.csv')

In [9]:
ngrelease = ngrelease.rename(columns={"_source.publication_date": "Date", "_source.title_rss": "title",'_source.text':'text'})

In [22]:
#dictionary from news
def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result
#to clean text 

In [25]:
processed_docs = ngnews['content'].map(preprocess)
#to tokenize the words in news documents, the dictionary should also be applied to PR and tweets

In [26]:
dictionary = gensim.corpora.Dictionary(processed_docs)

dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

tfidf = models.TfidfModel(bow_corpus)

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, passes=2, workers=2)

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.123*"point" + 0.122*"hinkley" + 0.077*"telegraph" + 0.061*"power" + 0.058*"energy" + 0.053*"regulator" + 0.052*"https" + 0.050*"href" + 0.045*"theguardian" + 0.045*"network"
Topic: 1 
Words: 0.221*"look" + 0.085*"run" + 0.085*"service" + 0.085*"britain" + 0.085*"infrastructure" + 0.082*"country" + 0.069*"place" + 0.066*"power" + 0.056*"telegraph" + 0.017*"https"
Topic: 2 
Words: 0.124*"https" + 0.123*"href" + 0.121*"theguardian" + 0.064*"business" + 0.064*"point" + 0.063*"plan" + 0.061*"network" + 0.033*"electricity" + 0.033*"read" + 0.033*"britain"
Topic: 3 
Words: 0.040*"telegraph" + 0.039*"look" + 0.039*"energy" + 0.039*"place" + 0.039*"point" + 0.039*"power" + 0.038*"hinkley" + 0.038*"project" + 0.038*"regulator" + 0.038*"britain"
Topic: 4 
Words: 0.130*"hinkley" + 0.102*"point" + 0.065*"theguardian" + 0.061*"href" + 0.059*"plan" + 0.058*"https" + 0.056*"network" + 0.055*"electricity" + 0.054*"project" + 0.045*"power"


In [28]:
scoresperdoc=lda_model.inference(bow_corpus)
with open('ngnewsTopic_news.tsv', "w",encoding="utf-8") as fo:
    for row in scoresperdoc[0]:
       fo.write("\t".join(["{:0.3f}".format(score) for score in row]))
       fo.write("\n")

In [29]:
def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result1 = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result1.append(lemmatize_stemming(token))
    return result1
processed_docs1 = ngrelease['text'].map(preprocess)
bow_corpus1 = [dictionary.doc2bow(doc) for doc in processed_docs1]
scoresperdoc1=lda_model.inference(bow_corpus1)
with open('NGnewsTopic_pr.tsv', "w",encoding="utf-8") as fo:
    for row in scoresperdoc1[0]:
       fo.write("\t".join(["{:0.3f}".format(score) for score in row]))
       fo.write("\n")

In [10]:
tweets = pd.read_csv('NationalGridPubTweets.csv',lineterminator='\n')

  interactivity=interactivity, compiler=compiler, result=result)


In [31]:
def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result2 = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result2.append(lemmatize_stemming(token))
    return result2
processed_docs2 = tweets['text'].map(preprocess)
bow_corpus2 = [dictionary.doc2bow(doc) for doc in processed_docs2]
scoresperdoc2=lda_model.inference(bow_corpus2)
with open('NGnewsTopic_tweets.tsv', "w",encoding="utf-8") as fo:
    for row in scoresperdoc2[0]:
       fo.write("\t".join(["{:0.3f}".format(score) for score in row]))
       fo.write("\n")

In [32]:
topic_news = pd.read_csv("NGnewsTopic_news.tsv", sep='\t', names = ["ntopic0", "ntopic1", "ntopic2", "ntopic3", 'ntopic4'])

In [33]:
topic_release = pd.read_csv("NGnewsTopic_pr.tsv", sep='\t', names = ["rtopic0", "rtopic1", "rtopic2", "rtopic3", 'rtopic4'])

In [34]:
topic_tweet = pd.read_csv("NGnewsTopic_tweets.tsv", sep='\t', names = ["ttopic0", "ttopic1", "ttopic2", "ttopic3", 'ttopic4'])

In [36]:
news_topic = topic_news.join(ngnews)

In [37]:
news_topic['Date'] = news_topic.Date.replace({'T':' '}, regex=True)
news_topic['Date'] = pd.to_datetime(news_topic['Date'])
#convert object to datetime

In [38]:
release_topic = topic_release.join(ngrelease)

In [39]:
release_topic['Date'] = release_topic.Date.replace({'T':' '}, regex=True)
release_topic['Date'] = pd.to_datetime(release_topic['Date'])

In [40]:
tweet_topic = topic_tweet.join(tweets)

In [41]:
tweet_topic['created_at'] = pd.to_datetime(tweet_topic['created_at'])
tweet_topic = tweet_topic.rename(index=str, columns={"created_at": "Date"})

In [42]:
result = news_topic.merge(release_topic, left_on='Date', right_on='Date', how='outer')
result1 = result.merge(tweet_topic, left_on='Date', right_on='Date', how='outer')

In [55]:
result1

Unnamed: 0,ntopic0,ntopic1,ntopic2,ntopic3,ntopic4,Date,rtopic0,rtopic1,rtopic2,rtopic3,...,_source.teaser,_source.feedurl,_source.title,_source.url,_type,ttopic0,ttopic1,ttopic2,ttopic3,ttopic4
0,0.200,0.200,0.200,0.2,0.200,2018-04-12 08:41:00,,,,,...,,,,,,,,,,
1,0.213,0.206,0.200,0.2,2.181,2018-03-09 16:01:16,,,,,...,,,,,,,,,,
2,0.212,0.206,0.200,0.2,2.181,2018-03-09 16:01:16,,,,,...,,,,,,,,,,
3,0.200,0.200,0.200,0.2,0.200,2018-03-09 16:01:16,,,,,...,,,,,,,,,,
4,1.192,0.205,0.200,0.2,0.203,2018-01-22 15:43:27,,,,,...,,,,,,,,,,
5,1.192,0.205,0.200,0.2,0.203,2018-01-22 15:43:27,,,,,...,,,,,,,,,,
6,0.200,0.200,0.200,0.2,0.200,2018-01-22 15:43:27,,,,,...,,,,,,,,,,
7,4.194,0.201,0.201,0.2,0.205,2018-01-23 15:53:46,,,,,...,,,,,,,,,,
8,4.194,0.201,0.201,0.2,0.204,2018-01-23 15:53:46,,,,,...,,,,,,,,,,
9,0.200,0.200,0.200,0.2,0.200,2018-01-23 15:53:46,,,,,...,,,,,,,,,,


In [44]:
result1= result1.drop([ 'NewsTitle','NewsContent','Index_x','content'], axis=1)

In [45]:
result1 = result1.drop(['to_user_id','to_user_name','in_reply_to_status_id','filter_level','lang','possibly_sensitive'], axis=1)

In [47]:
result1= result1.drop([ 'geo_lat','geo_lng','retweet_id','retweet_count','favorite_count','quoted_status_id'], axis=1)

In [48]:
result1= result1.drop([ 'from_user_verified','from_user_profile_image_url','from_user_created_at','from_user_withheld_scope'], axis=1)

In [49]:
result1=result1.drop([ 'from_user_utcoffset','from_user_timezone','from_user_description','from_user_url','from_user_favourites_count','source','location'], axis=1)

In [50]:
result1=result1.drop([ 'from_user_id','from_user_lang','from_user_tweetcount','from_user_followercount','from_user_friendcount','from_user_listed','from_user_realname','truncated','withheld_copyright','withheld_scope'], axis=1)

In [52]:
result1=result1.drop(['Index_y','id','from_user_name', 'text_y'], axis=1)

In [54]:
result1=result1.drop([ '_id','title','text_x', '_score', '_source.doctype'], axis=1)

In [58]:
result1.to_csv('NG_news_topic_all.csv')

In [59]:
result_day = result1.set_index('Date').groupby(pd.Grouper(freq='D')).mean()

In [128]:
result_day.to_csv('NG_news_topic_day.csv')

In [129]:
result_week = result1.set_index('Date').groupby(pd.Grouper(freq='W')).mean()

In [131]:
result_week.to_csv('topic_week.csv')

In [60]:
result_month = result1.set_index('Date').groupby(pd.Grouper(freq='M')).mean()

In [61]:
result_month.to_csv('NG_news_topic_month.csv')

In [11]:
#dictionary from pr release
def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result
#to clean text 
processed_docs = ngrelease['text'].map(preprocess)
#to tokenize the words in news documents, the dictionary should also be applied to PR and tweets
dictionary = gensim.corpora.Dictionary(processed_docs)

dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

tfidf = models.TfidfModel(bow_corpus)

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, passes=2, workers=2)

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.024*"site" + 0.017*"gasholders" + 0.017*"dismantle" + 0.016*"holders" + 0.013*"time" + 0.012*"remove" + 0.010*"future" + 0.010*"pip" + 0.010*"demand" + 0.008*"make"
Topic: 1 
Words: 0.015*"site" + 0.014*"gasholders" + 0.013*"event" + 0.013*"competition" + 0.011*"dismantle" + 0.010*"link" + 0.009*"explosion" + 0.009*"tunnel" + 0.009*"street" + 0.008*"cable"
Topic: 2 
Words: 0.031*"school" + 0.017*"road" + 0.014*"primary" + 0.013*"line" + 0.013*"grant" + 0.011*"substation" + 0.011*"fund" + 0.010*"start" + 0.009*"pupils" + 0.007*"power"
Topic: 3 
Words: 0.024*"substation" + 0.018*"equipment" + 0.016*"travel" + 0.015*"transformer" + 0.014*"junction" + 0.014*"road" + 0.013*"carry" + 0.013*"line" + 0.012*"delivery" + 0.010*"efficiently"
Topic: 4 
Words: 0.022*"line" + 0.010*"remove" + 0.010*"landscape" + 0.010*"power" + 0.009*"events" + 0.009*"impact" + 0.008*"application" + 0.008*"underground" + 0.007*"team" + 0.007*"overhead"


In [12]:
scoresperdoc=lda_model.inference(bow_corpus)
with open('NGReleaseTopic_pr.tsv', "w",encoding="utf-8") as fo:
    for row in scoresperdoc[0]:
       fo.write("\t".join(["{:0.3f}".format(score) for score in row]))
       fo.write("\n")

In [13]:
def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result1 = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result1.append(lemmatize_stemming(token))
    return result1
processed_docs1 = ngnews['content'].map(preprocess)
bow_corpus1 = [dictionary.doc2bow(doc) for doc in processed_docs1]
scoresperdoc1=lda_model.inference(bow_corpus1)
with open('NGreleaseTopic_news.tsv', "w",encoding="utf-8") as fo:
    for row in scoresperdoc1[0]:
       fo.write("\t".join(["{:0.3f}".format(score) for score in row]))
       fo.write("\n")

In [14]:
def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result2 = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result2.append(lemmatize_stemming(token))
    return result2
processed_docs2 = tweets['text'].map(preprocess)
bow_corpus2 = [dictionary.doc2bow(doc) for doc in processed_docs2]
scoresperdoc2=lda_model.inference(bow_corpus2)
with open('NGreleaseTopic_tweets.tsv', "w",encoding="utf-8") as fo:
    for row in scoresperdoc2[0]:
       fo.write("\t".join(["{:0.3f}".format(score) for score in row]))
       fo.write("\n")

In [15]:
topic_news = pd.read_csv("NGreleaseTopic_news.tsv", sep='\t', names = ["ntopic0", "ntopic1", "ntopic2", "ntopic3", 'ntopic4'])

In [16]:
topic_release = pd.read_csv("NGReleaseTopic_pr.tsv", sep='\t', names = ["rtopic0", "rtopic1", "rtopic2", "rtopic3", 'rtopic4'])

In [17]:
topic_tweet = pd.read_csv("NGreleaseTopic_tweets.tsv", sep='\t', names = ["ttopic0", "ttopic1", "ttopic2", "ttopic3", 'ttopic4'])

In [18]:
news_topic = topic_news.join(ngnews)

In [19]:
news_topic = news_topic.drop(['NewsTitle','NewsContent','content','Index'], axis = 1)

In [20]:
news_topic['Date'] = news_topic.Date.replace({'T':' '}, regex=True)
news_topic['Date'] = pd.to_datetime(news_topic['Date'])

In [21]:
release_topic = topic_release.join(ngrelease)

In [22]:
release_topic = release_topic.drop(['title','text'], axis = 1)

In [23]:
release_topic['Date'] = release_topic.Date.replace({'T':' '}, regex=True)
release_topic['Date'] = pd.to_datetime(release_topic['Date'])

In [24]:
tweet_topic = topic_tweet.join(tweets)

In [25]:
tweet_topic = tweet_topic.drop(['to_user_id','to_user_name','in_reply_to_status_id','filter_level','lang','possibly_sensitive'], axis=1)

In [26]:
tweet_topic = tweet_topic.drop([ 'geo_lat','geo_lng','text','retweet_id','retweet_count','favorite_count','quoted_status_id'], axis=1)

In [27]:
tweet_topic = tweet_topic.drop([ 'from_user_verified','from_user_profile_image_url','from_user_created_at','from_user_withheld_scope'], axis=1)

In [28]:
tweet_topic = tweet_topic.drop([ 'from_user_utcoffset','from_user_timezone','from_user_description','from_user_url','from_user_favourites_count','source','location'], axis=1)
tweet_topic = tweet_topic.drop(['id','from_user_name','from_user_id','from_user_lang','from_user_tweetcount','from_user_followercount','from_user_friendcount','from_user_listed','from_user_realname','truncated','withheld_copyright','withheld_scope'], axis=1)

In [29]:
tweet_topic = tweet_topic.drop(['Index'], axis = 1)

In [30]:
tweet_topic['created_at'] = pd.to_datetime(tweet_topic['created_at'])
tweet_topic = tweet_topic.rename(index=str, columns={"created_at": "Date"})

In [31]:
result = news_topic.merge(release_topic, left_on='Date', right_on='Date', how='outer')
result1 = result.merge(tweet_topic, left_on='Date', right_on='Date', how='outer')

In [32]:
result1.to_csv('ng_release_topic_all.csv')

In [33]:
result_day = result1.set_index('Date').groupby(pd.Grouper(freq='D')).mean()
result_day.to_csv('ng_release_topic_day.csv')

In [34]:
result_month = result1.set_index('Date').groupby(pd.Grouper(freq='M')).mean()
result_month.to_csv('ng_release_topic_month.csv')

In [35]:
text = []
text = tweets.text.astype(str)
date = []
date = tweets.created_at.astype(str)

In [36]:
tweet = pd.DataFrame(list(zip(date, text)), columns=['Date','text'])
tweet.to_csv("NGTweet.csv", index_label='Index')

In [37]:
tweet = pd.read_csv('NGTweet.csv',lineterminator='\n')

In [38]:
def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result
#to clean text 
processed_docs = tweet['text'].map(preprocess)
#to tokenize the words in news documents, the dictionary should also be applied to PR and tweets
dictionary = gensim.corpora.Dictionary(processed_docs)

dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

tfidf = models.TfidfModel(bow_corpus)

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, passes=2, workers=2)

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.202*"generate" + 0.201*"total" + 0.201*"currently" + 0.045*"coal" + 0.040*"solar" + 0.031*"wind" + 0.027*"storage" + 0.026*"nuclear" + 0.024*"large" + 0.023*"hydro"
Topic: 1 
Words: 0.031*"pickup" + 0.028*"time" + 0.019*"energy" + 0.016*"electric" + 0.010*"fight" + 0.010*"warn" + 0.010*"afam" + 0.009*"decent" + 0.009*"nationalgriduk" + 0.008*"stage"
Topic: 2 
Words: 0.047*"latam" + 0.040*"generation" + 0.024*"esportsgears" + 0.022*"power" + 0.021*"kings" + 0.021*"week" + 0.021*"live" + 0.017*"final" + 0.016*"electricity" + 0.014*"gear"
Topic: 3 
Words: 0.062*"west" + 0.038*"latamkings" + 0.032*"dominate" + 0.031*"udevu" + 0.031*"gaoh" + 0.028*"esportsgears" + 0.026*"latam" + 0.018*"fridge" + 0.016*"right" + 0.014*"plug"
Topic: 4 
Words: 0.015*"export" + 0.015*"power" + 0.014*"irish" + 0.014*"dutch" + 0.013*"nationalgriduk" + 0.012*"workers" + 0.011*"years" + 0.009*"score" + 0.008*"today" + 0.007*"stand"


In [39]:
scoresperdoc=lda_model.inference(bow_corpus)
with open('NGTweetTopic_tweet.tsv', "w",encoding="utf-8") as fo:
    for row in scoresperdoc[0]:
       fo.write("\t".join(["{:0.3f}".format(score) for score in row]))
       fo.write("\n")

In [41]:
def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result1 = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result1.append(lemmatize_stemming(token))
    return result1
processed_docs1 = ngnews['content'].map(preprocess)
bow_corpus1 = [dictionary.doc2bow(doc) for doc in processed_docs1]
scoresperdoc1=lda_model.inference(bow_corpus1)
with open('NGTweetTopic_news.tsv', "w",encoding="utf-8") as fo:
    for row in scoresperdoc1[0]:
       fo.write("\t".join(["{:0.3f}".format(score) for score in row]))
       fo.write("\n")

In [42]:
def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result2 = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result2.append(lemmatize_stemming(token))
    return result2
processed_docs2 = ngrelease['text'].map(preprocess)
bow_corpus2 = [dictionary.doc2bow(doc) for doc in processed_docs2]
scoresperdoc2=lda_model.inference(bow_corpus2)
with open('NGTweetTopic_release.tsv', "w",encoding="utf-8") as fo:
    for row in scoresperdoc2[0]:
       fo.write("\t".join(["{:0.3f}".format(score) for score in row]))
       fo.write("\n")

In [43]:
topic_news = pd.read_csv("NGTweetTopic_news.tsv", sep='\t', names = ["ntopic0", "ntopic1", "ntopic2", "ntopic3", 'ntopic4'])
topic_release = pd.read_csv("NGTweetTopic_release.tsv", sep='\t', names = ["rtopic0", "rtopic1", "rtopic2", "rtopic3", 'rtopic4'])
topic_tweet = pd.read_csv("NGTweetTopic_tweet.tsv", sep='\t', names = ["ttopic0", "ttopic1", "ttopic2", "ttopic3", 'ttopic4'])

In [45]:
news_topic = topic_news.join(ngnews)
news_topic = news_topic.drop(['NewsTitle','NewsContent','content','Index'], axis = 1)

In [46]:
news_topic['Date'] = news_topic.Date.replace({'T':' '}, regex=True)
news_topic['Date'] = pd.to_datetime(news_topic['Date'])

In [49]:
release_topic = topic_release.join(ngrelease)
release_topic = release_topic.drop(['title','text'], axis = 1)

In [50]:
release_topic['Date'] = release_topic.Date.replace({'T':' '}, regex=True)
release_topic['Date'] = pd.to_datetime(release_topic['Date'])

In [51]:
tweet_topic = topic_tweet.join(tweet)

In [52]:
tweet_topic = tweet_topic.drop(['Index','text'],axis  = 1)

In [53]:
tweet_topic['Date'] = pd.to_datetime(tweet_topic['Date'])

In [54]:
result = news_topic.merge(release_topic, left_on='Date', right_on='Date', how='outer')
result1 = result.merge(tweet_topic, left_on='Date', right_on='Date', how='outer')

In [55]:
result1.to_csv('NGtweet_topic_all.csv')

In [56]:
result_day = result1.set_index('Date').groupby(pd.Grouper(freq='D')).mean()
result_day.to_csv('NGtweet_topic_day.csv')

In [57]:
result_day = result1.set_index('Date').groupby(pd.Grouper(freq='M')).mean()
result_day.to_csv('NGtweet_topic_month.csv')