In [1]:
import os
import re
from urllib.parse import quote_plus

from sklearn.externals import joblib
from flask import Flask, jsonify,request,json
from flask import render_template
import spacy
import en_core_web_sm
# from googletrans import Translator
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# import polyglot
# from polyglot.text import Text, Word
# from polyglot.downloader import downloader
# from polyglot.mapping import Embedding

## chinese word nlp
import jieba.analyse as analyse
import jieba


# import tensorflow as tf
# from keras.models import load_model
# from keras.backend import clear_session
# import numpy as np
# import pickle

from sqlalchemy import create_engine
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer




In [2]:
## DB connection
# Connection settings are read from environment variables so that real
# credentials never have to be saved inside the notebook. The 'YOUR_CODE'
# placeholders remain as fallbacks for anyone who still fills them in by hand.
sql_hostname = os.environ.get('SQL_HOSTNAME', 'YOUR_CODE')
sql_username = os.environ.get('SQL_USERNAME', 'YOUR_CODE')
sql_password = os.environ.get('SQL_PASSWORD', 'YOUR_CODE')
sql_main_database = os.environ.get('SQL_MAIN_DATABASE', 'YOUR_CODE')
sql_table = os.environ.get('SQL_TABLE', 'YOUR_CODE')

# Percent-encode the user name and password: characters such as '@', ':' or '/'
# would otherwise be parsed as URL delimiters and corrupt the connection string.
# (Change the scheme to 'mysql+pymysql' to use the PyMySQL driver instead.)
engine = create_engine(
    'mysql://{}:{}@{}/{}'.format(
        quote_plus(sql_username),
        quote_plus(sql_password),
        sql_hostname,
        sql_main_database,
    )
)

## SQL query
# Selects every column of <sql_main_database>.<sql_table>.
query = "select * from " + str(sql_main_database) + "." + str(sql_table)

In [3]:
## Run the query and load its full result set into a DataFrame
df = pd.read_sql_query(sql=query, con=engine)


In [4]:
## VADER sentiment scorer, used by sentiment_analyzer_scores
analyser = SentimentIntensityAnalyzer()

## English spaCy pipeline, used by entity_tag
nlp = en_core_web_sm.load()


## User-defined functions

In [5]:
def entity_tag(article):
    """Return the unique named entities of interest found in ``article``.

    The text is reduced to ASCII (non-ASCII characters are dropped), run
    through the module-level spaCy pipeline ``nlp``, and the text of every
    entity labelled PERSON, ORG, GPE or PRODUCT is collected.

    Args:
        article: Text to analyse. Anything that is not a string (for example
            ``None`` or the ``NaN`` pandas uses for a missing value) yields an
            empty list instead of raising ``AttributeError``.

    Returns:
        list[str]: Entity texts, de-duplicated, in order of first appearance.
    """
    if not isinstance(article, str):
        return []

    wanted_labels = {'PERSON', 'ORG', 'GPE', 'PRODUCT'}
    doc = nlp(article.encode('ascii', 'ignore').decode('utf8'))   ## handling text unicode

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is deterministic (a plain set() gives an arbitrary order).
    return list(dict.fromkeys(
        ent.text for ent in doc.ents if ent.label_ in wanted_labels
    ))

# def entity_tag_ms(article):
#     # entity_list = pd.DataFrame()
#     entity_list=[]
#     text = Text(article)
#     try:
#         for item in text.entities:
#             entity_list.append(item)
#     except:
#         print('embedding not loaded ')
#     return entity_list


def entity_tag_zh(article):
    """Extract keywords from ``article`` with jieba.

    Calls ``analyse.extract_tags`` with 5 as the second positional argument
    and ``withWeight=False``, then returns every keyword it yields, each
    encoded as UTF-8, in the order produced.
    """
    keywords = analyse.extract_tags(article, 5, withWeight=False)
    return [keyword.encode('utf-8') for keyword in keywords]

def sentiment_analyzer_scores(sentence):
    """Score ``sentence`` with the module-level VADER ``analyser``.

    Returns whatever ``analyser.polarity_scores`` produces for the sentence.
    """
    return analyser.polarity_scores(sentence)



## Named-entity tagging (English)

In [6]:
# Preview the first rows of the table loaded from the database
df.head()

Unnamed: 0,nid,link,title,date
0,1501,https://www.bbc.com/news/world-asia-china-5124...,"China coronavirus spread is accelerating, Xi J...",2020-01-25 14:01:07
1,1502,https://www.cnn.com/2020/01/20/health/what-is-...,Coronavirus explained: What you need to know,2020-01-20 00:00:00
2,1503,https://www.nytimes.com/article/what-is-corona...,"What Is the Coronavirus? Symptoms, Treatment a...",2020-01-23 00:00:00
3,1504,https://www.bbc.com/news/uk-northern-ireland-5...,Coronavirus: Man is treated for symptoms in Be...,2020-01-23 00:00:00
4,1505,https://www.businessinsider.com/how-to-protect...,The Wuhan coronavirus has spread to 10 countri...,2020-01-23 00:00:00


In [10]:
## Apply the NER function to every title; each row of 'entity_en' receives the
## list returned by entity_tag
df['entity_en'] = df['title'].apply(entity_tag)

In [11]:

## Split each title's entity list into one row per entity and join the result
## with the nid / link / title / date columns of the originating row.
# Series.explode does the split in a single vectorised step; building a
# pd.Series per row with .apply(pd.Series) becomes very slow on large tables.
# Titles with no entities explode to NaN, so dropna() removes them.
entity_df = (
    df['entity_en']
    .explode()
    .dropna()
    .to_frame('entity_en')
    .join(df[['nid', 'link', 'title', 'date']], how='left')
)

In [12]:
# Display the one-row-per-entity table built in the previous cell
entity_df

Unnamed: 0,entity_en,nid,link,title,date
0,Xi Jinping,1501,https://www.bbc.com/news/world-asia-china-5124...,"China coronavirus spread is accelerating, Xi J...",2020-01-25 14:01:07
0,China,1501,https://www.bbc.com/news/world-asia-china-5124...,"China coronavirus spread is accelerating, Xi J...",2020-01-25 14:01:07
1,Coronavirus,1502,https://www.cnn.com/2020/01/20/health/what-is-...,Coronavirus explained: What you need to know,2020-01-20 00:00:00
2,Coronavirus,1503,https://www.nytimes.com/article/what-is-corona...,"What Is the Coronavirus? Symptoms, Treatment a...",2020-01-23 00:00:00
3,Belfast,1504,https://www.bbc.com/news/uk-northern-ireland-5...,Coronavirus: Man is treated for symptoms in Be...,2020-01-23 00:00:00
...,...,...,...,...,...
5983,China,7484,https://www.bbc.co.uk/news/uk-wales-51249369,China coronavirus: Cardiff lecturer 'stranded'...,2020-01-28 01:01:06
5985,Cramer,7486,https://www.thestreet.com/jim-cramer/coronavir...,Coronavirus Selloff?: Cramer's 'Mad Money' Rec...,2020-01-24 00:00:00
5985,Coronavirus Selloff,7486,https://www.thestreet.com/jim-cramer/coronavir...,Coronavirus Selloff?: Cramer's 'Mad Money' Rec...,2020-01-24 00:00:00
5987,UK,7488,https://www.theguardian.com/science/2020/jan/2...,'I felt like ET': UK man describes surreal cor...,2020-01-24 00:00:00


## Save the result data to the MySQL database

In [13]:
# Write the entity rows to the table "<sql_table>_nameEntity".
# if_exists='append' adds to an existing table, so re-running this cell
# inserts the same rows again.
entity_df.to_sql(
    name=sql_table + '_nameEntity',
    con=engine,
    index=False,
    if_exists='append',
)


In [14]:
# Also save the entity rows as "<sql_table>_nameEntity.csv", without the index column
entity_df.to_csv(path_or_buf=sql_table + '_nameEntity' + '.csv', index=False)

## Sentiment analysis using vaderSentiment (works well on both formal and informal language, e.g. social media)

In [6]:

## Score every title with VADER; each result is a dict of scores
df['sentiment'] = df['title'].apply(sentiment_analyzer_scores)

## Expand the score dicts into one column per key.
# Building a single DataFrame from the list of dicts avoids creating one
# pd.Series per row, which is what .apply(pd.Series) does and is very slow.
sentiment_df = pd.DataFrame(df['sentiment'].tolist(), index=df.index)

## Join the original columns with the sentiment scores (aligned on the index)
df_withSentiment = df[['nid', 'link', 'title', 'date']].join(sentiment_df)

## Label each row by the sign of its compound score:
## > 0 is Positive, == 0 is Neutral, anything else is Negative
df_withSentiment['sentimentType'] = df_withSentiment['compound'].apply(
    lambda x: 'Positive' if x > 0 else ('Neutral' if x == 0 else 'Negative')
)



In [7]:
# Display the titles together with their sentiment scores and labels
df_withSentiment

Unnamed: 0,nid,link,title,date,neg,neu,pos,compound,sentimentType
0,1501,https://www.bbc.com/news/world-asia-china-5124...,"China coronavirus spread is accelerating, Xi J...",2020-01-25 14:01:07,0.167,0.833,0.000,-0.1027,Negative
1,1502,https://www.cnn.com/2020/01/20/health/what-is-...,Coronavirus explained: What you need to know,2020-01-20 00:00:00,0.000,1.000,0.000,0.0000,Neutral
2,1503,https://www.nytimes.com/article/what-is-corona...,"What Is the Coronavirus? Symptoms, Treatment a...",2020-01-23 00:00:00,0.231,0.769,0.000,-0.2732,Negative
3,1504,https://www.bbc.com/news/uk-northern-ireland-5...,Coronavirus: Man is treated for symptoms in Be...,2020-01-23 00:00:00,0.000,1.000,0.000,0.0000,Neutral
4,1505,https://www.businessinsider.com/how-to-protect...,The Wuhan coronavirus has spread to 10 countri...,2020-01-23 00:00:00,0.000,1.000,0.000,0.0000,Neutral
...,...,...,...,...,...,...,...,...,...
5985,7486,https://www.thestreet.com/jim-cramer/coronavir...,Coronavirus Selloff?: Cramer's 'Mad Money' Rec...,2020-01-24 00:00:00,0.314,0.686,0.000,-0.4939,Negative
5986,7487,https://nypost.com/2020/01/23/netflix-releases...,Netflix releases 'Pandemic' docuseries as coro...,2020-01-23 00:00:00,0.000,1.000,0.000,0.0000,Neutral
5987,7488,https://www.theguardian.com/science/2020/jan/2...,'I felt like ET': UK man describes surreal cor...,2020-01-24 00:00:00,0.000,0.783,0.217,0.3612,Positive
5988,7489,https://www.cnbc.com/2020/01/23/us-futures-poi...,"S&P 500 ekes out gain, erases losses after WHO...",2020-01-23 00:00:00,0.165,0.488,0.348,0.4588,Positive


In [17]:
## Write the sentiment rows to the table "<sql_table>_sentiment".
## if_exists='append' adds to an existing table, so re-running this cell
## inserts the same rows again.
df_withSentiment.to_sql(
    name=sql_table + '_sentiment',
    con=engine,
    index=False,
    if_exists='append',
)


In [9]:
# Also save the sentiment rows as "<sql_table>_sentiment.csv", without the index column
df_withSentiment.to_csv(path_or_buf=sql_table + '_sentiment' + '.csv', index=False)