In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import operate_data as od
import ml_model as ml
from  demo import Predictor
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import ipdb
from collections import Counter
from tqdm import tqdm
import warnings
# Suppress every warning for the rest of the session; note this also hides
# pandas/numpy warnings that could flag real data problems.
warnings.filterwarnings('ignore')

# Maps a text-vectorisation mode name to an integer code.
# NOTE(review): not referenced by any cell visible in this notebook; presumably
# mirrors the modes understood by the imported project modules -- confirm or remove.
VECTOR_MODE = {'onehot': 0, 'wordfreq': 1, 'twovec': 2, 'tfidf': 3, 'outofdict': 4}

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/x6/8vnlx3g94g1csgbt9sc6drkw0000gn/T/jieba.cache
Loading model cost 0.828 seconds.
Prefix dict has been built succesfully.


## Run the emotion classifier; return a DataFrame of the tagged news and a Counter of the tags

In [2]:
def emo_classifier(date_start='2016/09',date_end='2016/10',data='./data/China.csv'):
    """Tag every news item inside a date window with the loaded classifier.

    Parameters
    ----------
    date_start : str
        Inclusive lower bound compared against the ``Time`` column
        (e.g. ``'2016/09'``).
    date_end : str
        Exclusive upper bound compared against the ``Time`` column.
    data : str
        Path to a cp950-encoded CSV that has at least the columns
        ``Time``, ``Title`` and ``Content``.

    Returns
    -------
    tuple of (pandas.DataFrame, collections.Counter)
        The frame has the columns ``Time``, ``Title``, ``Content`` and ``Tag``
        (one row per news item in the window, fresh 0..n-1 index); the Counter
        tallies the ``Tag`` values. ``get_count`` reads key ``1`` as positive
        and ``-1`` as negative.
    """
    raw_df = pd.read_csv(data, encoding='cp950')
    raw_df['Time'] = pd.to_datetime(raw_df['Time'])

    # Rows with a missing value in any column are discarded before filtering.
    news_df = raw_df.dropna().reset_index(drop=True)
    in_window = (news_df['Time'] >= date_start) & (news_df['Time'] < date_end)
    news_df = news_df[in_window]

    # Populate the word lists held by the operate_data module.
    # NOTE(review): these lists and the model below are reloaded on every call,
    # and get_count calls this function once per month -- consider hoisting
    # the setup if runtime matters.
    od.loadStopwords()
    od.loadEmotionwords()
    od.loadWords(od.stopList)
    od.loadDocument(od.stopList)

    predictor = Predictor()
    predictor.load_model()
    predictor.set_mode(mode="wordfreq")

    tags = []
    for news in news_df['Content']:
        predictor.set_news(news=news)
        predictor.trans_vec()
        tag = predictor()
        # Only the first element of the prediction is kept as the item's tag.
        tags.append(tag[0])

    # Build from plain lists so the result gets a fresh 0..n-1 index instead of
    # inheriting the filtered frame's index labels.
    df = pd.DataFrame({
        'Time': list(news_df['Time']),
        'Title': list(news_df['Title']),
        'Content': list(news_df['Content']),
        'Tag': tags,
    })

    return df, Counter(tags)

## Convert month offsets into start/end month strings

In [3]:
def get_time(data, start_date="2016/03"):
    """Turn month offsets into consecutive one-month ``[from, end)`` windows.

    Parameters
    ----------
    data : iterable of int
        Month offsets relative to ``start_date`` (0 is the start month itself;
        negative offsets go backwards).
    start_date : str, optional
        Reference month in ``"%Y/%m"`` format. Defaults to ``"2016/03"``, the
        value that used to be hard-coded.

    Returns
    -------
    dict
        ``{'from': [...], 'end': [...]}`` where each entry is a ``"%Y/%m"``
        string and ``end[i]`` is the month immediately after ``from[i]``.

    Raises
    ------
    ValueError
        If ``start_date`` does not match ``"%Y/%m"`` or a shifted year falls
        outside the range supported by ``datetime``.
    """
    base = datetime.strptime(start_date, "%Y/%m")

    def _month_label(offset):
        # strptime leaves day=1, so swapping year/month can never produce an
        # invalid calendar day. divmod floors, which handles negative offsets.
        year_shift, month_index = divmod(base.month - 1 + offset, 12)
        shifted = base.replace(year=base.year + year_shift, month=month_index + 1)
        return shifted.strftime('%Y/%m')

    arr_smonth = []
    arr_emonth = []
    # Single pass so that one-shot iterables (generators) are still supported.
    for number in data:
        arr_smonth.append(_month_label(number))
        arr_emonth.append(_month_label(number + 1))

    return {
        'from': arr_smonth,
        'end': arr_emonth
    }

## Get the monthly counts of positive and negative news

In [4]:

def get_count(data,path):
    """Count positive and negative news per month for one CSV file.

    Parameters
    ----------
    data : iterable of int
        Month offsets, converted to month windows by ``get_time``.
    path : str
        CSV path handed to ``emo_classifier`` for every window.

    Returns
    -------
    pandas.DataFrame
        One row per window with the columns ``date`` (window start month),
        ``pos`` (number of items tagged ``1``) and ``neg`` (number tagged ``-1``).
    """
    windows = get_time(data)
    month_starts = windows['from']
    month_ends = windows['end']

    months, positives, negatives = [], [], []
    for idx in tqdm(range(len(month_starts)), desc='[*] Counting...', dynamic_ncols=True):
        # Only the tag tally is needed here; the per-item frame is discarded.
        _, tag_counts = emo_classifier(
            date_start=month_starts[idx],
            date_end=month_ends[idx],
            data=path,
        )
        months.append(month_starts[idx])
        positives.append(tag_counts[1])
        negatives.append(tag_counts[-1])

    return pd.DataFrame({
        "date": months,
        "pos": positives,
        "neg": negatives
    })

## China

In [5]:
# Monthly positive/negative counts for the China feed over 36 month offsets.
china_df = get_count(data=list(range(36)), path='./data/China.csv')
china_df

[*] Counting...: 100%|██████████| 36/36 [10:51<00:00, 22.06s/it]


Unnamed: 0,date,pos,neg
0,2016/03,56,27
1,2016/04,71,25
2,2016/05,73,32
3,2016/06,63,35
4,2016/07,67,27
5,2016/08,66,34
6,2016/09,57,36
7,2016/10,60,28
8,2016/11,63,36
9,2016/12,75,57


In [6]:
def get_percentage(data):
    """Add a ``rate`` column: the positive share of tagged news, in percent.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with numeric ``pos`` and ``neg`` columns. It is modified in
        place (the ``rate`` column is added or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``rate = pos / (pos + neg) * 100``.
        Rows where ``pos`` and ``neg`` are both 0 get ``NaN``.
    """
    total = data['pos'] + data['neg']
    # Vectorised over the frame's own index, so any number of rows and any
    # index labels work (the previous loop assumed exactly 36 rows labelled 0..35).
    data['rate'] = (data['pos'] / total) * 100
    return data

In [7]:
# Attach the positive-share column ('rate') to the China counts and display the frame.
china_df = get_percentage(china_df)
china_df

Unnamed: 0,date,pos,neg,rate
0,2016/03,56,27,67.46988
1,2016/04,71,25,73.958333
2,2016/05,73,32,69.52381
3,2016/06,63,35,64.285714
4,2016/07,67,27,71.276596
5,2016/08,66,34,66.0
6,2016/09,57,36,61.290323
7,2016/10,60,28,68.181818
8,2016/11,63,36,63.636364
9,2016/12,75,57,56.818182


## Brazil

In [8]:
# Monthly positive/negative counts for the Brazil feed over 36 month offsets.
brazil_df = get_count(data=list(range(36)), path='./data/Brazil.csv')
brazil_df

[*] Counting...: 100%|██████████| 36/36 [02:18<00:00,  3.68s/it]


Unnamed: 0,date,pos,neg
0,2016/03,7,1
1,2016/04,7,0
2,2016/05,3,5
3,2016/06,1,1
4,2016/07,2,0
5,2016/08,4,1
6,2016/09,5,0
7,2016/10,4,0
8,2016/11,2,0
9,2016/12,2,1


In [9]:
# Attach the positive-share column ('rate') to the Brazil counts and display the frame.
brazil_df = get_percentage(brazil_df)
brazil_df

Unnamed: 0,date,pos,neg,rate
0,2016/03,7,1,87.5
1,2016/04,7,0,100.0
2,2016/05,3,5,37.5
3,2016/06,1,1,50.0
4,2016/07,2,0,100.0
5,2016/08,4,1,80.0
6,2016/09,5,0,100.0
7,2016/10,4,0,100.0
8,2016/11,2,0,100.0
9,2016/12,2,1,66.666667


## India

In [12]:
# Monthly positive/negative counts for the India feed over 36 month offsets.
india_df = get_count(data=list(range(36)), path='./data/India.csv')
india_df

[*] Counting...: 100%|██████████| 36/36 [02:56<00:00,  4.06s/it]


Unnamed: 0,date,pos,neg
0,2016/03,4,3
1,2016/04,5,4
2,2016/05,12,3
3,2016/06,10,0
4,2016/07,9,4
5,2016/08,16,5
6,2016/09,8,2
7,2016/10,16,3
8,2016/11,16,6
9,2016/12,16,14


In [13]:
# Attach the positive-share column ('rate') to the India counts and display the frame.
india_df = get_percentage(india_df)
india_df

Unnamed: 0,date,pos,neg,rate
0,2016/03,4,3,57.142857
1,2016/04,5,4,55.555556
2,2016/05,12,3,80.0
3,2016/06,10,0,100.0
4,2016/07,9,4,69.230769
5,2016/08,16,5,76.190476
6,2016/09,8,2,80.0
7,2016/10,16,3,84.210526
8,2016/11,16,6,72.727273
9,2016/12,16,14,53.333333


## Russia

In [14]:
# Monthly positive/negative counts for the Russia feed over 36 month offsets.
russia_df = get_count(data=list(range(36)), path='./data/Russia.csv')
russia_df

[*] Counting...: 100%|██████████| 36/36 [02:23<00:00,  3.81s/it]


Unnamed: 0,date,pos,neg
0,2016/03,2,0
1,2016/04,4,1
2,2016/05,2,0
3,2016/06,0,1
4,2016/07,2,1
5,2016/08,4,0
6,2016/09,5,1
7,2016/10,1,2
8,2016/11,7,0
9,2016/12,12,2


In [15]:
# Attach the positive-share column ('rate') to the Russia counts and display the frame.
russia_df = get_percentage(russia_df)
russia_df

Unnamed: 0,date,pos,neg,rate
0,2016/03,2,0,100.0
1,2016/04,4,1,80.0
2,2016/05,2,0,100.0
3,2016/06,0,1,0.0
4,2016/07,2,1,66.666667
5,2016/08,4,0,100.0
6,2016/09,5,1,83.333333
7,2016/10,1,2,33.333333
8,2016/11,7,0,100.0
9,2016/12,12,2,85.714286


In [21]:
# Save the China positive-rate series as JSON. The context manager closes the
# file even if serialisation or the write raises; UTF-8 is set explicitly so the
# output does not depend on the platform's default encoding.
with open('china_emo.json', 'w', encoding='utf-8') as f:
    f.write(china_df['rate'].to_json(force_ascii=False))

In [22]:
# Save the India positive-rate series as JSON. The context manager closes the
# file even if serialisation or the write raises; UTF-8 is set explicitly so the
# output does not depend on the platform's default encoding.
with open('india_emo.json', 'w', encoding='utf-8') as f:
    f.write(india_df['rate'].to_json(force_ascii=False))

In [23]:
# Save the Russia positive-rate series as JSON. The context manager closes the
# file even if serialisation or the write raises; UTF-8 is set explicitly so the
# output does not depend on the platform's default encoding.
with open('russia_emo.json', 'w', encoding='utf-8') as f:
    f.write(russia_df['rate'].to_json(force_ascii=False))

In [24]:
# Save the Brazil positive-rate series as JSON. The context manager closes the
# file even if serialisation or the write raises; UTF-8 is set explicitly so the
# output does not depend on the platform's default encoding.
with open('brazil_emo.json', 'w', encoding='utf-8') as f:
    f.write(brazil_df['rate'].to_json(force_ascii=False))