In [1]:
import pandas as pd
import numpy as np
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from gmplot import gmplot
import pickle

# Emotion labels, presumably listed in the order of the model's output
# classes -- TODO confirm against the training code.
emotions = ["happy", "sad", "disgust", "angry", "fear", "surprise"]
# Plot colours; plot_gmap indexes this list with the predicted 'emotion'
# class, so it is parallel to the class indices.
colors = ["orange", "blue", "green", "red", "gray", "purple"]

# Trained Keras model used to score the tweets.
model = load_model("models/ja_tweets_sentiment/model_2018-08-28-15:00.h5")

# Tokenizer object passed to preprocess() (it must provide
# texts_to_sequences).
# NOTE(security): pickle.load can execute arbitrary code; only load files
# from a trusted source.
with open("models/ja_tweets_sentiment/tokenizer_cnn_ja.pkl", "rb") as f:
    tokenizer = pickle.load(f)
    
def preprocess(data, tokenizer, maxlen=280):
    """Encode raw texts as fixed-length integer sequences.

    Args:
        data: Texts to encode; handed to ``tokenizer.texts_to_sequences``.
        tokenizer: Object providing a ``texts_to_sequences`` method.
        maxlen: Sequence length passed on to ``pad_sequences``.

    Returns:
        Whatever ``pad_sequences`` returns for the encoded texts.
    """
    encoded = tokenizer.texts_to_sequences(data)
    padded = pad_sequences(encoded, maxlen=maxlen)
    return padded

Using TensorFlow backend.


In [3]:
import json
# Number of dumped result pages (tweets/data_0.json ... data_1452.json).
NUM_TWEET_FILES = 1453

# Merge every page into a single dict: the first page is kept whole and the
# 'statuses' of every later page are appended to it.
data = None
for i in range(NUM_TWEET_FILES):
    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding, which may be different on some systems.
    with open("tweets/data_{}.json".format(i), encoding="utf-8") as f:
        page = json.load(f)
    if data is None:
        data = page
    else:
        data['statuses'] += page['statuses']

1. 市, 県, スペース, カンマ(,)で split
2. lower
3. どれかが市名辞書に引っかかれば、最初に引っかかったものをリターン
4. 市名辞書に引っかからなければ県名辞書で引っかかったものをリターン
5. どれにもひっかからなければ無視

In [4]:
import pickle
# Lookup table consulted by geocode() for prefecture-level matches.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# from a trusted source.
with open("pref_dict.pkl", "rb") as f:
    pref_dict = pickle.load(f)

# Lookup table consulted by geocode() for city-level matches.
with open("city_dict.pkl", "rb") as f:
    city_dict = pickle.load(f)

In [5]:
import re
# Delimiters used to tokenize a free-text profile location: the
# administrative suffixes 都/道/府/県/市/町/村/郡, a comma and a space.
# The district suffix is 郡 (not 群), so that "群馬" (Gunma) stays intact.
regex = r"[都道府県\,市町村郡 ]"

def geocode(location, regex, city_dict, pref_dict):
    """Resolve a free-text profile location via the city/prefecture tables.

    The location is split into tokens with ``regex``. Every non-empty token
    is tried as written and, if different, lowercased. A match in
    ``city_dict`` on *any* token wins; only when no token matches a city is
    ``pref_dict`` consulted.

    Args:
        location: Free-text location string. Non-string values (e.g. ``None``)
            are treated as "no location".
        regex: Pattern passed to ``re.split`` to tokenize ``location``.
        city_dict: Mapping from city token to the value to return.
        pref_dict: Mapping from prefecture token to the value to return.

    Returns:
        The mapped value of the first matching token (callers use it as the
        coordinates), or ``False`` when nothing matches.
    """
    if not isinstance(location, str):
        return False

    # Collect lookup candidates in token order, skipping the empty strings
    # that re.split produces around adjacent delimiters.
    candidates = []
    for token in re.split(regex, location):
        if not token:
            continue
        candidates.append(token)
        lowered = token.lower()
        if lowered != token:
            candidates.append(lowered)

    # City-level matches take precedence over prefecture-level matches.
    for lookup in (city_dict, pref_dict):
        for candidate in candidates:
            if candidate in lookup:
                return lookup[candidate]
    return False

def format_data(statuses):
    """Build a DataFrame of the statuses whose user location can be geocoded.

    Each status is expected to provide ``status['user']['location']``,
    ``status['user']['screen_name']``, ``status['full_text']`` and
    ``status['created_at']``. Statuses without a string location, or whose
    location does not geocode, are dropped. Statuses missing an expected key
    are skipped and counted rather than aborting the whole run.

    Args:
        statuses: Iterable of status dicts.

    Returns:
        DataFrame with the columns ``user``, ``text``, ``coordinates`` and
        ``date`` (the columns are present even when no row qualifies).
    """
    columns = ['user', 'text', 'coordinates', 'date']
    results = []
    skipped = 0
    for status in statuses:
        try:
            location = status['user']['location']
            # The location can be missing (None); there is nothing to
            # geocode in that case.
            if not isinstance(location, str):
                continue
            coordinates = geocode(location, regex, city_dict, pref_dict)
            if not coordinates:
                continue
            results.append({
                'user': status['user']['screen_name'],
                'text': status['full_text'],
                'coordinates': coordinates,
                'date': status['created_at']
            })
        except (KeyError, TypeError):
            # One malformed status must not discard everything after it.
            skipped += 1
    if skipped:
        print("format_data: skipped {} malformed statuses".format(skipped))
    return pd.DataFrame(results, columns=columns)

# Upper bound on the number of geocoded tweets kept for scoring.
sample_size = 100000
df = format_data(data['statuses'])
# DataFrame.sample raises when asked for more rows than exist, so cap the
# request; a fixed seed makes the draw reproducible across runs.
df = df.sample(n=min(sample_size, len(df)), random_state=0)
# The raw statuses are no longer needed; release the memory.
del data

In [8]:
# Preview the first rows of the sampled frame.
# NOTE(review): the saved output of this cell (In [8]) already shows the
# 'strength' and 'emotion' columns, which are only added by the prediction
# cell that follows (In [7]) -- the cells were executed out of order.
df.head()

Unnamed: 0,coordinates,date,text,user,strength,emotion
216397,"(35.85718, 139.65079)",Mon Aug 27 10:34:47 +0000 2018,さくらももこさんご冥福をお祈りいたします,omu_musou,0.20349,1
42914,"(35.44771, 139.64256)",Tue Aug 28 12:17:21 +0000 2018,RT @jbot26830444: 2018年亡くなった有名人\n\nさくらももこ(漫画家)...,Keitore7000,0.070822,4
89447,"(36.595240000000004, 136.62566999999999)",Mon Aug 27 22:19:43 +0000 2018,RT @ribon60th: 【さくらももこさんご逝去の報】「ちびまる子ちゃん」の原作者であ...,HosiHukusu,0.081132,1
26719,"(40.82516, 140.73976000000002)",Tue Aug 28 23:00:55 +0000 2018,RT @COCHAE: さくらももこ先生、伝統こけしの世界にも大きな大きな力をくださいました...,tsugirl_junne,0.45315,4
36517,"(35.6895, 139.69163999999998)",Tue Aug 28 14:02:33 +0000 2018,RT @kazutan_1220: さくらももこさんの訃報を聞いた後に、初代OPだった「ゆめ...,greensilverBear,0.165295,4


In [7]:
# Score every tweet text with the sentiment model.
texts = df['text'].tolist()
probabilities = model.predict(preprocess(texts, tokenizer))

# 'strength' is the largest value in each prediction row; 'emotion' is the
# index of that value.
strength = [max(row) for row in probabilities]
preds = np.argmax(probabilities, axis=1)

df['strength'] = strength
df['emotion'] = preds

In [9]:
# Persist the scored sample, omitting the index column.
# NOTE(review): 'coordinates' holds tuples, so presumably they are written
# as their string representation and will not come back as tuples from
# read_csv -- confirm before reloading this file.
df.to_csv("data.csv", index=False)

In [13]:
# Thin the frame for plotting. The request is capped at the number of rows
# available so the cell does not raise when fewer than 10000 rows remain
# (e.g. when it is re-run), and the seed makes the draw reproducible.
df = df.sample(n=min(10000, len(df)), random_state=0)

In [10]:
from random import random
import math
from math import sqrt
from collections import namedtuple

def plot_gmap(data, colors, symbol="o", center=(38, 137, 6), size_param=(500, 1000), outfile="my_map.html"):
    """Draw one translucent circle per row on a Google map and save it as HTML.

    Args:
        data: DataFrame with the columns 'coordinates' (a (lat, lon) pair),
            'strength' and 'emotion'.
        colors: Sequence of colours indexed by the row's 'emotion' value.
        symbol: Unused; kept so existing callers keep working.
        center: Positional arguments for ``gmplot.GoogleMapPlotter`` --
            presumably (lat, lng, zoom); confirm against the gmplot version
            in use.
        size_param: ``(base, scale)``; the circle radius is
            ``base + scale * strength ** sqrt(2)``.
        outfile: Path of the HTML file written by ``gmap.draw``.

    Returns:
        The ``GoogleMapPlotter`` the circles were added to.
    """
    gmap = gmplot.GoogleMapPlotter(*center)
    # Accept Python and numpy numbers alike; a plain isinstance(x, float)
    # check would silently drop int or float32 coordinates.
    real_types = (int, float, np.integer, np.floating)
    for _, row in data.iterrows():
        try:
            lat, lon = row['coordinates']
        except (TypeError, ValueError):
            # Not a (lat, lon) pair, e.g. a missing value: nothing to plot.
            continue
        if not (isinstance(lat, real_types) and isinstance(lon, real_types)):
            continue
        if math.isnan(lat) or math.isnan(lon):
            continue
        # Jitter by up to +/-0.25 so rows sharing the same coordinates do
        # not stack on exactly the same spot.
        lat, lon = lat+(random()-0.5)/2, lon+(random()-0.5)/2
        size = size_param[0]+size_param[1]*row['strength']**sqrt(2)
        settings = {
            "radius": size,
            # int() keeps list indexing working if the class is a float.
            "color": colors[int(row['emotion'])],
            "marker": False,
            "face_alpha": 0.2,
        }
        gmap.circle(lat, lon, **settings)
    gmap.draw(outfile)
    return gmap

In [14]:
# Render the map with circles larger than plot_gmap's default size_param
# (base 3000, scale 20000) and write it to momoko.html.
gmap = plot_gmap(df, colors, size_param=(3000, 20000), outfile="momoko.html")

In [113]:
import math
# Scratch check: is the first element of `tmp` NaN?
# NOTE(review): `tmp` is only assigned in the following cell (In [100]), so
# this cell raises NameError on a fresh top-to-bottom run.
math.isnan(tmp[0])

True

In [100]:
# Grab the 'coordinates' value of the row at position 23 for inspection.
tmp = df.iloc[23]['coordinates']

In [107]:
# Drop every row that has a missing value in any column.
# NOTE(review): a coordinates tuple such as (nan, nan) is presumably not
# treated as missing by dropna -- confirm; plot_gmap checks for NaN itself.
df = df.dropna()

In [108]:
# Display up to the first 100 rows of the cleaned frame.
df.head(100)

Unnamed: 0,coordinates,text,user,strength,emotion
0,"(35.6895, 139.69163999999998)",【試合結果】\nさくらももこさん悼み喪章をつけてピッチへ…清水が横浜FM下し6試合ぶりの白星...,SoccerKingJP,0.029418,5
1,"(34.97683, 138.38315)",【横浜F・マリノス戦 試合終了】\n清水 2-1 横浜FM\n応援してくださった皆様、\nそ...,spulse_official,0.148884,0
2,"(38.268390000000004, 140.87212)",RT @mollichane: さくらももこさんが亡くなって日が浅いので言いたくなかったけど...,BreakerMidorima,0.120147,4
3,"(34.97683, 138.38315)",RT @mollichane: さくらももこさんが亡くなって日が浅いので言いたくなかったけど...,mazda2desu,0.120147,4
4,"(34.97683, 138.38315)",RT @mollichane: さくらももこさんが亡くなって日が浅いので言いたくなかったけど...,fomal373,0.120147,4
5,"(35.6895, 139.69163999999998)",RT @comecaML: 西原理恵子が自作の中で、「さくらももこに無いダークさが自分にはあ...,akiyamayasumi,0.071195,4
6,"(36.34404, 140.44547)",RT @Isseki3: ヘビースモーカーのさくらももこさんはタバコに殺された(乳癌の発症率...,ITF_QSYS,0.106272,4
7,"(35.85718, 139.65079)",RT @monokirk: さくらももこくらい有名な作家で、文章でマンガで何度も自分のエピソ...,yone51,0.265456,4
8,"(34.68635, 135.52043)",相互フォローありがとうございます～！\nさくらももこさんは国民的作家さんですね！\n\n#相...,sugumika,0.917076,0
9,"(37.90246, 139.02341)",RT @mollichane: さくらももこさんが亡くなって日が浅いので言いたくなかったけど...,qoo09,0.120147,4
