In [126]:
import json
# Merge the per-file JSON payloads into a single dict: the first file supplies
# the dict itself (with whatever other keys it carries), every later file only
# contributes the entries of its 'statuses' list.
data = None
for i in range(1453):  # reads tweets/data_0.json .. tweets/data_1452.json
    # Decode explicitly as UTF-8; without it open() falls back to the locale's
    # preferred encoding, which differs between platforms.
    with open("tweets/data_{}.json".format(i), encoding="utf-8") as f:
        page = json.load(f)
    if data is None:
        data = page
    else:
        data['statuses'].extend(page['statuses'])

In [127]:
import pickle
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    NOTE: unpickling can execute code embedded in the file, so only use this
    on files from a trusted source.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Lookup tables consumed by geocode(); each maps a place token to a coordinate pair.
pref_dict = _load_pickle("pref_dict.pkl")
city_dict = _load_pickle("city_dict.pkl")

In [143]:
import re
import pandas as pd
import math
# Character class used to split free-text locations into tokens: it splits on
# the Japanese administrative suffix characters 都/道/府/県/市/町/村, on 群, on
# commas and on spaces.
# NOTE(review): 群 (as in 群馬) may have been meant to be 郡 — confirm against
# how the keys of city_dict / pref_dict were built before changing it.
regex = r"[都道府県\,市町村群 ]"

def geocode(location, regex, city_dict, pref_dict):
    """Resolve a free-text location to a coordinate pair.

    The location is split with `regex`; every token is lower-cased and
    stripped, then looked up first in `city_dict` and, only if no token yields
    a usable city entry, in `pref_dict`.

    Args:
        location: Free-text location. Anything that is not a `str` (for
            example `None`) is treated as "no location".
        regex: Pattern handed to `re.split` to tokenize `location`.
        city_dict: Mapping of token -> 2-item coordinate sequence.
        pref_dict: Mapping of token -> 2-item coordinate sequence, used as a
            fallback.

    Returns:
        The first matching entry whose two components are both non-NaN, or
        `False` when nothing matches.
    """
    # A missing location would otherwise make re.split raise TypeError.
    if not isinstance(location, str):
        return False

    tokens = [token.lower().strip() for token in re.split(regex, location)]

    # All tokens are tried against the city table before the prefecture table,
    # so a city hit wins regardless of where it appears in the text.
    for table in (city_dict, pref_dict):
        for token in tokens:
            if token not in table:
                continue
            coords = table[token]
            # Entries with a NaN component are unusable; keep searching.
            if math.isnan(float(coords[0])) or math.isnan(float(coords[1])):
                continue
            return coords
    return False

def format_data(statuses):
    """Build a DataFrame of the statuses whose user location can be geocoded.

    Each status is geocoded from `status['user']['location']` using the
    module-level `regex`, `city_dict` and `pref_dict`. Statuses for which
    `geocode` returns a falsy value are left out.

    Args:
        statuses: Iterable of status dicts providing `user.location`,
            `user.screen_name`, `full_text` and `created_at`.

    Returns:
        A `pandas.DataFrame` with the columns `user`, `text`, `coordinates`
        and `date`, one row per geocoded status.
    """
    results = []
    for status in statuses:
        try:
            coordinates = geocode(status['user']['location'], regex, city_dict, pref_dict)
            if not coordinates:
                continue
            results.append({
                'user': status['user']['screen_name'],
                'text': status['full_text'],
                'coordinates': coordinates,
                'date': status['created_at']
            })
        except Exception as e:
            # Report and skip only the offending status. Stopping the loop here
            # would silently drop every status that comes after one bad record.
            print(e)
            continue
    return pd.DataFrame(results)

# Build the per-tweet table (user, text, coordinates, date) from all loaded statuses.
df = format_data(data['statuses'])

In [144]:
# (rows, columns) of the geocoded table.
df.shape

(275882, 4)

In [145]:
import pandas as pd
import numpy as np
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import pickle

# Emotion labels paired with the colour used to draw them.
# NOTE(review): presumably position i corresponds to class index i of the
# model output (colors is indexed by the predicted class later) — confirm.
_emotion_palette = (
    ("happy", "orange"),
    ("sad", "blue"),
    ("disgust", "green"),
    ("angry", "red"),
    ("fear", "gray"),
    ("surprise", "purple"),
)
emotions = [label for label, colour in _emotion_palette]
colors = [colour for label, colour in _emotion_palette]

# Trained Keras model plus the tokenizer pickled alongside it.
model = load_model("models/ja_tweets_sentiment/model_2018-08-28-15:00.h5")

with open("models/ja_tweets_sentiment/tokenizer_cnn_ja.pkl", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [146]:
def preprocess(data, tokenizer, maxlen=280):
    """Convert texts into padded integer sequences for the model.

    `data` is passed to `tokenizer.texts_to_sequences`; the resulting
    sequences are then passed to `pad_sequences` with the given `maxlen`.
    """
    sequences = tokenizer.texts_to_sequences(data)
    padded = pad_sequences(sequences, maxlen=maxlen)
    return padded

# One row of per-class scores for every tweet text.
class_scores = model.predict(preprocess(df['text'].tolist(), tokenizer))

# 'strength' is the highest score in each row ...
strength = [max(row) for row in class_scores]
df['strength'] = strength

# ... and 'emotion' is the index of the class that achieved it.
preds = np.argmax(class_scores, axis=1)
df['emotion'] = preds

In [147]:
# Drop incomplete rows, parse the timestamps, and derive two string bucket keys
# from each timestamp's text form: the part before the first whitespace
# (date_day) and the part before the first ':' (date_hour).
df = df.dropna()
df["date"] = pd.to_datetime(df["date"])
stamp_texts = [str(stamp) for stamp in df["date"]]
df["date_day"] = [text.split()[0] for text in stamp_texts]
df["date_hour"] = [text.split(":")[0] for text in stamp_texts]
df = df.sort_values("date")

(275882, 8)

In [149]:
# Persist the enriched table; index=False keeps the row index out of the file.
df.to_csv("data.csv", index=False)

In [150]:
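# Optional, currently disabled: reload data.csv and continue with a random subsample.
# NOTE(review): this path rebuilds date_day only, while later cells group by date_hour,
# and 'coordinates' would come back from read_csv as text rather than tuples — confirm before enabling.
#sample_size=12000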
#df = pd.read_csv("data.csv")
#df = df.dropna()
#df = df.sample(sample_size)
#df.date = pd.to_datetime(df.date)
#df["date_day"] = list(map(lambda x: str(x).split()[0], df.date))
#df = df.sort_values(by="date", ascending=True)

In [154]:
# Inspect the distinct coordinate pairs produced by geocoding.
df["coordinates"].unique()

array([(35.60456, 140.12311), (36.56458, 139.88339),
       (35.706590000000006, 139.86792), ..., (38.79557, 141.51051),
       (32.55638, 130.68242), (31.90355, 130.70251000000002)], dtype=object)

In [163]:
# Round both components of every coordinate pair to whole numbers so that
# nearby places fall into the same grid cell.
df['coordinates_fixed'] = list(
    map(lambda pair: (round(pair[0]), round(pair[1])), df['coordinates'])
)

In [165]:
# Inspect the distinct grid cells after rounding.
df['coordinates_fixed'].unique()

array([(36, 140), (37, 140), (43, 141), (35, 136), (35, 140), (37, 139),
       (36, 139), (38, 139), (36, 134), (28, 129), (38, 141), (36, 137),
       (35, 135), (35, 137), (33, 130), (34, 131), (36, 138), (44, 142),
       (37, 138), (40, 141), (36, 141), (35, 134), (34, 132), (43, 143),
       (34, 130), (34, 135), (35, 139), (33, 134), (35, 133), (32, 131),
       (35, 138), (33, 132), (26, 128), (31, 131), (39, 141), (36, 136),
       (38, 140), (33, 129), (37, 137), (34, 133), (34, 134), (33, 131),
       (35, 132), (39, 142), (41, 141), (40, 140), (34, 136), (43, 142),
       (41, 140), (32, 130), (37, 141), (42, 140), (27, 128), (34, 129),
       (42, 141), (33, 133), (39, 140), (36, 135), (27, 142), (25, 125),
       (34, 137), (36, 133), (31, 130), (43, 140), (45, 142), (27, 129),
       (32, 132), (43, 144), (44, 144), (24, 124), (42, 143), (44, 145),
       (44, 143), (38, 138), (34, 140), (43, 146), (34, 139), (42, 142),
       (24, 123), (26, 127), (30, 131), (40, 142), 

In [294]:
import numpy as np
import pandas as pd
import math
from math import sqrt
from random import random
from ast import literal_eval as make_tuple

df_fixed = {}  # not used in this cell; kept in case another cell reads it

# size_param[0]: order of the root applied to the hourly total, and also the
#                smallest radius that is still kept.
# size_param[1]: linear scale applied to each bubble's summed strength.
size_param = (sqrt(3), 100000)

# Parallel lists, one entry per bubble to draw.
in_colors = []
in_radiuses = []
in_coords = []
in_dates = []

# Total strength per hour bucket, used to normalise the bubble sizes.
# Group by the bare column name rather than a one-element list: with a list
# key, pandas >= 2.0 yields 1-tuple group names, which would not match the
# plain-string lookup max_emo[hour] below.
max_emo = {}
for hour, group in df.groupby("date_hour"):
    max_emo[hour] = group['strength'].astype(float).sum()

# One candidate bubble per (hour, grid cell, emotion) combination.
for (hour, cell, emotion), group in df.groupby(["date_hour", "coordinates_fixed", "emotion"]):
    total = group['strength'].astype(float).sum()
    # NOTE(review): ** binds tighter than /, so only the hourly total is
    # rooted, not the whole ratio — confirm this is the intended formula.
    radius = (size_param[1] * total) / max_emo[hour] ** (1 / size_param[0])
    if radius < size_param[0]:
        # Too small to be worth drawing.
        continue
    in_colors.append(colors[emotion])
    in_radiuses.append(radius)
    in_coords.append(cell)
    in_dates.append(hour)

In [282]:
# Sanity check: the four parallel lists should all have the same length.
list(map(len, [in_colors,in_radiuses,in_coords,in_dates]))

[9348, 9348, 9348, 9348]

In [266]:
# Inspect the per-bucket strength totals.
# NOTE(review): the saved output below has day-level keys, while the cell that
# builds max_emo groups by date_hour — the output presumably predates that
# change; re-run to refresh it.
max_emo

{'2018-08-27': 26889.238191342974,
 '2018-08-28': 13783.658433027224,
 '2018-08-29': 4473.967996143445,
 '2018-08-30': 1674.6215828826594,
 '2018-08-31': 37.43535278644413}

In [295]:
%load_ext autoreload
%autoreload 2
import mapgen
# Map centre handed to mapgen.plot — presumably (latitude, longitude); confirm against mapgen.
center = (38, 137)
# Draw the bubbles; presumably writes the map to the given HTML file (see mapgen).
# The four lists are parallel: one entry per bubble.
mapgen.plot("test_map_ver3.html", center, in_coords, in_radiuses, in_colors, in_dates)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


True