In [11]:
import csv
import json
import re
from collections import Counter
from math import sqrt, log
from scipy.stats.stats import pearsonr
# from plot_curve_fit import curve_fit, scatter_fit_plot, pie_chart, plot_map

import numpy as np
import bokeh
from math import pi

from bokeh.embed import components
from scipy import optimize
import pandas as pd
from bokeh.models import Line, ColumnDataSource
from bokeh import palettes

from bokeh.io import output_file, show
# from bokeh.transform import cumsum

from bokeh.models import LogColorMapper
from bokeh.plotting import figure

In [16]:
MIN_NUM_OF_REGION = 5
MIN_CORRELATION = 0.3
GROUP_NAME_SUB = r'(mild|medium|strongest|strong|_|-)'
# GEO_CODE_FILE_NAME = ['LGA_2017_VIC.csv', 'LGA_2017_SA.csv', 'LGA_2017_NSW.csv']
GEO_CODE_FILE_NAME = ['LGA_2017_NSW.csv']


TITLE_REPLACE_PATTERN = r'.%.'

URL = 'http://10.9.131.221:5984/'
RAW_TWEETS = 'raw_tweets'
CORPUS_VIEW_NAME = 'word_choice_result'
ALL_DOC_VIEW_FUNC = "function (doc) {\n emit(doc._id, doc); \n}"
ALL_TEXT_VIEW_FUNC = "function (doc) {\n if (doc.text != null) {\n  emit(doc._id, doc.text);\n }\n}"
HAS_GEO_VIEW_FUNC = "function (doc) {\n  if (doc.geo != null) {\n     emit(doc._id, doc.geo);\n  }\n}"

VARIANCE = 0.2

INTERESTED_FEATURE = ["Highest Year Of School Completed - Persons Aged 15 Years And Over  Not Stated %"
                       # "Youth Engagement In Work/Study Working Part-Time & Studying Full-Time %"
                      # "Persons With Post School Qualifications Advanced Diploma, Or Diploma  %",
                      # "Youth Engagement In Work/Study Working Full-Time & Studying Part-Time %"
                      ]



In [22]:
def linear_func(x, k, b):
    return k * np.asarray(x) + b


def curve_fit(x_data, y_data, function_type='linear'):
    fitted_y = None
    if function_type == 'linear':
        params, params_covariance = optimize.curve_fit(linear_func, x_data, y_data)
        fitted_y = linear_func(x_data, params[0], params[1])
    return fitted_y


def scatter_fit_plot(x, y, y_fit, label, x_name, y_name, toshow):
    x = np.asarray(x)
    y = np.asarray(y)
    y_fit = np.asarray(y_fit)

    radii = (max(max(x),max(y)) + min(min(x),min(y))) * 0.02

    colors = [
        "#%02x%02x%02x" % (int(r), int(g), 150) for r, g in zip(50+2*x, 30+2*y)
    ]


    TOOLS="hover,crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,tap,save,box_select,poly_select,lasso_select,"
    TOOLTIPS = [
        ("index", "$index"),
        ("(x,y)", "($x, $y)"),
        ("Region", "@label"),
    ]

    p = figure(title="Word Choice VS Education Status", plot_height=351,plot_width=352, x_axis_label=x_name, y_axis_label=y_name, tools=TOOLS, tooltips=TOOLTIPS,
               background_fill_color='white',
               background_fill_alpha=0.2,
               border_fill_color='white',
               border_fill_alpha=0.5)

    p.title.text_font_size = '14pt'
    p.title.text_color = 'black'
    p.title.align = 'center'
    p.xaxis.major_label_text_color = "black"
    p.xaxis.major_label_text_font_size = '12pt'
    p.yaxis.major_label_text_color = "black"
    p.yaxis.major_label_text_font_size = '12pt'

    p.scatter(x, y, fill_color=colors, fill_alpha=0.6,
              line_color=None, radius=radii,
              hover_color='red')

    line = Line(x='x', y='y', line_color="#666699", line_width=2)
    lines_source = ColumnDataSource(data=dict(x=x, y=y_fit, label=label))
    p.add_glyph(lines_source, line)
    output_file("result1" + '.html')
    if toshow:
        show(p)

    script_map, div_map = components(p)

    return script_map, div_map, p


def pie_chart(x, title, fade_palette=False):
    data = pd.Series(x).reset_index(name='value').rename(columns={'index':'country'})
    data['angle'] = data['value']/data['value'].sum() * 2*pi
    if not fade_palette:
        data['color'] = palettes.Category20c[len(x)]
    else:
        # data['color'] = ['#fffccf','#ffefc3','#ffd4aa','#feb990', '#ff906a'][0:len(x)]
        data['color'] = ['#ffc100', '#ff9a00', '#ff7400', '#ff4d00', '#ff0000'][0:len(x)]

    TOOLS = "hover,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,tap,save,box_select,poly_select,lasso_select,"

    p = figure(plot_height=375,plot_width=625, title=title, toolbar_location='right',
               tools=TOOLS, tooltips="@country: @value", x_range=(-0.5, 1.0),
               background_fill_color='white',
               background_fill_alpha=0.2,
               border_fill_color='white',
               border_fill_alpha=0.5)

    p.wedge(x=0, y=1, radius=0.4,
            start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
            line_color="white", fill_color='color', legend='country', source=data)

    p.title.align = 'center'
    p.title.text_font_size = '14pt'
    p.title.text_color = 'black'
    p.legend.label_text_font_size = '9pt'
    p.axis.axis_label = None
    p.axis.visible = False
    p.grid.grid_line_color = None

    # output_file("pie.html")
    output_file(title + '.html')

    show(p)

    script_map, div_map = components(p)

    return script_map, div_map


def plot_map(map_list, title):
    palette = palettes.Viridis6
    # palette = palettes.OrRd
    x_list = map_list[0]
    y_list = map_list[1]
    name_list = map_list[2]
    ratio_list = map_list[3]

    palette.reverse()

    color_mapper = LogColorMapper(palette=palette)

    data = dict(
        x=x_list,
        y=y_list,
        name=name_list,
        rate=ratio_list,
        )

    TOOLS = "pan,wheel_zoom,reset,hover,save"

    p = figure(
            title=title, plot_height=400, plot_width=300, tools=TOOLS,
            x_axis_location=None, y_axis_location=None,
            tooltips=[
                ("Name", "@name"), ("Percentage)", "@rate%"), ("(Long, Lat)", "($x, $y)")
            ])

    p.grid.grid_line_color = None
    p.hover.point_policy = "follow_mouse"

    p.patches('x', 'y', source=data,
                fill_color={'field': 'rate', 'transform': color_mapper},
                fill_alpha=0.7, line_color="white", line_width=0.5)
    show(p)
    # gridplot(p_list, ncols=3)

    # p.patches('x', 'y', source=data,
    #           fill_color='#ff0000',
    #           fill_alpha={'field': 'rate', 'transform': color_mapper}, line_color="white", line_width=0.5)

    # p.patches('x', 'y', source=data)

    # show(p)
    output_file(title + '.html')
    p = None

In [18]:
def reduce_sampling_bias(result):
    total_tweets = {}
    tweets_scale = {}
    valid_result_name = []
    for item in result['rows']:
        tweets_scale[item['region_name']] = int(log(item['region_total_tweets'], 10))
        total_tweets[item['region_name']] = item['region_total_tweets']
    value = Counter(tweets_scale.values()).most_common(1)[0][0]
    for name, count in tweets_scale.items():
        if count == value:
            valid_result_name.append(name)
        elif count == value + 1 or count == value - 1:
            if total_tweets[name] > (1 - VARIANCE) * pow(10, value) and total_tweets[name] < (1 + VARIANCE) * pow(10, value):
                valid_result_name.append(name)
    return valid_result_name


def rearrange_info(result, valid_region):

    aurin_item_name = set()
    corpus_name = set()

    for item in result['rows']:
        for name in item['edu_info']:
            aurin_item_name.add(name)
        for name in item['word_choice']:
            corpus_name.add(name)

    aurin_points = {}
    corpus_points = {}
    for item in result['rows']:
        if item['region_name'] in valid_region:
            # for item in row['rows']:
            for name in aurin_item_name:
                try:
                    aurin_points[name]['value'].append(item['edu_info'][name])
                    aurin_points[name]['region'].append(item['region_name'])
                except:
                    try:
                        aurin_points[name] = {'value': [item['edu_info'][name]], 'region': [item['region_name']]}
                    except:
                        try:
                            aurin_points[name]['value'].append(None)
                        except:
                            aurin_points[name] = {'value': [None], 'region': [None]}
            for name in corpus_name:
                try:
                    corpus_points[name].append(item['word_choice'][name])
                except:
                    try:
                        corpus_points[name] = [item['word_choice'][name]]
                    except:
                        try:
                            corpus_points[name].append(None)
                        except:
                            corpus_points[name] = [None]
    print(aurin_points)
    print(corpus_points)
    return aurin_points, corpus_points


def rearrange_info_from_db(result, valid_region):

    aurin_item_name = set()
    corpus_name = set()

    for row in result.rows:
        for item in row.value['rows']:
            for name in item['edu_info']:
                aurin_item_name.add(name)
            for name in item['word_choice']:
                corpus_name.add(name)

    aurin_points = {}
    corpus_points = {}
    for row in result.rows:
        for item in row.value['rows']:
            for name in aurin_item_name:
                try:
                    aurin_points[name]['value'].append(item['edu_info'][name])
                    aurin_points[name]['region'].append(item['region_name'])
                except:
                    try:
                        aurin_points[name] = {'value': [item['edu_info'][name]], 'region': [item['region_name']]}
                    except:
                        try:
                            aurin_points[name]['value'].append(None)
                        except:
                            aurin_points[name] = {'value': [None], 'region': [None]}
            for name in corpus_name:
                try:
                    corpus_points[name].append(item['word_choice'][name])
                except:
                    try:
                        corpus_points[name] = [item['word_choice'][name]]
                    except:
                        try:
                            corpus_points[name].append(None)
                        except:
                            corpus_points[name] = [None]
    return aurin_points, corpus_points


def obtain_scatter_info(aurin_points, corpus_points):
    correlation_result = {}
    script_map_list = []
    div_map_list = []
    for aurin_name, info in aurin_points.items():
        x_list = info['value']
        region_list = info['region']
        for corpus_name, y_list in corpus_points.items():
            valid_x = []
            valid_y = []
            valid_region = []
            valid_pos = []
            if x_list and y_list:
                for i in range(len(x_list)):
                    if x_list[i] is not None and y_list[i] is not None:
                        valid_pos.append(i)

                if len(valid_pos) >= MIN_NUM_OF_REGION:
                    for i in valid_pos:
                        valid_x.append(x_list[i])
                        valid_y.append(y_list[i])
                        valid_region.append(region_list[i])
                    correlation = pearsonr(valid_x,valid_y)
                    show = False
                    if abs(correlation[0]) > MIN_CORRELATION:
                        show = True
                    correlation_result[(aurin_name,corpus_name)] = correlation
                    fitted_y = curve_fit(valid_x,valid_y)
                    corpus_name = (corpus_name.replace('_', ' ') + " (%)").title()
                    script_map, div_map, p = scatter_fit_plot(valid_x,valid_y,fitted_y,valid_region,aurin_name,corpus_name, show)

                    # if corpus_name == 'body_parts_mild' and aurin_name == 'Highest Year Of School Completed - Persons Aged 15 Years And Over  Not Stated %':
                    #     correlation_result[(aurin_name,corpus_name)] = correlation
                    #     fitted_y = curve_fit(valid_x,valid_y)
                    #     corpus_name = (corpus_name.replace('_', ' ') + " (%)").title()
                    #     script_map, div_map, p = scatter_fit_plot(valid_x,valid_y,fitted_y,valid_region,aurin_name,corpus_name,show)

                    # script_map_list.append(script_map)
                    # div_map_list.append(div_map)
                    # print(correlation_result)
    return script_map_list, div_map_list, correlation_result


def group_info(result, whole_corpus_included=False):
    group_name_pattern = re.compile(GROUP_NAME_SUB)
    group_result = {}
    for name, value in result['corpus_count'].items():
        group_name = (re.sub(group_name_pattern, '', name.replace('_', ' '))).title()
        try:
            group_result[group_name] += value
        except:
            group_result[group_name] = value
    if whole_corpus_included:
        whole_corpus_name = max(group_result, key=lambda key: group_result[key])
        total = group_result[whole_corpus_name]
        del group_result[whole_corpus_name]
        repeated = sum(group_result.values())
        group_result['Unclassified'] = total - repeated
        # print(group_result)
    return group_result


def degree_info(result, whole_corpus_included=False):
    degree_result = {}
    for name, value in result['corpus_count'].items():
        if 'mild' in name:
            degree_name = 'Mild'
        elif 'medium' in name:
            degree_name = 'Medium'
        elif 'strong' in name:
            degree_name = 'Strong'
        elif 'strongest' in name:
            degree_name = 'Strongest'
        else:
            degree_name = 'Unclassified'
        try:
            degree_result[degree_name] += value
        except:
            degree_result[degree_name] = value

    if whole_corpus_included:
        whole_corpus_name = max(degree_result, key=lambda key: degree_result[key])
        total = degree_result[whole_corpus_name]
        del degree_result[whole_corpus_name]
        repeated = sum(degree_result.values())
        degree_result['Unclassified'] = total - repeated
        # print(degree_result)
    ordered_result = {}

    if 'Unclassified' in degree_result.keys():
        ordered_result['Unclassified'] = degree_result['Unclassified']
    if 'Mild' in degree_result.keys():
        ordered_result['Mild'] = degree_result['Mild']
    if 'Medium' in degree_result.keys():
        ordered_result['Medium'] = degree_result['Medium']
    if 'Strong' in degree_result.keys():
        ordered_result['Strong'] = degree_result['Strong']
    if 'Strongest' in degree_result.keys():
        ordered_result['Strongest'] = degree_result['Strongest']
    # print(ordered_result)
    return ordered_result


def get_map_info(geoinfo, valid_name, whole_name, result, feature):
    ratio_list = []
    name_list = []
    x_large_list = []
    y_large_list = []
    print(len(geoinfo['features']))
    for polygons in geoinfo['features']:
        if polygons['properties']['lga_name17'] in whole_name:
            x_list = []
            y_list = []
            for points in polygons['geometry']['coordinates'][0]:
                for x, y in points:
                    x_list.append(x)
                    y_list.append(y)
            x_large_list.append(x_list)
            y_large_list.append(y_list)
            if polygons['properties']['lga_name17'] in valid_name:
                ratio_list.append(get_ratio(result, polygons['properties']['lga_name17'], feature))
            else:
                ratio_list.append(None)
            name_list.append(polygons['properties']['lga_name17'])
    map_list = [x_large_list, y_large_list, name_list, ratio_list]

    return map_list


def get_ratio(result, region_name, feature):
    rate = None
    for item in result['rows']:
        if item['region_name'] == region_name:
            try:
                rate = item['edu_info'][feature]
            except:
                pass
    return rate


In [None]:
# # connect to db to retrieve result
# couch_server = couchdb.Server(url=URL)
# corpus_db = couch_server[CORPUS_VIEW_NAME]
# corpus_view_path = create_view(url=URL, db_name=CORPUS_VIEW_NAME, view_name="corpus_view", mapFunc=ALL_DOC_VIEW_FUNC, overwrite=False)
# corpus_docs = corpus_db.view(corpus_view_path)
# print(corpus_docs)

In [20]:
# rearrange result info
result_file = open('result.json', 'r')
corpus_docs = json.load(result_file)

geo_file = open('geoinfo.json', 'r')
geoinfo = json.load(geo_file)
geo_file.close()

geo_code_list = []
for code in GEO_CODE_FILE_NAME:
    geo_code = {}
    code_file = open(code, 'r')
    reader = csv.reader(code_file)
    for row in reader:
        geo_code[row[1]] = row[2]
    code_file.close()
    geo_code_list.append(geo_code)

valid_region = reduce_sampling_bias(corpus_docs)
print("valid region", valid_region)

region_name_list = []
for code in geo_code_list:
    code_list = []
    for valid in valid_region:
        if valid in code.values():
            code_list.append(valid)
    region_name_list.append(code_list)

print("region name list", region_name_list)

valid region ['Frankston (C)', 'Hume (C)', 'Brimbank (C)', 'Melton (C)', 'Kingston (C) (Vic.)', 'Greater Dandenong (C)', 'Manningham (C)', 'Banyule (C)', 'Knox (C)', 'Bayside (C)', 'Casey (C)', 'Darebin (C)', 'Glen Eira (C)', 'Boroondara (C)', 'Maribyrnong (C)', 'Greater Geelong (C)', 'Hobsons Bay (C)', 'Cardinia (S)', 'Maroondah (C)', 'Adelaide Hills (DC)', 'Charles Sturt (C)', 'Marion (C)', 'Holdfast Bay (C)', 'Campbelltown (C) (SA)', 'Burnside (C)', 'Canada Bay (A)', 'Canterbury-Bankstown (A)', 'Blacktown (C)', 'Ku-ring-gai (A)', 'Campbelltown (C) (NSW)', 'Burwood (A)', 'Cumberland (A)']
region name list [['Canada Bay (A)', 'Canterbury-Bankstown (A)', 'Blacktown (C)', 'Ku-ring-gai (A)', 'Campbelltown (C) (NSW)', 'Burwood (A)', 'Cumberland (A)']]


In [23]:
# plot map distribution
list_map_list = []
for feature in INTERESTED_FEATURE:
    title = feature + ' Distribution'
    for whole_name in region_name_list:
        map_list = get_map_info(geoinfo, valid_region, whole_name, corpus_docs, feature)
        # list_map_list.append(map_list)
        plot_map(map_list, title)

306


AttributeError: unexpected attribute 'tooltips' to Figure, possible attributes are above, aspect_scale, background_fill_alpha, background_fill_color, below, border_fill_alpha, border_fill_color, css_classes, disabled, extra_x_ranges, extra_y_ranges, h_symmetry, height, hidpi, inner_height, inner_width, js_event_callbacks, js_property_callbacks, layout_height, layout_width, left, lod_factor, lod_interval, lod_threshold, lod_timeout, match_aspect, min_border, min_border_bottom, min_border_left, min_border_right, min_border_top, name, outline_line_alpha, outline_line_cap, outline_line_color, outline_line_dash, outline_line_dash_offset, outline_line_join, outline_line_width, output_backend, plot_height, plot_width, renderers, right, sizing_mode, subscribed_events, tags, title, title_location, toolbar, toolbar_location, toolbar_sticky, v_symmetry, width, x_range, x_scale, y_range or y_scale

In [None]:
# plot scatter 
script_map_scatter, div_map_scatter, correlation_result = obtain_scatter_info(edu_points, word_choice_points)
degree = degree_info(corpus_docs, True)
group = group_info(corpus_docs, True)
print(correlation_result)

In [None]:
# plot group pie chart
script_map_group_pie, div_map_group_pie = pie_chart(group, 'Offensive Word Group Pie Chart')

In [None]:
# plot degree of offensiveness pie chart
script_map_degree_pie, div_map_degree_pie = pie_chart(degree, 'Offensive Word Degree Pie Chart', True)