In [1]:
import itertools
import json
import csv
import math
import random
import re
import matplotlib.path as mplPath
import numpy as np

In [2]:
# ---- Input files ---------------------------------------------------------
AURIN_FILE_NAME = 'AURIN_Edu_2016.json'
SPATIAL_FILE_NAME = 'geoinfo.json'

# GEO_CODE_FILE_NAME and TWITTER_FILE_NAME are consumed position-by-position in
# main(): the i-th Twitter dump is matched against the codes of the i-th CSV.
# Multi-state alternative:
# GEO_CODE_FILE_NAME = ['LGA_2017_VIC.csv', 'LGA_2017_SA.csv', 'LGA_2017_NSW.csv']
# TWITTER_FILE_NAME = ['twitter_mel.json', 'twitter_adelaide.json', 'twitter_syd.json']
GEO_CODE_FILE_NAME = ['LGA_2017_NSW.csv']
TWITTER_FILE_NAME = ['twitter_syd.json']

# ---- Region selection ----------------------------------------------------
# Passed to get_valid_code() as `all_valid`.
WHOLE_REGION = False
# Number of region codes randomly sampled per geo-code file in main().
VALID_REGION_NUM = 35

# ---- Tweet filtering -----------------------------------------------------
# Regex searched for in each tweet's `created_at`; a single space matches any
# timestamp that contains a space.
VALID_YEAR = ' '

# ---- Word corpora --------------------------------------------------------
# Each entry is the stem of a '<name>.txt' word list read by hash_corpus().
corpus_name_list = [
    'bad-words',
    'body_parts_mild',
    'body_parts_medium',
    'body_parts_strong',
    'body_parts_strongest',
    'gender_identity_mild',
    'gender_identity_medium',
    'gender_identity_strong',
    'gender_identity_strongest',
    'disability_mild',
    'disability_medium',
    'disability_strong',
    'disability_strongest',
    'race_ethnicity_mild',
    'race_ethnicity_medium',
    'race_ethnicity_strong',
    'race_ethnicity_strongest',
    'sexual_reference_mild',
    'sexual_reference_medium',
    'sexual_reference_strong',
    # 'older_people_mild', 'older_people_medium',
    # 'religious_insults_strong'
]

# The first corpus doubles as the "all categories" bucket in main().
WHOLE_CORPUS = corpus_name_list[0]


In [3]:
def readability_match(file_name):
    """Map attribute names to human-readable titles for 'ratio' attributes.

    Reads the JSON metadata file at ``file_name`` and walks its
    ``selectedAttributes`` list, keeping entries whose ``stype`` is
    ``'ratio'``.

    Args:
        file_name: Path of the JSON metadata file.

    Returns:
        dict mapping each kept attribute's ``name`` to its ``title``. If the
        file cannot be read or parsed, or an entry is malformed, a message is
        printed and whatever was collected so far (possibly ``{}``) is
        returned.
    """
    # Bound before the try so the return below can never hit an unbound name.
    name_title_match = {}
    try:
        with open(file_name, 'r') as file:
            json_file = json.load(file)
        for item in json_file['selectedAttributes']:
            if item['stype'] == 'ratio':
                name_title_match[item['name']] = item['title']
    except (OSError, ValueError, KeyError, TypeError):
        # ValueError covers json.JSONDecodeError and decoding errors.
        print("Readability match failed")
    return name_title_match


def check_belonging(point, contour, radius=0.00001):
    """Report whether ``point`` lies inside any ring of ``contour``.

    Each element of ``contour`` is turned into a vertex array and tested with
    ``matplotlib.path.Path.contains_point``. Every ring is examined; the
    previous implementation returned after the first ring, so regions made of
    several polygons were only matched against their first one.

    Args:
        point: Coordinate pair, in the same axis order as the ring vertices.
        contour: Iterable of rings, each an iterable of coordinate pairs.
            NOTE(review): interior rings (holes), if present, are treated
            like any other ring — confirm that is acceptable for the data.
        radius: Tolerance forwarded to ``contains_point``.

    Returns:
        True if at least one ring contains the point, otherwise False
        (including when ``contour`` is empty).
    """
    for ring in contour:
        vertices = np.array(list(ring))
        if mplPath.Path(vertices).contains_point(point, radius=radius):
            return True
    return False


def get_tweet_coordinates(item):
    """Extract a single coordinate pair from a tweet record.

    Sources are tried in order: ``doc.geo.coordinates``, then
    ``doc.coordinates.coordinates``, then the mean of the corner points of the
    first ring of ``doc.place.bounding_box.coordinates``.

    Missing or null ``geo`` / ``coordinates`` / ``place`` / ``bounding_box``
    entries are treated as "no location" instead of raising, so one
    location-less tweet cannot abort the caller's whole run.

    NOTE(review): the pair is returned exactly as stored in whichever field
    supplied it. Twitter documents ``geo`` with a different axis order than
    ``coordinates`` and ``bounding_box``; the caller reverses every point the
    same way, so confirm the three sources agree for this dataset.

    Args:
        item: Tweet record with the tweet itself under the ``'doc'`` key.

    Returns:
        A two-element list, or None when the tweet carries no usable location.
    """
    doc = item['doc']

    geo = doc.get('geo')
    if geo:
        return geo['coordinates']

    coordinates = doc.get('coordinates')
    if coordinates:
        return coordinates['coordinates']

    place = doc.get('place') or {}
    bounding_box = place.get('bounding_box') or {}
    rings = bounding_box.get('coordinates')
    if rings and rings[0]:
        corners = rings[0]
        count = len(corners)
        # Centre of the bounding box = mean of its corner points.
        return [sum(corner[0] for corner in corners) / count,
                sum(corner[1] for corner in corners) / count]
    return None


# def get_valid_code(geoinfo, geo_code):
#     valid_code = set()
#     for item in geoinfo['features']:
#         if item['properties']['lga_code17'] in geo_code.keys():
#             valid_code.add(item['properties']['lga_code17'])
#     return valid_code


def get_valid_code(geoinfo, geo_code, all_valid=False):
    """Collect the ``lga_code17`` values of the features that should be used.

    Args:
        geoinfo: Parsed spatial document whose ``'features'`` entries each
            carry ``properties['lga_code17']``.
        geo_code: Container of accepted codes (membership is tested with
            ``in``); ignored when ``all_valid`` is true.
        all_valid: When true, keep every non-empty code found in ``geoinfo``;
            otherwise keep only codes that are also in ``geo_code``.

    Returns:
        set of selected codes.
    """
    # The per-feature debug print that used to live here was removed: it
    # flooded the output with one line per feature on every call.
    codes = (feature['properties']['lga_code17'] for feature in geoinfo['features'])
    if all_valid:
        return {code for code in codes if code}
    return {code for code in codes if code in geo_code}


def get_tweet_region(coordinate, geoinfo, valid_code):
    """Find the name of the first valid region that contains ``coordinate``.

    ``coordinate`` is reversed IN PLACE before any test is made, so the
    caller's list is modified as a side effect.

    Args:
        coordinate: Mutable two-element coordinate list.
        geoinfo: Parsed spatial document with a ``'features'`` list.
        valid_code: Container of ``lga_code17`` values to consider.

    Returns:
        The ``lga_name17`` of the first matching feature, or None.
    """
    coordinate.reverse()
    for feature in geoinfo['features']:
        properties = feature['properties']
        if properties['lga_code17'] not in valid_code:
            continue
        # Flatten one nesting level of the geometry before the containment test.
        rings = itertools.chain.from_iterable(feature['geometry']['coordinates'])
        if check_belonging(coordinate, rings):
            return properties['lga_name17']
    return None


def get_tweet_text(tweet):
    """Return the tweet's text lower-cased, with every hyphen turned into a space."""
    lowered = tweet['doc']['text'].lower()
    return ' '.join(lowered.split('-'))


def hash_corpus(corpus_names=None):
    """Compile one whole-word regex per corpus word, keyed to its corpus name.

    For every name, the file ``name + '.txt'`` is read with one word or phrase
    per line. Blank lines are skipped; previously they compiled to a pattern
    that matched any run of whitespace.

    The pattern requires the word to be preceded by whitespace or the start of
    the text, and followed by a non-word character or the end of the text.
    The previous pattern required surrounding characters on both sides, so a
    word at the very start or end of a tweet could never match.

    Args:
        corpus_names: Iterable of corpus file stems. Defaults to the
            module-level ``corpus_name_list``.

    Returns:
        dict mapping compiled pattern -> corpus name. If a file cannot be
        read, a message is printed and the entries gathered so far are
        returned.
    """
    if corpus_names is None:
        corpus_names = corpus_name_list

    corpus_hash = {}
    try:
        for name in corpus_names:
            with open(name + '.txt', 'r') as corpus_file:
                for line in corpus_file:
                    word = line.strip()
                    if not word:
                        continue
                    pattern = re.compile(r'(?:^|\s)' + re.escape(word) + r'(?:\W|$)')
                    corpus_hash[pattern] = name
    except (OSError, UnicodeDecodeError):
        print("Hashing corpus failed")
    return corpus_hash


def get_belonged_corpus(corpus_hash, text):
    """Return the category of the first pattern (in dict order) found in ``text``.

    Args:
        corpus_hash: Mapping of compiled regex pattern -> category name.
        text: String to search.

    Returns:
        The matching category, or None when no pattern matches.
    """
    matches = (label for regex, label in corpus_hash.items() if regex.search(text))
    return next(matches, None)


In [None]:
def _load_json(path):
    """Parse and return the JSON document stored in the file at ``path``."""
    with open(path, 'r') as handle:
        return json.load(handle)


def _parse_tweet_line(line):
    """Decode one raw bytes line of a tweet dump into a dict, or return None.

    Lines that open the dump (ending in ``[``) or close it (ending in ``]}``)
    are skipped. Row lines end either with a trailing comma or with ``}}``.
    Any other shape is ignored. Both ``\\r\\n`` and ``\\n`` endings are accepted.
    """
    stripped = line.rstrip(b'\r\n')
    if stripped.endswith(b']}') or stripped.endswith(b'['):
        return None
    if stripped.endswith(b','):
        stripped = stripped[:-1]
    elif not stripped.endswith(b'}}'):
        return None
    if not stripped:
        return None
    return json.loads(stripped.decode("utf-8"))


def main():
    """Classify tweets by region and word corpus and write the result files.

    Steps:
      1. Load the AURIN dataset, the geo-code CSVs, the spatial boundaries and
         the attribute-title metadata.
      2. For each geo-code file, sample up to VALID_REGION_NUM region codes.
      3. Record the AURIN 'edu_info' of every sampled region.
      4. Stream each Twitter dump (paired by position with the geo-code
         files), locate every tweet in a sampled region and count corpus hits.
      5. Convert per-region counts to percentages of that region's tweets and
         write 'result.json', 'corpus_region.json' and 'invalid_points.json'.

    Any exception raised in steps 1-4 is printed and processing continues with
    whatever was gathered up to that point, so the output files are still
    written (possibly empty).
    """
    max_rows = 180000       # stop reading a dump after this many lines
    progress_every = 10000  # progress message interval, in lines

    # Result containers live outside the try block so the reporting code
    # below never meets an undefined name after an early failure.
    region_info = {}
    corpus_count = {}
    corpus_region = {}
    region_total_tweets = {}
    invalid_points = []
    no_geo = 0      # tweets without any usable location
    no_region = 0   # located tweets that fall outside every sampled region
    in_region = 0   # tweets located inside a sampled region
    matched = 0     # in-region tweets that matched a corpus word

    try:
        # ---- load files ------------------------------------------------
        aurin_info = _load_json(AURIN_FILE_NAME)

        geo_code_list = []
        for code_path in GEO_CODE_FILE_NAME:
            geo_code = {}
            with open(code_path, 'r', newline='') as code_file:
                for row in csv.reader(code_file):
                    if len(row) > 2:  # skip blank / short rows
                        geo_code[row[1]] = row[2]
            geo_code_list.append(geo_code)

        geoinfo = _load_json(SPATIAL_FILE_NAME)
        name_title_pair = readability_match('geoinfo_meta.json')

        # ---- sample the regions to analyse -----------------------------
        valid_code_list = []
        for geo_code in geo_code_list:
            candidates = list(get_valid_code(geoinfo, geo_code, WHOLE_REGION))
            # Cap the sample size so a small state cannot make sample() raise.
            sample_size = min(VALID_REGION_NUM, len(candidates))
            valid_code_list.append(random.sample(candidates, sample_size))

        for codes in valid_code_list:
            print("Valid code: ", len(codes))

        # ---- AURIN dataset info per sampled region ---------------------
        for codes in valid_code_list:
            wanted = set(codes)
            for feature in aurin_info['features']:
                properties = feature['properties']
                if properties['lga_code17'] in wanted:
                    region_info[properties['lga_name17']] = {
                        'edu_info': {title: properties[name]
                                     for name, title in name_title_pair.items()},
                        'word_choice': {},
                    }

        corpus_hash = hash_corpus()

        # ---- stream the tweets -----------------------------------------
        for tweet_path, codes in zip(TWITTER_FILE_NAME, valid_code_list):
            with open(tweet_path, 'rb') as twitter_file:
                for rows_seen, line in enumerate(twitter_file, start=1):
                    item = _parse_tweet_line(line)
                    if item is not None:
                        created_at = item['doc']['created_at']
                        if created_at and re.search(VALID_YEAR, created_at):
                            region_name = None
                            point = get_tweet_coordinates(item)
                            if not point:
                                no_geo += 1
                            else:
                                region_name = get_tweet_region(point, geoinfo, codes)
                                if not region_name:
                                    no_region += 1
                                    invalid_points.append(point)

                            if region_name:
                                region_total_tweets[region_name] = (
                                    region_total_tweets.get(region_name, 0) + 1)
                                in_region += 1
                                category = get_belonged_corpus(corpus_hash, get_tweet_text(item))
                                if category:
                                    matched += 1
                                    corpus_count[category] = corpus_count.get(category, 0) + 1
                                    # Update the region's counters in place. The old
                                    # try/except replaced the whole per-region dict
                                    # whenever a new category appeared, discarding
                                    # every count gathered so far for that region.
                                    counts = corpus_region.setdefault(region_name, {})
                                    counts[category] = counts.get(category, 0) + 1
                                    if category != WHOLE_CORPUS:
                                        counts[WHOLE_CORPUS] = counts.get(WHOLE_CORPUS, 0) + 1

                    if rows_seen % progress_every == 0:
                        print(rows_seen, " Rows Processed")
                    if rows_seen > max_rows:
                        break

        print('no geo: ', no_geo)
        print('no valid region: ', no_region)
        print('has valid region: ', in_region)
        print('all valid: ', matched)
        print(region_total_tweets)
        print(corpus_region)
    except Exception as e:
        print(e)

    # ---- build and write the results -----------------------------------
    result_json = {'rows': [], 'corpus_count': corpus_count}
    for name, counts in corpus_region.items():
        total = region_total_tweets[name]
        # Counts become percentages of the region's tweets (in place).
        for key, num in counts.items():
            counts[key] = round(num / total * 100, 3)
        # A region present in the boundaries but absent from AURIN still gets a row.
        info = region_info.setdefault(name, {'edu_info': {}, 'word_choice': {}})
        info['word_choice'].update(counts)
        row = {'region_name': name, 'region_total_tweets': total}
        row.update(info)
        print(row)
        result_json['rows'].append(row)

    result_json = json.dumps(result_json)
    print(result_json)

    # Write the per-region percentages as real JSON; the file used to receive
    # only the region names concatenated together.
    with open('corpus_region.json', 'w') as result_corpus_region:
        json.dump(corpus_region, result_corpus_region)

    with open('result.json', 'w') as result:
        result.write(result_json)

    # Printed only: result_json was serialised above, before this total is set.
    corpus_count[WHOLE_CORPUS] = sum(corpus_count.values())
    print(corpus_count)

    with open('invalid_points.json', 'w') as invalid:
        json.dump({'points': invalid_points}, invalid)


# Run the pipeline only when this code is the program entry point, not when
# it is imported as a module.
if __name__ == '__main__':
    main()