# 1. Modify previously harvested data

## 1.1 Read Each Document

In [1]:
import couchdb, afinn

url_connect = "http://admin:A456852s@127.0.0.1:5984"
couch = couchdb.Server(url_connect) 

db_name = "a_twitter_test"
db = couch[db_name]

Obtain each document's ID:

In [2]:
all_docs = [i for i in db.view('_all_docs', include_docs=True)]
len(all_docs)

108

Sample ID:

In [3]:
my_id = all_docs[0].id
my_id

'1382473379408539649'

Sample Doc:

In [4]:
my_doc = dict(all_docs[0].doc)
my_doc

{'_id': '1382473379408539649',
 '_rev': '2-c87cb8e7459525cd272f2302b8d9645d',
 'created_at': 'Wed Apr 14 23:18:33 +0000 2021',
 'id': 1382473379408539649,
 'id_str': '1382473379408539649',
 'text': 'Absolutely horrible first inning #Phillies',
 'truncated': False,
 'entities': {'hashtags': [{'text': 'Phillies', 'indices': [33, 42]}],
  'symbols': [],
  'user_mentions': [],
  'urls': []},
 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'},
 'source': '<a href="http://tapbots.com/tweetbot" rel="nofollow">Tweetbot for iΟS</a>',
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 76908648,
  'id_str': '76908648',
  'name': 'Chris Meister 🏏⚾️🏈🏒',
  'screen_name': 'sportmeister_',
  'location': 'Melbourne, Australia',
  'description': 'Husband, father, lover of all sports @BentleighCC',
  'url': 'https://t.co/chTa81tivY',
  'entities': {'url': {'url

## 1.2 Calculate Sentiment Score 

In [5]:
from afinn import Afinn
afinn = Afinn()
if "full_text" in my_doc:
    text = my_doc["full_text"]
else:
    text = my_doc["text"]
afinn.score(text)

-3.0

## 1.3 Keyword Search

### 1.3.1 Covid

In [6]:
import re

def mention_time(text, keyword_list, delimiter):
    return sum([1 if token.lower() in keyword_list else 0 for token in re.split(delimiter, text)])

keywords_list = dict()

delimiter = r"[^a-zA-Z0-9\-]"

# 15 Keywords
keywords_list["covid_words"] = ["covid", "covid-19", "coronavirus", "antibodies", "vaccine",
                                "antibody", "quarantine", "self-isolation", "epidemic", "immunity", 
                                "distancing", "sars-cov-2", "ventilator", "immunization", "tested"]

mention = mention_time("#coVid and #coviD-19 and ?sars-cOv-2!", keywords_list["covid_words"] , delimiter)
mention

3

### 1.3.2 COVID Vaccine

In [7]:
# 5 Keywords
keywords_list["vaccine_words"] = ["vaccine", "pfizer", "biontech", "oxford", "astrazeneca"]

mention = mention_time(">vaccine and [pfizer] and {oxford}", keywords_list["vaccine_words"], delimiter)
mention

3

### 1.3.3 Vulgar Words 

In [8]:
# 25 Keywords
keywords_list["vulgar_words"] = ["asshole", "fyfi", "wfh", "shit", "damn", "fuck", "jerkoff", 
                                  "ass", "prick", "damn", "gtfo", "stfu", "goddamn", "fxxk",
                                  "bitch", "wtf", "cunt", "cock", "dick", "frick", "pussy", 
                                  "cum", "fk", "fucker", "omfg"]

mention = mention_time("~~~fUck and #$%shit^% and damn!!", keywords_list["vulgar_words"], delimiter)
mention

3

### 1.3.4 China Related

In [9]:
# 10 Keywords
keywords_list["china_words"] = ["china", "chinese", "ccp", "mandarin", "yuan", "chinatown", 
                                "beijing", "xinjiang", "australia–china", "sino-australia"]

mention = mention_time("China and YuAN¥ and XInJiang", keywords_list["china_words"], delimiter)
mention

3

### 1.3.5 Alcohol Related

In [10]:
# 25 Keywords
keywords_list["alcohol_words"] = ["alcoholic", "drunk", "spirit", "alcohol", "cocktail", 
                                  "beer", "vodka", "corona", "bottle", "wine", "martini", "vb",
                                  "asahi", "carlsberg", "whiskey", "pub", "casino", "hooch", 
                                  "booze", "shooter", "shot", "soju", "baijiu", "brandy", "bundaberg"]

mention = mention_time("sOJu and @VodKA&&&PUB!!!", keywords_list["alcohol_words"], delimiter)
mention

3

### 1.3.6 Crime Related

In [11]:
# 25 Keywords
keywords_list["crime_words"] = ["crime", "criminal", "gun", "shoot", "robbery", "kidnap", "kidnapping"
                                "kill", "murder", "assault", "drug", "fraud", "hacking", "homicide",
                                "smuggling", "terrorism", "speeding", "theft", "prison", "sentence", 
                                "police", "felony", "killed", "drugs", "discrimination", "death"]

mention = mention_time("sOJu and @VodKA&&&PUB!!!", keywords_list["crime_words"], delimiter)
mention

0

### 1.3.7 VIC Government

In [12]:
# 5 Keywords
keywords_list["vic_words"] = ["government", "vic", "daniel", "andrew", "minister"]

mention = mention_time("GOvernment and @daniel !!andrew!!!", keywords_list["vic_words"], delimiter)
mention

3

## 1.4 Obtain SA2 Area 

For example: use docs in `twitter/city/melbourne`.

In [13]:
db_name = "twitter/city/melbourne"
db = couch[db_name]
all_docs = [i for i in db.view('_all_docs', include_docs=True)]
len(all_docs)

52619

Read grid file:

In [14]:
import json
from shapely.geometry import Polygon, MultiPolygon, Point

with open("melb.json") as file:
    grid = json.loads(file.read())["features"]

polygons = []
for loc in grid:
    if loc["geometry"]["type"] == "Polygon":
        polygons.append(Polygon(loc["geometry"]['coordinates'][0]))
    else:
        polygons.append(MultiPolygon([Polygon(each[0]) for each in loc["geometry"]['coordinates']]))
SA2_names = [loc["properties"]['SA2_NAME16'] for loc in grid]
SA2_codes_9_digits = [loc["properties"]['SA2_MAIN16'] for loc in grid]
SA2_codes_5_digits = [loc["properties"]['SA2_5DIG16'] for loc in grid]


Check SA2 Area

In [15]:
count = 0
for each in all_docs:
    if count == 5:
        break
    doc = each.doc
    if doc["coordinates"] is not None:
        count += 1
        point = Point(doc["coordinates"]["coordinates"])
        for p, n, c1, c2 in zip(polygons, SA2_names, SA2_codes_9_digits, SA2_codes_5_digits):
            if p.contains(point):
                print(n, c1, c2)
                break

Hampton Park - Lynbrook 212031305 21305
Werribee - South 213051368 21368
Werribee - West 213051468 21468
Cranbourne 212031300 21300
Berwick - South 212021294 21294


## 1.5 Complete Code for Making Modification

Take database `a_twitter_test` as exmaple:

In [16]:
%%writefile update.py

import couchdb
import json
import re
import os
from shapely.geometry import Polygon, MultiPolygon, Point
from afinn import Afinn

def mention_time(text, keyword_list, delimiter):
    return sum([1 if token.lower() in keyword_list else 0 for token in re.split(delimiter, text)])

# CouchDB URL
url_connect = "http://admin:A456852s@127.0.0.1:5984"

# Database list
cities = ["sydney", "brisbane", "melbourne", "perth", "adelaide", "canberra"]
db_names = [f"twitter/city/{city}" for city in cities]

# Split delimiter
delimiter = r"[^a-zA-Z0-9\-]"

# affin object
afinn = Afinn()

# Keywards
keywords_list = dict()
for filename in os.listdir():
    if filename.endswith("txt"):
        with open(filename, "r") as file:
            keywords_list[filename.split(".")[0]] = file.read().split()

# Grid
with open("melb.json") as file:
    grid = json.loads(file.read())["features"]
polygons = []
for loc in grid:
    if loc["geometry"]["type"] == "Polygon":
        polygons.append(Polygon(loc["geometry"]['coordinates'][0]))
    else:
        polygons.append(MultiPolygon([Polygon(each[0]) for each in loc["geometry"]['coordinates']]))
area_list = dict()
area_list["SA2_names"] = [loc["properties"]['SA2_NAME16'] for loc in grid]
area_list["SA2_codes_9_digits"] = [loc["properties"]['SA2_MAIN16'] for loc in grid]
area_list["SA2_codes_5_digits"] = [loc["properties"]['SA2_5DIG16'] for loc in grid]
            
if __name__ == "__main__":
    # Connect CouchDB
    couch = couchdb.Server(url_connect)
    
    # Get all database
    for db_name in db_names:
        db = couch[db_name]
        
        # Get all docs
        all_docs = [i for i in db.view('_all_docs', include_docs=True)]
        
        # Get each doc
        count = 1
        length = len(all_docs)
        for each_doc in all_docs:
            this_doc = dict(each_doc.doc)
            
            # Load text info
            if "full_text" in this_doc:
                this_text = this_doc["full_text"]
            elif "text" in this_doc:
                this_text = this_doc["text"]
            else:
                count += 1 
                continue
            
            # Add field: affin
            this_doc["afinn"] = afinn.score(this_text)
            
            # Add field: keywords
            for key in keywords_list:
                mention = mention_time(this_text, keywords_list[key], delimiter)
                this_doc[key] = mention
            
            # Add field: SA2
            if this_doc["coordinates"] is not None:
                point = Point(this_doc["coordinates"]["coordinates"])
                index = None
                for i, polygon in enumerate(polygons):
                    if polygon.contains(point):
                        index = i
                        break
                for key in area_list:
                    if index is not None:
                        this_doc[key] = area_list[key][index]
                    else:
                        this_doc[key] = None
            else:
                for key in area_list:
                    this_doc[key] = None
                    
            # Post each dock
            db.save(this_doc)

            # Print log
            print(f"{count}/{length} docs updated in database: {db_name}", end='\r')
            count += 1 

Overwriting update.py


# 2. Harvest Useful Tweets in 45.113.232.90

In [17]:
%timeit
import requests

url = 'http://45.113.232.90/couchdbro/twitter/_design/twitter/_view/summary'
params = {
    'start_key': '["sydney",2020,5,1]',
    'end_key': '["sydney",2020,5,1]',
    'reduce': 'false',
    'include_docs': 'true'
}
user = ("readonly", "ween7ighai9gahR6")

req = requests.get(url, params=params, auth=user)

In [18]:
%timeit
docs = json.loads(req.text)['rows']
len(docs)

63223

In [19]:
count = 1
for i in range(len(docs)):
    if docs[i]['doc']["coordinates"] is not None:
        count += 1
count

65

In [20]:
url = 'http://45.113.232.90/couchdbro/twitter/_design/twitter/_view/summary'
params = {
    'reduce': 'true',
    'include_docs': 'false',
    'group_level': 1
}
user = ("readonly", "ween7ighai9gahR6")

req2 = requests.get(url, params=params, auth=user)

In [21]:
json.loads(req2.text)

{'rows': [{'key': ['adelaide'], 'value': 30951022},
  {'key': ['brisbane'], 'value': 59529193},
  {'key': ['canberra'], 'value': 18704632},
  {'key': ['hobart'], 'value': 7349898},
  {'key': ['melbourne'], 'value': 42033167},
  {'key': ['perth'], 'value': 44301748},
  {'key': ['sydney'], 'value': 96931480}]}