##### The `location_json.ipynb` notebook saves the results computed in `location.ipynb` to `.json` files so that the frontend can use them directly.
##### For the count-based statistics, only the data for the 10 locations with the highest tweet counts is stored, because storing every location would produce too much data. If you need more locations, change the `10` in the `.head(10)` call and in the `.index[:10]` slices in the code.
##### The comment at the top of each code section (for example `# 1. Location distribution statistics`) names the data that section produces, so you can use these comments to find the specific data you need. If you need all of the data, simply use the final `final_location_analysis_results.json` file.


In [2]:
import pandas as pd
# Location of the processed tweet CSV, relative to this notebook's directory.
file_path = '../../data/processed/tweets_with_sentiment_vader.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as a quick check that the load worked.
df.head(n=5)

Unnamed: 0,_id,author,parent,quoted,mentions,hashtags,depth,node_type,quote_ancestors,tweet_ancestors,...,horiz_offset,influence_tweet_factor,left,vert_offset,location,vert_correct,cleaned_text,dominant_topic,sentiment,weighted_sentiment
0,1134575763334680576,108577207,,1.134424e+18,"['108577207', '21475927', '254515782', '341163...","['ausvotes2019', 'auspol']",0,Root,[],[],...,0.5,0.477121,False,0.795556,"Tasmania, Australia",True,methinks scottmorrisonmp olofdawson scottjakob...,1,0.5419,0.258552
1,1163265539247968256,3112695773,,,"['3112695773', '88593058']",[],0,Root,[],[],...,0.5,3.471585,False,0.99,Unknown,True,koala cut tie alan jones significant buyer med...,3,-0.5574,-1.935062
2,1164479471346257921,2960282202,,,"['3459051', '50393960', '17596622', '155065462...","['amazonfire', 'prayfortheamazon']",0,Root,[],[],...,0.5,0.30103,True,0.572222,Unknown,True,kindly use pay firefighter jet bombardier cl 1...,5,0.1027,0.030916
3,1165088032082604038,2233234848,,,['2233234848'],['amazonfires'],0,Root,[],[],...,0.5,0.477121,False,0.785556,Unknown,True,earth talk human done amazonfires,9,0.0,0.0
4,1170123597802872834,95802989,,,['95802989'],"['stanthorpe', 'qldfires', 'tenterfield', 'nsw...",0,Root,[],[],...,0.5,1.946943,True,0.755556,Australia,True,givit supporting service responding bushfire e...,5,0.7146,1.391286


In [6]:
# Display the column labels of the loaded frame.
df.columns

Index(['_id', 'author', 'parent', 'quoted', 'mentions', 'hashtags', 'depth',
       'node_type', 'quote_ancestors', 'tweet_ancestors', 'children',
       'tree_authors', 'tree_hashtags', 'tree_users', 'author_keynode',
       'hashtag_keynode', 'valid', 'created_at', 'created_at_dt',
       'display_text', 'display_text_range', 'extended_entities',
       'favourite_count', 'lang', 'quote_count', 'reply_count',
       'retweet_count', 'urls', 'text', 'child_nodes', 'influence_tweet',
       'influence_user', 'verified', 'deidentmentions', 'deidentscreenname',
       'deidentname', 'timebucket', 'horiz_offset', 'influence_tweet_factor',
       'left', 'vert_offset', 'location', 'vert_correct', 'cleaned_text',
       'dominant_topic', 'sentiment', 'weighted_sentiment', 'date'],
      dtype='object')

In [13]:
# Accumulator for every statistic computed below: each step adds its keys
# and then dumps the whole dictionary to a JSON file.
results = dict()


In [14]:
# 1. Location distribution statistics

import json

# Tweet count per distinct location, most frequent first. Later steps take
# their top-10 location labels from this Series.
location_counts = df['location'].value_counts()
location_unique_count = df['location'].nunique()

results.update({
    'top_10_locations_by_tweet_counts': location_counts.head(10).to_dict(),
    'unique_locations_count': location_unique_count,
})

# Persist everything gathered so far as the step-1 snapshot.
with open('../../data/processed/location_analysis_step_1.json', 'w') as f:
    json.dump(results, f, indent=4)


In [25]:
import json

# Reload the step-1 snapshot to confirm it was written and parses as JSON.
with open('../../data/processed/location_analysis_step_1.json', 'r') as f:
    data = json.loads(f.read())

# Pretty-print what was read back.
print(json.dumps(data, indent=4))


{
    "top_10_locations_by_tweet_counts": {
        "Unknown": 62340,
        "Victoria, Australia": 15408,
        "New South Wales, Australia": 14884,
        "United States": 13339,
        "Australia": 13165,
        "United Kingdom": 7115,
        "Queensland, Australia": 4963,
        "South Australia, Australia": 3329,
        "India": 3261,
        "Canada": 3188
    },
    "unique_locations_count": 185
}


In [15]:
# 2. Location distribution in time

from datetime import datetime

# Parse the timestamp column and derive a calendar-day column to group on.
# Note: both assignments modify the shared `df` in place.
df['created_at_dt'] = pd.to_datetime(df['created_at_dt'])
df['date'] = df['created_at_dt'].dt.date

# Rows: day, columns: location, cells: number of tweets (0 where none).
location_time_distribution = (
    df.groupby(['date', 'location'])
    .size()
    .unstack(fill_value=0)
)

# Keep only the days from 1 December 2019 onward.
start_date = datetime(2019, 12, 1).date()
filtered_location_time_distribution = location_time_distribution.loc[start_date:]

# json.dump cannot serialise date objects as dictionary keys, so turn the
# day index into strings first.
filtered_location_time_distribution.index = filtered_location_time_distribution.index.astype(str)

# Store only the ten most frequent locations; the result maps
# location -> day -> tweet count.
results.update(
    location_time_distribution=filtered_location_time_distribution[location_counts.index[:10]].to_dict()
)

# Persist everything gathered so far as the step-2 snapshot.
with open('../../data/processed/location_analysis_step_2.json', 'w') as f:
    json.dump(results, f, indent=4)


In [26]:
# Reload the step-2 snapshot to confirm it was written and parses as JSON.
with open('../../data/processed/location_analysis_step_2.json', 'r') as f:
    data = json.loads(f.read())

# Pretty-print what was read back.
print(json.dumps(data, indent=4))

{
    "top_10_locations_by_tweet_counts": {
        "Unknown": 62340,
        "Victoria, Australia": 15408,
        "New South Wales, Australia": 14884,
        "United States": 13339,
        "Australia": 13165,
        "United Kingdom": 7115,
        "Queensland, Australia": 4963,
        "South Australia, Australia": 3329,
        "India": 3261,
        "Canada": 3188
    },
    "unique_locations_count": 185,
    "location_time_distribution": {
        "Unknown": {
            "2019-12-01": 0,
            "2019-12-02": 0,
            "2019-12-03": 1,
            "2019-12-04": 2,
            "2019-12-05": 4,
            "2019-12-06": 1,
            "2019-12-07": 2,
            "2019-12-08": 3,
            "2019-12-09": 2,
            "2019-12-10": 6,
            "2019-12-11": 3,
            "2019-12-12": 0,
            "2019-12-13": 2,
            "2019-12-14": 2,
            "2019-12-15": 2,
            "2019-12-16": 1,
            "2019-12-17": 1,
            "2019-12-18": 4,
     

In [16]:
# 3. Emotional location distribution

# Runs only when the frame has a 'sentiment' column; otherwise the snapshot
# below is written without a sentiment entry.
if 'sentiment' in df.columns:
    # Mean sentiment score per location, then narrowed to the ten most
    # frequent locations.
    location_sentiment_distribution = df.groupby('location')['sentiment'].mean()
    top_locations = location_counts.index[:10]
    top_location_sentiment_distribution = location_sentiment_distribution.loc[top_locations]

    results.update(
        sentiment_distribution_by_top_10_locations=top_location_sentiment_distribution.to_dict()
    )

# Persist everything gathered so far as the step-3 snapshot.
with open('../../data/processed/location_analysis_step_3.json', 'w') as f:
    json.dump(results, f, indent=4)

In [27]:
# Reload the step-3 snapshot to confirm it was written and parses as JSON.
with open('../../data/processed/location_analysis_step_3.json', 'r') as f:
    data = json.loads(f.read())

# Pretty-print what was read back.
print(json.dumps(data, indent=4))


{
    "top_10_locations_by_tweet_counts": {
        "Unknown": 62340,
        "Victoria, Australia": 15408,
        "New South Wales, Australia": 14884,
        "United States": 13339,
        "Australia": 13165,
        "United Kingdom": 7115,
        "Queensland, Australia": 4963,
        "South Australia, Australia": 3329,
        "India": 3261,
        "Canada": 3188
    },
    "unique_locations_count": 185,
    "location_time_distribution": {
        "Unknown": {
            "2019-12-01": 0,
            "2019-12-02": 0,
            "2019-12-03": 1,
            "2019-12-04": 2,
            "2019-12-05": 4,
            "2019-12-06": 1,
            "2019-12-07": 2,
            "2019-12-08": 3,
            "2019-12-09": 2,
            "2019-12-10": 6,
            "2019-12-11": 3,
            "2019-12-12": 0,
            "2019-12-13": 2,
            "2019-12-14": 2,
            "2019-12-15": 2,
            "2019-12-16": 1,
            "2019-12-17": 1,
            "2019-12-18": 4,
     

In [17]:
# 4. Location and topic statistics

# Rows: location, columns: dominant_topic value, cells: number of tweets
# (0 where a location has no tweets for a topic).
location_topic_distribution = (
    df.groupby(['location', 'dominant_topic'])
    .size()
    .unstack(fill_value=0)
)
top_locations = location_counts.index[:10]
top_location_topic_distribution = location_topic_distribution.loc[top_locations]

# to_dict() keys the outer level by column, so the stored mapping is
# topic -> location -> tweet count.
results.update(
    topic_distribution_across_top_10_locations=top_location_topic_distribution.to_dict()
)

# Persist everything gathered so far as the step-4 snapshot.
with open('../../data/processed/location_analysis_step_4.json', 'w') as f:
    json.dump(results, f, indent=4)

In [28]:
# Reload the step-4 snapshot to confirm it was written and parses as JSON.
with open('../../data/processed/location_analysis_step_4.json', 'r') as f:
    data = json.loads(f.read())

# Pretty-print what was read back.
print(json.dumps(data, indent=4))

{
    "top_10_locations_by_tweet_counts": {
        "Unknown": 62340,
        "Victoria, Australia": 15408,
        "New South Wales, Australia": 14884,
        "United States": 13339,
        "Australia": 13165,
        "United Kingdom": 7115,
        "Queensland, Australia": 4963,
        "South Australia, Australia": 3329,
        "India": 3261,
        "Canada": 3188
    },
    "unique_locations_count": 185,
    "location_time_distribution": {
        "Unknown": {
            "2019-12-01": 0,
            "2019-12-02": 0,
            "2019-12-03": 1,
            "2019-12-04": 2,
            "2019-12-05": 4,
            "2019-12-06": 1,
            "2019-12-07": 2,
            "2019-12-08": 3,
            "2019-12-09": 2,
            "2019-12-10": 6,
            "2019-12-11": 3,
            "2019-12-12": 0,
            "2019-12-13": 2,
            "2019-12-14": 2,
            "2019-12-15": 2,
            "2019-12-16": 1,
            "2019-12-17": 1,
            "2019-12-18": 4,
     

In [21]:
# 5. Location and tweet interaction statistics

# Mean retweet / quote / reply counts per location, each narrowed to the
# locations in `top_locations` (set by an earlier cell).
location_retweet_count = (
    df.groupby('location')['retweet_count'].mean().loc[top_locations]
)
location_quote_count = (
    df.groupby('location')['quote_count'].mean().loc[top_locations]
)
location_reply_count = (
    df.groupby('location')['reply_count'].mean().loc[top_locations]
)

# Each entry maps location -> mean count.
results.update({
    'retweet_count_by_top_10_locations': location_retweet_count.to_dict(),
    'quote_count_by_top_10_locations': location_quote_count.to_dict(),
    'reply_count_by_top_10_locations': location_reply_count.to_dict(),
})

# Persist everything gathered so far as the step-5 snapshot.
with open('../../data/processed/location_analysis_step_5.json', 'w') as f:
    json.dump(results, f, indent=4)

In [29]:
# Reload the step-5 snapshot to confirm it was written and parses as JSON.
with open('../../data/processed/location_analysis_step_5.json', 'r') as f:
    data = json.loads(f.read())

# Pretty-print what was read back.
print(json.dumps(data, indent=4))


{
    "top_10_locations_by_tweet_counts": {
        "Unknown": 62340,
        "Victoria, Australia": 15408,
        "New South Wales, Australia": 14884,
        "United States": 13339,
        "Australia": 13165,
        "United Kingdom": 7115,
        "Queensland, Australia": 4963,
        "South Australia, Australia": 3329,
        "India": 3261,
        "Canada": 3188
    },
    "unique_locations_count": 185,
    "location_time_distribution": {
        "Unknown": {
            "2019-12-01": 0,
            "2019-12-02": 0,
            "2019-12-03": 1,
            "2019-12-04": 2,
            "2019-12-05": 4,
            "2019-12-06": 1,
            "2019-12-07": 2,
            "2019-12-08": 3,
            "2019-12-09": 2,
            "2019-12-10": 6,
            "2019-12-11": 3,
            "2019-12-12": 0,
            "2019-12-13": 2,
            "2019-12-14": 2,
            "2019-12-15": 2,
            "2019-12-16": 1,
            "2019-12-17": 1,
            "2019-12-18": 4,
     

In [22]:
# 6. Statistics related to the influence of tweets

# Mean of the 'influence_tweet' and 'influence_user' columns per location,
# each narrowed to the locations in `top_locations` (set by an earlier cell).
location_influence_tweet = (
    df.groupby('location')['influence_tweet'].mean().loc[top_locations]
)
location_influence_user = (
    df.groupby('location')['influence_user'].mean().loc[top_locations]
)

# Each entry maps location -> mean value.
results.update({
    'tweet_influence_by_top_10_locations': location_influence_tweet.to_dict(),
    'user_influence_by_top_10_locations': location_influence_user.to_dict(),
})

# Persist everything gathered so far as the step-6 snapshot.
with open('../../data/processed/location_analysis_step_6.json', 'w') as f:
    json.dump(results, f, indent=4)


In [30]:
# Reload the step-6 snapshot to confirm it was written and parses as JSON.
with open('../../data/processed/location_analysis_step_6.json', 'r') as f:
    data = json.loads(f.read())

# Pretty-print what was read back.
print(json.dumps(data, indent=4))

{
    "top_10_locations_by_tweet_counts": {
        "Unknown": 62340,
        "Victoria, Australia": 15408,
        "New South Wales, Australia": 14884,
        "United States": 13339,
        "Australia": 13165,
        "United Kingdom": 7115,
        "Queensland, Australia": 4963,
        "South Australia, Australia": 3329,
        "India": 3261,
        "Canada": 3188
    },
    "unique_locations_count": 185,
    "location_time_distribution": {
        "Unknown": {
            "2019-12-01": 0,
            "2019-12-02": 0,
            "2019-12-03": 1,
            "2019-12-04": 2,
            "2019-12-05": 4,
            "2019-12-06": 1,
            "2019-12-07": 2,
            "2019-12-08": 3,
            "2019-12-09": 2,
            "2019-12-10": 6,
            "2019-12-11": 3,
            "2019-12-12": 0,
            "2019-12-13": 2,
            "2019-12-14": 2,
            "2019-12-15": 2,
            "2019-12-16": 1,
            "2019-12-17": 1,
            "2019-12-18": 4,
     

In [24]:
# 7. hashtag related statistics
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import ast
from itertools import chain

# Keep only tweets from the top locations that have a hashtags value, and
# parse that value into a real list.
# The CSV stores each hashtag list as its Python repr (e.g. "['auspol']").
# ast.literal_eval accepts only literals, whereas eval would execute any
# expression found in the file. Building the column with .assign() on the
# filtered result also avoids assigning into a slice of `df`.
top_location_df = (
    df[df['location'].isin(top_locations)]
    .dropna(subset=['hashtags'])
    .assign(hashtags=lambda frame: frame['hashtags'].apply(ast.literal_eval))
)

# For every top location, count hashtag occurrences across its tweets and
# keep the 10 most common as (hashtag, count) pairs.
location_hashtag_counts = {}
for location in top_locations:
    location_data = top_location_df[top_location_df['location'] == location]
    # chain.from_iterable flattens the per-tweet lists in linear time and
    # yields nothing for a location with no rows; summing the lists instead
    # is quadratic and returns 0 for an empty selection, which Counter rejects.
    hashtag_counter = Counter(chain.from_iterable(location_data['hashtags']))
    location_hashtag_counts[location] = hashtag_counter.most_common(10)

# json.dump writes each (hashtag, count) tuple as a two-element array.
results['hashtag_distribution_by_top_10_locations'] = location_hashtag_counts

# Save to JSON after this step
with open('../../data/processed/location_analysis_step_7.json', 'w') as f:
    json.dump(results, f, indent=4)

# Save the final combined results to a single JSON file
with open('../../data/processed/final_location_analysis_results.json', 'w') as f:
    json.dump(results, f, indent=4)

In [32]:
# Reload the combined results file to confirm it was written and parses as JSON.
with open('../../data/processed/final_location_analysis_results.json', 'r') as f:
    data = json.loads(f.read())

# Pretty-print what was read back.
print(json.dumps(data, indent=4))

{
    "top_10_locations_by_tweet_counts": {
        "Unknown": 62340,
        "Victoria, Australia": 15408,
        "New South Wales, Australia": 14884,
        "United States": 13339,
        "Australia": 13165,
        "United Kingdom": 7115,
        "Queensland, Australia": 4963,
        "South Australia, Australia": 3329,
        "India": 3261,
        "Canada": 3188
    },
    "unique_locations_count": 185,
    "location_time_distribution": {
        "Unknown": {
            "2019-12-01": 0,
            "2019-12-02": 0,
            "2019-12-03": 1,
            "2019-12-04": 2,
            "2019-12-05": 4,
            "2019-12-06": 1,
            "2019-12-07": 2,
            "2019-12-08": 3,
            "2019-12-09": 2,
            "2019-12-10": 6,
            "2019-12-11": 3,
            "2019-12-12": 0,
            "2019-12-13": 2,
            "2019-12-14": 2,
            "2019-12-15": 2,
            "2019-12-16": 1,
            "2019-12-17": 1,
            "2019-12-18": 4,
     