In [1]:
import json
import os
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch
from scipy.special import expit
def process_json_files(input_dir, output_dir):
    """Extract geo-tagged tweets from every ``.json`` file in ``input_dir``.

    Each input file must contain a JSON list of entries shaped like
    ``{'doc': {...}, 'value': {...}}``. Entries whose ``doc`` carries an
    ``includes`` section with at least one place are reduced to a flat record
    with the keys ``id``, ``created_at``, ``sentiment``, ``text``, ``geo`` and
    ``coordinates``; all other entries are dropped. The records of one input
    file are written to ``geo_tweets_part{index}.json`` inside ``output_dir``,
    where ``index`` is whatever follows the last ``part`` in the input file's
    name (taken before the first dot).

    Args:
        input_dir: Directory containing the raw ``.json`` files.
        output_dir: Directory for the reduced files; created when missing.

    Raises:
        KeyError: If a kept entry lacks one of the mandatory fields
            (``_id``, ``data.created_at``, ``value.text``, place name/bbox).
    """
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        input_filepath = os.path.join(input_dir, filename)
        if not (os.path.isfile(input_filepath) and filename.endswith('.json')):
            continue

        index = filename.split('.')[0].split('part')[-1]
        output_filepath = os.path.join(output_dir, f'geo_tweets_part{index}.json')

        # Parse the input completely before opening the output for writing, so
        # an unreadable input does not leave an empty output file behind.
        with open(input_filepath, 'r', encoding='utf-8') as infile:
            data = json.load(infile)

        output_data = []
        for entry in data:
            doc = entry['doc']
            # Only tweets that carry at least one place can be geo-located.
            places = (doc.get('includes') or {}).get('places')
            if not places:
                continue
            place = places[0]
            output_data.append({
                'id': doc['_id'],
                'created_at': doc['data']['created_at'],
                'sentiment': doc['data'].get('sentiment', None),
                'text': entry['value']['text'],
                'geo': place['full_name'],
                'coordinates': place['geo']['bbox'],
            })

        with open(output_filepath, 'w', encoding='utf-8') as outfile:
            json.dump(output_data, outfile, indent=4)
# Replace these with the real input and output directories before running.
# input_dir holds the raw .json files; output_dir receives the reduced
# geo_tweets_part*.json files written by process_json_files().
input_dir = 'D:/geo/geoout'  
output_dir = 'D:/geo/processed_geooutput'  
process_json_files(input_dir, output_dir)


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import json
import glob

def read_json(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def write_json(data, file_path):
    """Serialize ``data`` into ``file_path`` as UTF-8 JSON with 4-space indentation."""
    with open(file_path, mode='w', encoding='utf-8') as handle:
        json.dump(data, fp=handle, indent=4)

# Replace with the directory that holds the per-part JSON files to merge.
directory_path = 'D:/geo/processed_geooutput'
# glob returns matches in arbitrary order; sorting makes the order of the
# merged records reproducible from run to run.
files = sorted(glob.glob(f'{directory_path}/*.json'))
merged_data = []

# Every input file holds a JSON list; concatenate them into one flat list.
for file_path in files:
    data = read_json(file_path)
    merged_data.extend(data)

# Replace with the path the merged JSON file should be written to.
output_file_path = 'D:/geo/merged_tweets.json'

write_json(merged_data, output_file_path)


In [1]:
import json
import os
import re
# NOTE: despite the "_directory" names, both paths end in ".json". The merge
# cell writes 'D:/geo/merged_tweets.json' as a single file, and the topic
# classification cell reads 'D:/geo/filtered_tweets.json' as a single file.
input_directory = 'D:/geo/merged_tweets.json'
output_directory = 'D:/geo/filtered_tweets.json'
# Emoji ranges: https://unicode.org/emoji/charts/emoji-list.html
# Compiled once here rather than on every call.
# A single catch-all range U+24C2..U+1F251 is deliberately avoided: it would
# also cover CJK ideographs, kana and Hangul, so ordinary non-Latin text would
# be reported as containing emoji. The symbol blocks inside that span are
# listed individually instead.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2"             # circled M
    "\U000025A0-\U000026FF"  # Geometric Shapes, Miscellaneous Symbols
    "\U00002934\U00002935"   # curved arrows
    "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # wavy dash, part alternation mark, circled ideographs
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "\U0001F000-\U0001F251"  # mahjong/dominoes/cards, enclosed alphanumerics and ideographs, flags
    "]"
)


def contains_emoji(text):
    """Tell whether ``text`` contains at least one emoji or pictographic symbol.

    Args:
        text: The string to inspect.

    Returns:
        bool: ``True`` if any character of ``text`` falls into one of the
        emoji/symbol ranges of ``_EMOJI_PATTERN``, ``False`` otherwise
        (including for the empty string). Letters of any script, digits and
        ordinary punctuation never count as emoji.
    """
    return _EMOJI_PATTERN.search(text) is not None
# The configured paths may name a single JSON file (the merge cell writes its
# result as one file) or a directory of JSON files. Build the list of
# (input, output) pairs accordingly instead of assuming a directory, which
# would make os.listdir() fail on a file path.
if os.path.isdir(input_directory):
    os.makedirs(output_directory, exist_ok=True)
    file_pairs = [
        (os.path.join(input_directory, filename), os.path.join(output_directory, filename))
        for filename in sorted(os.listdir(input_directory))
        if filename.endswith('.json')
    ]
else:
    # Single-file mode: only make sure the output file's parent folder exists.
    output_parent = os.path.dirname(output_directory)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)
    file_pairs = [(input_directory, output_directory)]

for input_file_path, output_file_path in file_pairs:
    filename = os.path.basename(input_file_path)

    with open(input_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Keep a tweet only if it has no emoji, is not tagged with the state-wide
    # place "Victoria, Australia", is longer than 50 characters and has a
    # sentiment value.
    filtered_data = [
        item for item in data
        if not contains_emoji(item['text']) and
        item['geo'] != "Victoria, Australia" and
        len(item['text']) > 50 and
        item['sentiment'] is not None
    ]

    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(filtered_data, outfile, indent=4, ensure_ascii=False)

    print(f"Filtered data from {filename} has been saved to {output_file_path}.")



KeyboardInterrupt



In [2]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import expit
def load_json(file_path):
    """Return the parsed content of the UTF-8 encoded JSON file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        payload = source.read()
    return json.loads(payload)

def save_json(data, file_path):
    """Write ``data`` to ``file_path`` as UTF-8 JSON indented by four spaces."""
    with open(file_path, mode='w', encoding='utf-8') as sink:
        json.dump(data, fp=sink, indent=4)

# Global initialization of model and tokenizer: loaded once when this cell
# runs, then read as module-level names by toot_topic_classification().
device = torch.device("cpu")
model_path = "cardiffnlp/tweet-topic-21-multi"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
# Lookup from classifier output index to label name.
class_mapping = model.config.id2label

# Simplified toot_topic_classification function
def toot_topic_classification(toot):
    """Return the topic labels the classifier assigns to ``toot``.

    The text is tokenized (truncated to 512 tokens) and run through the
    module-level ``model``. Each logit is squashed with a sigmoid, and every
    label whose score is at least 0.5 is reported, so the result may be empty
    or hold several labels.

    Args:
        toot: The text to classify.

    Returns:
        list: Label names looked up in ``class_mapping``, in ascending order
        of their output index.
    """
    tokens = tokenizer(toot, return_tensors='pt', max_length=512, truncation=True).to(device)
    # Inference only: no_grad() avoids building an autograd graph per text.
    with torch.no_grad():
        output = model(**tokens)
    # .cpu() is a no-op on the CPU device but keeps .numpy() valid should
    # ``device`` ever be switched to an accelerator.
    scores = expit(output["logits"][0].detach().cpu().numpy())
    return [class_mapping[i] for i, score in enumerate(scores) if score >= 0.5]


def process_json_files(input_filepath, output_filepath):
    """Annotate the records of one JSON file with their predicted topics.

    Loads the records stored at ``input_filepath``, stores the result of
    ``toot_topic_classification`` under ``'topics'`` for every record that has
    a ``'text'`` key, and saves all records (annotated or not) to
    ``output_filepath``.

    NOTE: this shares its name with the directory-based helper defined in the
    first cell and replaces it once this cell has run.
    """
    records = load_json(input_filepath)
    for record in records:
        if 'text' not in record.keys():
            continue
        # Attach the predicted topics to the JSON object itself.
        record['topics'] = toot_topic_classification(record['text'])
    save_json(records, output_filepath)

# Specify the input and output file paths: the JSON file whose records get
# topic labels, and the file the annotated records are saved to.
input_filepath = 'D:/geo/filtered_tweets.json'
output_filepath = 'D:/geo/final.json'

process_json_files(input_filepath, output_filepath)

KeyboardInterrupt: 

In [None]:
import json
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import expit

def load_json(file_path):
    """Read the UTF-8 encoded JSON file at ``file_path`` and return the decoded object."""
    with open(file_path, mode='r', encoding='utf-8') as reader:
        contents = reader.read()
    return json.loads(contents)

def save_json(data, file_path):
    """Write ``data`` to ``file_path`` as 4-space-indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(file_path, mode='w', encoding='utf-8') as writer:
        json.dump(data, fp=writer, indent=4, ensure_ascii=False)

# Holds the (tokenizer, model, label mapping) triple per model path, so the
# weights are loaded once instead of once per classified tweet.
_TOPIC_MODEL_CACHE = {}


def _load_topic_model(model_path, device):
    """Load tokenizer, model and label mapping for ``model_path``, reusing a cached copy."""
    if model_path not in _TOPIC_MODEL_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
        _TOPIC_MODEL_CACHE[model_path] = (tokenizer, model, model.config.id2label)
    return _TOPIC_MODEL_CACHE[model_path]


def tweet_topic_classification(tweet):
    """Return the topic labels predicted for ``tweet``.

    The text is tokenized (truncated to 512 tokens) and passed through the
    "cardiffnlp/tweet-topic-21-multi" sequence classifier on the CPU. Each
    logit is squashed with a sigmoid, and every label scoring at least 0.5 is
    returned, so the result may be empty or hold several labels.

    Args:
        tweet: The text to classify.

    Returns:
        list: Label names from the model's ``id2label`` mapping, in ascending
        order of their output index.
    """
    device = torch.device("cpu")  # Set to CPU to avoid issues on non-CUDA environments
    tokenizer, model, class_mapping = _load_topic_model("cardiffnlp/tweet-topic-21-multi", device)

    tokens = tokenizer(tweet, return_tensors='pt', max_length=512, truncation=True).to(device)
    # Inference only: no_grad() avoids building an autograd graph per tweet.
    with torch.no_grad():
        output = model(**tokens)
    scores = expit(output["logits"][0].detach().cpu().numpy())

    return [class_mapping[i] for i, score in enumerate(scores) if score >= 0.5]

def process_directory(input_directory, output_directory):
    """Add predicted topics to every ``.json`` file of ``input_directory``.

    For each file name ending in ``.json``, the records are loaded, every
    record with a ``'text'`` key gets a ``'topics'`` entry from
    ``tweet_topic_classification``, and the records are saved under the same
    file name in ``output_directory`` (created first if it does not exist).
    A progress line is printed per file.
    """
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    json_names = [name for name in os.listdir(input_directory) if name.endswith('.json')]
    for name in json_names:
        source_path = os.path.join(input_directory, name)
        target_path = os.path.join(output_directory, name)

        records = load_json(source_path)
        for record in records:
            if 'text' not in record:
                continue
            record['topics'] = tweet_topic_classification(record['text'])

        save_json(records, target_path)
        print(f"Processed {name} and saved to {target_path}")

# Specify the input and output directories: every .json file found in
# input_directory is annotated and saved under the same name in
# output_directory.
input_directory = 'D:/geo/test2'
output_directory = 'D:/geo/topic2'

process_directory(input_directory, output_directory)


In [1]:
import geopandas as gpd
import json
from shapely.geometry import Point

# Replace with the path to the SA2 boundary shapefile.
fp = "D:/COMP90024_2024_ASMT2_Group12/data/SA2-Map/SA2_2021_AUST_GDA2020.shp"
sa2_gdf = gpd.read_file(fp)
# Reproject the boundaries to EPSG:4326 so they share a CRS with the tweet points.
sa2_gdf = sa2_gdf.to_crs(epsg=4326)

# The other cells of this notebook write their JSON with encoding='utf-8';
# read with the same encoding instead of the platform default.
with open('tweet.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Build one record per tweet for the GeoDataFrame.
records = []
for entry in data:
    coordinates = entry['coordinates']
    # Centre of the bounding box. The indexing treats the bbox as
    # [lon, lat, lon, lat] -- presumably [west, south, east, north]; TODO confirm.
    lon = (coordinates[0] + coordinates[2]) / 2
    lat = (coordinates[1] + coordinates[3]) / 2
    point = Point(lon, lat)
    record = {
        'id': entry['id'],
        'created_at': entry['created_at'],
        'sentiment': entry['sentiment'],
        'text': entry['text'],
        'geo': entry['geo'],
        'topics': entry['topics'],
        'coordinates': coordinates,
        'geometry': point
    }
    records.append(record)

gdf = gpd.GeoDataFrame(records, geometry='geometry', crs="EPSG:4326")
# NOTE(review): the nearest-neighbour search runs on lon/lat degrees here;
# consider a projected CRS if accurate distances matter -- confirm.
gdf_with_sa2 = gpd.sjoin_nearest(gdf, sa2_gdf, how='left', distance_col='distance')
# A point equidistant to several areas yields one row per area; keep a single
# match per tweet so no tweet is written out twice.
gdf_with_sa2 = gdf_with_sa2[~gdf_with_sa2.index.duplicated(keep='first')]
# Keep the tweet fields plus the matched suburb name (SA2_NAME21).
gdf_with_sa2 = gdf_with_sa2[['id', 'created_at', 'sentiment', 'text', 'geo', 'topics', 'coordinates', 'SA2_NAME21', 'geometry']]

# Geometry objects are not JSON serializable, so drop them before exporting.
output_data = gdf_with_sa2.drop(columns='geometry').to_dict(orient='records')

with open('suburb_centre.json', 'w', encoding='utf-8') as json_output_file:
    json.dump(output_data, json_output_file, indent=2)




