__Challenge Link__: https://zindi.africa/competitions/geoai-challege-location-mention-recognition-from-social-media

In the initial processing phase, I use the spaCy `en_core_web_sm` model to tokenize the input texts and convert them into a token-level format that the downstream model can consume. I made a slight adjustment to the tokenizer so that "@" is split off as a separate token. For every token I record the event name, tweet ID, word, part-of-speech tag, start offset, and (for the labeled splits) the word's target tag.

In [1]:
import os
import glob
import jsonlines
import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Record the working directory; the dataset path in a later cell is built relative to it
current_directory = os.getcwd()
print(current_directory)

C:\Users\nabar\Documents\Data_Science\Challenge\Zindi\GeoAI Challege Location Mention Recognition from Social Media by ITU\Submission


In [3]:
# Load the small English spaCy pipeline
nlp = spacy.load("en_core_web_sm")

# Add "@" to the default prefix, suffix and infix patterns so that it is
# split off as a separate token
prefixes = [*nlp.Defaults.prefixes, r'@']
suffixes = [*nlp.Defaults.suffixes, r'@']
infixes = [*nlp.Defaults.infixes, r'@']

# Compile each pattern list once and keep the compiled regexes by name
prefix_re = compile_prefix_regex(prefixes)
suffix_re = compile_suffix_regex(suffixes)
infix_re = compile_infix_regex(infixes)

# NOTE(review): the Tokenizer is built from scratch with no `rules` or
# `url_match` argument and `token_match=None`, so presumably the pipeline's
# default special-case rules and URL handling are not carried over --
# confirm this is intended before changing it, since every generated CSV
# depends on this tokenization.
custom_tokenizer = Tokenizer(
    nlp.vocab,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
    token_match=None,
)

# Swap the pipeline's tokenizer for the customised one
nlp.tokenizer = custom_tokenizer



# Read Data

In [4]:
# Build the dataset folder from its path components so the correct separator
# is used on every operating system (the hard-coded '\' only worked on Windows).
gold_data_path = os.path.join(current_directory, 'IDRISI-main', 'LMR', 'data', 'EN', 'gold-random-json')

In [5]:
# Echo the resolved dataset path so it can be checked by eye
gold_data_path

'C:\\Users\\nabar\\Documents\\Data_Science\\Challenge\\Zindi\\GeoAI Challege Location Mention Recognition from Social Media by ITU\\Submission\\IDRISI-main\\LMR\\data\\EN\\gold-random-json'

In [6]:
# One file per event folder and split: <gold_data_path>/<event>/<split file>.
# The pattern contains no '**', so glob's `recursive` flag would have no effect
# and is omitted. Results are sorted because glob's ordering depends on the
# file system; sorting keeps the row order of the generated CSVs reproducible.
train_event_list = sorted(glob.glob(os.path.join(gold_data_path, '*', 'train.jsonl')))
val_event_list = sorted(glob.glob(os.path.join(gold_data_path, '*', 'dev.jsonl')))
test_event_list = sorted(glob.glob(os.path.join(gold_data_path, '*', 'test_unlabeled.jsonl')))

In [7]:
# Sanity check: number of train.jsonl files matched by the glob
len(train_event_list)

19

# Create Train Data

In [8]:
%%time
# Initialize the result as a list of lists
result = []

for event in train_event_list:
    event_name = event.split('\\')[-2]
    with jsonlines.open(event, "r") as reader:
        for input_json in reader:
            sentence_id = input_json["tweet_id"]
            text = input_json["text"]
            location_mentions = input_json["location_mentions"]
            # Tokenize the text using spaCy
            doc = nlp(text)
            # Process the tokenized text and location mentions
            if len(location_mentions)>0:
                for token in doc:
                    start_offset = token.idx
                    end_offset = start_offset + len(token.text)
                    tag = "O"
                    for item in location_mentions:
                        start_index_gold = item['start_offset']
                        end_index_gold = item['end_offset']
                        if start_offset >=start_index_gold and end_offset<=end_index_gold:
                            tag = "LOC"
                    result.append([event_name, sentence_id, str(token.text), str(token.pos_), start_offset, tag])


CPU times: total: 2min 16s
Wall time: 3min 51s


In [9]:
# Column order matches the per-token rows collected in `result`
columns = ["Event_Name", "Sentence", "Word", "POS", "start_offset", "Tag"]

# Assemble the token table from the collected rows
df = pd.DataFrame(data=result, columns=columns)

# Write the table to disk without the index column
df.to_csv(path_or_buf='train.csv', index=False)

# Create Validation Data

In [10]:
%%time
# Initialize the result as a list of lists
result = []

for event in val_event_list:
    event_name = event.split('\\')[-2]
    with jsonlines.open(event, "r") as reader:
        for input_json in reader:
            sentence_id = input_json["tweet_id"]
            text = input_json["text"]
            location_mentions = input_json["location_mentions"]
            # Tokenize the text using spaCy
            doc = nlp(text)
            # Process the tokenized text and location mentions
            if len(location_mentions)>0:
                for token in doc:
                    start_offset = token.idx
                    end_offset = start_offset + len(token.text)
                    tag = "O"
                    for item in location_mentions:
                        start_index_gold = item['start_offset']
                        end_index_gold = item['end_offset']
                        if start_offset >=start_index_gold and end_offset<=end_index_gold:
                            tag = "LOC"
                    result.append([event_name, sentence_id, str(token.text), str(token.pos_), start_offset, tag])


CPU times: total: 16.1 s
Wall time: 31.6 s


In [11]:
# Column order matches the per-token rows collected in `result`
columns = ["Event_Name", "Sentence", "Word", "POS", "start_offset", "Tag"]

# Assemble the token table from the collected rows
df = pd.DataFrame(data=result, columns=columns)

# Write the table to disk without the index column
df.to_csv(path_or_buf='val.csv', index=False)

# Create Test Data

In [12]:
%%time
# Initialize the result as a list of lists
result = []

for event in test_event_list:
    event_name = event.split('\\')[-2]
    with jsonlines.open(event, "r") as reader:
        for input_json in reader:
            sentence_id = input_json["tweet_id"]
            text = input_json["text"]
            # Tokenize the text using spaCy
            doc = nlp(text)
            # Process the tokenized text and location mentions
            for token in doc:
                start_offset = token.idx
                result.append([event_name, sentence_id, token.text, token.pos_, start_offset])


CPU times: total: 39 s
Wall time: 1min 3s


In [13]:
# Column order matches the per-token rows collected in `result`
columns = ["Event_Name", "Sentence", "Word", "POS", "start_offset"]

# Assemble the token table from the collected rows
df = pd.DataFrame(data=result, columns=columns)

# Write the table to disk without the index column
df.to_csv(path_or_buf='test.csv', index=False)

In [14]:
#End