In [67]:
# for analysis and data processing
import pandas as pd
import numpy as np
import seaborn as sns
import ppscore as pps
import datetime as dt
import re
import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.util import minibatch, compounding
import random
from bs4 import BeautifulSoup
import requests
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.tag import StanfordNERTagger
import torch
from tqdm import tqdm
from transformers import BertTokenizer, BertForTokenClassification, pipeline
import warnings
warnings.filterwarnings("ignore")

In [50]:
def read_data(path_to_csv_file):
    '''
    Reads csv files from specified paths

    Parameters:
            path to csv file locations
    Returns:
            dataframes of imported csv files
    '''
    # Read CSV files
    df = pd.read_csv(path_to_csv_file)
    
    return df

# call the function to import train data - update with relevant paths
train_df = read_data('/Users/mncedisimncwabe/Downloads/Train.csv')
test_df = read_data('/Users/mncedisimncwabe/Downloads/Test.csv')

In [7]:
train_df.head()

Unnamed: 0,tweet_id,text,location
0,ID_1001136212718088192,,EllicottCity
1,ID_1001136696589631488,"Flash floods struck a Maryland city on Sunday,...",Maryland
2,ID_1001136950345109504,State of emergency declared for Maryland flood...,Maryland
3,ID_1001137334056833024,Other parts of Maryland also saw significant d...,Baltimore Maryland
4,ID_1001138374923579392,"Catastrophic Flooding Slams Ellicott City, Mar...",Ellicott City Maryland


In [51]:
# Handle null values
train_df['text'].fillna('', inplace=True)
train_df.dropna(subset=['location'], inplace=True)
test_df['text'].fillna('', inplace=True)

In [52]:
# Function to clean the text data
def clean_text(text):
    if isinstance(text, str):
        text = re.sub(r'http\S+', '', text)  # Remove URLs
        text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # Remove special characters
        text = re.sub(r'\s+', ' ', text).strip()  # Remove leading/trailing/extra spaces
        return text
    return ""

# Apply cleaning function to the test data
train_df['cleaned_text'] = test_df['text'].astype(str).apply(clean_text)
test_df['cleaned_text'] = test_df['text'].astype(str).apply(clean_text)
train_df.head()


Unnamed: 0,tweet_id,text,location,cleaned_text
0,ID_1001136212718088192,,EllicottCity,What is happening to the infrastructure in New...
1,ID_1001136696589631488,"Flash floods struck a Maryland city on Sunday,...",Maryland,SOLDER MISSING IN FLOOD PRAY FOR EDDISON HERMO...
2,ID_1001136950345109504,State of emergency declared for Maryland flood...,Maryland,RT TIME Police searching for missing person af...
3,ID_1001137334056833024,Other parts of Maryland also saw significant d...,Baltimore Maryland,Flash Flood Tears Through Maryland Town For Se...
4,ID_1001138374923579392,"Catastrophic Flooding Slams Ellicott City, Mar...",Ellicott City Maryland,Ellicott City FLOODING Pictures Maryland Gover...


In [54]:
# Split the training data
X_train, X_test, y_train, y_test = train_test_split(train_df['cleaned_text'], train_df['location'], test_size=0.3, random_state=42)
print(X_train.shape)
print(X_test.shape)

(30422,)
(13038,)


In [56]:
# Load the pre-trained BERT model for token classification
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
model = BertForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [63]:
# Create a pipeline for NER
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [64]:
# Function to extract location mentions using the BERT model
def extract_locations_batch(texts):
    ner_results = nlp(texts)
    locations = []
    for result in ner_results:
        locs = [res['word'] for res in result if 'LOC' in res['entity_group']]
        locations.append(' '.join(locs) if locs else "None")
    return locations

In [68]:
# Apply the model on X_test in batches
batch_size = 32  # Adjust batch size according to your system's capacity
X_test_pred = []

for i in tqdm(range(0, len(X_test), batch_size), desc="Processing Batches"):
    batch_texts = [str(text) for text in X_test[i:i + batch_size]]  # Ensure each text is a string
    X_test_pred.extend(extract_locations_batch(batch_texts))

# Convert predictions to a pandas Series
X_test_pred = pd.Series(X_test_pred, index=X_test.index)
print(X_test_pred.head())

Processing Batches: 100%|██████████| 408/408 [31:13<00:00,  4.59s/it]

54566    None
66329    None
56439    None
61298    None
69287    None
dtype: object





In [69]:
# Function to compute Word Error Rate (WER)
def wer(reference, hypothesis):
    r = reference.split()
    h = hypothesis.split()
    d = [[0 for x in range(len(h)+1)] for y in range(len(r)+1)]
    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                d[i][j] = j
            elif j == 0:
                d[i][j] = i
            else:
                substitution_cost = 0 if r[i-1] == h[j-1] else 1
                d[i][j] = min(d[i-1][j] + 1,      # Deletion
                              d[i][j-1] + 1,      # Insertion
                              d[i-1][j-1] + substitution_cost)  # Substitution
    return d[len(r)][len(h)]

# Calculate WER for each row in X_test
wer_scores = [wer(ref, hyp) for ref, hyp in zip(y_test, X_test_pred)]

# Output the average WER
average_wer = sum(wer_scores) / len(wer_scores)
print(f'Average WER on X_test: {average_wer}')

Average WER on X_test: 1.5473999079613439


In [70]:
# Apply the trained model on test_df in batches
test_df_pred = []
for i in tqdm(range(0, len(test_df), batch_size), desc="Processing Test Data"):
    batch_texts = [str(text) for text in test_df['cleaned_text'][i:i + batch_size]]  # Ensure each text is a string
    test_df_pred.extend(extract_locations_batch(batch_texts))

# Assign predictions to test_df
test_df['locations_pred'] = test_df_pred

Processing Test Data: 100%|██████████| 92/92 [07:51<00:00,  5.12s/it]


In [72]:
test_df.to_csv('/Users/mncedisimncwabe/Downloads/loc.csv')

In [38]:
# Download the model if not already installed
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

In [39]:
# Split the training data
X_train, X_test, y_train, y_test = train_test_split(train_df['cleaned_text'], train_df['location'], test_size=0.3, random_state=42)
print(X_train.shape),
print(X_test.shape)

(30422,)
(13038,)


In [42]:
# Function to extract location mentions
def extract_locations(text):
    text = str(text)  # Ensure the input is a string
    doc = nlp(text)
    locations = [ent.text for ent in doc.ents if ent.label_ == 'GPE' or ent.label_ == 'LOC']
    return ' '.join(locations)

In [43]:
# Apply the model on X_test
X_test_pred = X_test.apply(extract_locations)
X_test_pred.head()

54566    
66329    
56439    
61298    
69287    
Name: cleaned_text, dtype: object

In [45]:
# Function to compute Word Error Rate (WER)
def wer(reference, hypothesis):
    r = reference.split()
    h = hypothesis.split()
    d = [[0 for x in range(len(h)+1)] for y in range(len(r)+1)]
    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                d[i][j] = j
            elif j == 0:
                d[i][j] = i
            else:
                substitution_cost = 0 if r[i-1] == h[j-1] else 1
                d[i][j] = min(d[i-1][j] + 1,      # Deletion
                              d[i][j-1] + 1,      # Insertion
                              d[i-1][j-1] + substitution_cost)  # Substitution
    return d[len(r)][len(h)]

# Calculate WER for each row in X_test
wer_scores = [wer(ref, hyp) for ref, hyp in zip(y_test, X_test_pred)]

# Output the average WER
average_wer = sum(wer_scores) / len(wer_scores)
print(f'Average WER on X_test: {average_wer}')

Average WER on X_test: 1.5272281024697039


In [46]:
# Apply the trained model on test_df
test_df['locations_pred'] = test_df['cleaned_text'].apply(extract_locations)
test_df.head()

Unnamed: 0,tweet_id,text,cleaned_text,locations_pred
0,ID_1001154804658286592,What is happening to the infrastructure in New...,What is happening to the infrastructure in New...,New England New Orleans
1,ID_1001155505459486720,SOLDER MISSING IN FLOOD.. PRAY FOR EDDISON HER...,SOLDER MISSING IN FLOOD PRAY FOR EDDISON HERMO...,
2,ID_1001155756371136512,RT @TIME: Police searching for missing person ...,RT TIME Police searching for missing person af...,Ellicott City Maryland
3,ID_1001159445194399744,Flash Flood Tears Through Maryland Town For Se...,Flash Flood Tears Through Maryland Town For Se...,Ellicott City
4,ID_1001164907587538944,Ellicott City #FLOODING Pictures: Maryland Gov...,Ellicott City FLOODING Pictures Maryland Gover...,


In [47]:
test_df.to_csv('/Users/mncedisimncwabe/Downloads/locations.csv',index=False)