# Location Positive/Negative Sentence Classification
- Building a logistic regression model to detect whether a sentence contains the gold location description keywords
- Trained on the processed NASA examples (the output of Badr's model), `new_filter_train_set.csv`, and used to predict on `article_sample.tsv`.

In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV, cross_validate
from sklearn.dummy import DummyClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)
from collections import defaultdict
import geocoder
import numpy as np
import geopy.distance
from scipy.spatial.distance import euclidean

## 1. Data Preprocessing

- Get the POS/NEG label for each sentence
    - First do exact match: if `location_description` appears in `text`, mark the sentence as a positive sentence (`pos_sentence=Yes`); otherwise, mark it as a negative sentence (`pos_sentence=No`).
    - Then, for documents with no exact match, do partial match: if one or more location entities in the sentence appear in the gold label (`location_description`), mark the sentence as a positive sentence (`pos_sentence=Yes`); otherwise, mark it as a negative sentence (`pos_sentence=No`).

In [2]:
def merge_locs_dates(data):
    """Collapse the entity columns into `locations` and `dates`.

    Missing values are replaced by empty strings, then GPE/LOC are joined
    into `locations` and DATE/TIME into `dates`, each with "|" as the
    separator. The four source columns are dropped from the returned frame;
    the input frame itself is not modified.
    """
    filled = data.fillna('')  # NaN -> '' so that str.join works on every row
    pipe_join = '|'.join
    merged = filled.assign(
        locations=filled[['GPE', 'LOC']].agg(pipe_join, axis=1),
        dates=filled[['DATE', 'TIME']].agg(pipe_join, axis=1),
    )
    return merged.drop(columns=['GPE', 'LOC', 'DATE', 'TIME'])

In [3]:
def preprocess(nasa, ner):
    """
    Prepare labelled sentences for the pos sentence prediction model.

    Every sentence in `ner` is joined to its NASA record (the row position in
    `nasa` is used as the document `id`) and labelled in two passes:

    1. Exact match: the sentence is positive when the (non-blank) gold
       `location_description` occurs verbatim in `text`.
    2. Partial match: applied while a document has no positive sentence yet;
       the sentence is positive when any of its extracted location entities
       is a substring of the gold `location_description`.

    Sentences of documents that end up with no positive sentence are dropped.

    Parameters
    ----------
    nasa : pandas DataFrame
        nasa dataset containing location_description
    ner : pandas DataFrame
        dataset containing id, text, GPE, LOC, DATE, TIME columns from previous step

    Returns
    ----------
        a pandas DataFrame with id, text, location_description, locations,
        dates, pos_sentence, number_of_pos_sent, contain_pos_sent columns
    """
    nasa = nasa.reset_index()
    nasa = nasa.rename(columns={"index": "id"})  # id: index in NASA dataset
    df = pd.merge(ner, nasa, how='left', on='id')
    df = merge_locs_dates(df[['id', 'text', 'location_description', 'GPE', 'LOC', 'DATE', 'TIME']])

    # column -> {row index -> value}; the merge above yields a 0..n-1 index,
    # which is what the positional loops below rely on
    data = df.to_dict()
    n_rows = len(data['text'])

    data['pos_sentence'] = {}
    data['number_of_pos_sent'] = {}
    data['contain_pos_sent'] = {}
    n = defaultdict(int)  # document id -> number of positive sentences

    # exact match
    for i in range(n_rows):
        gold = data['location_description'][i]
        # A missing gold label is '' after fillna, and '' is a substring of
        # every string: without the non-blank check all sentences of such a
        # document would be labelled positive.
        if gold.strip() and gold in data['text'][i]:
            data['pos_sentence'][i] = 'Yes'
            n[data['id'][i]] += 1
        else:
            data['pos_sentence'][i] = 'No'

    # partial match
    # NOTE(review): `n` is updated inside this loop, so once one sentence of a
    # document matches, the remaining sentences of that document are skipped
    # and keep their 'No' label. Kept as-is to preserve the existing labels;
    # confirm whether every partially matching sentence should be positive.
    for i in range(n_rows):
        doc_id = data['id'][i]
        if n[doc_id] == 0:
            locs = list(filter(None, data['locations'][i].split("|")))
            if any(loc in data['location_description'][i] for loc in locs):
                data['pos_sentence'][i] = 'Yes'
                n[doc_id] += 1

    # attach the per-document count and a flag telling whether the document
    # contains the gold place name at all
    for i in range(n_rows):
        count = n[data['id'][i]]
        data['number_of_pos_sent'][i] = count
        data['contain_pos_sent'][i] = count > 0

    labelled = pd.DataFrame(data)
    # astype(bool) keeps the mask a boolean indexer even when the frame is empty
    return labelled[labelled['contain_pos_sent'].astype(bool)]

In [4]:
# Load the NASA landslide catalog (provides the gold `location_description`)
# and the sentence-level file with the extracted GPE/LOC/DATE/TIME entities.
nasa = pd.read_csv("../data/nasa_global_landslide_catalog_point.csv")
ner = pd.read_csv('../data/new_filter_train_set.csv', index_col=0) 
# Label each sentence Yes/No; documents without any positive sentence are dropped.
data = preprocess(nasa, ner)
data

Unnamed: 0,id,text,location_description,locations,dates,pos_sentence,number_of_pos_sent,contain_pos_sent
0,0,Reported By: | Edited By: |Source: ANI |Update...,Dabhol village of Maharashtra's Ratnagiri dist...,|,"Aug 04 , 2015|11:39 AM IST",No,1,True
1,0,At least 12 people are feared to be buried und...,Dabhol village of Maharashtra's Ratnagiri dist...,Dabhol|Maharashtra|Ratnagiri|,|,Yes,1,True
2,0,The landslide is said to have taken place at 3...,Dabhol village of Maharashtra's Ratnagiri dist...,|,|3:30 am,No,1,True
3,0,Ratnagiri (Maharashtra): Landslide occurred at...,Dabhol village of Maharashtra's Ratnagiri dist...,Ratnagiri|Maharashtra|,|3 : 30 am today,No,1,True
4,0,"pic.twitter.com/MOcmTxnAZX June 22, 2015",Dabhol village of Maharashtra's Ratnagiri dist...,|,"June 22 , 2015|",No,1,True
...,...,...,...,...,...,...,...,...
26128,35646,"Meanwhile, the Kanchanjunga Academy Higher Sec...",Phidim-Raanke section of the Mechi Highway at ...,Phidim-1|,|,No,1,True
26129,35649,Follow us on landslide blocks doda batote high...,"Chakwa bridge, 9 kilomters short of Doda",Jammu|,|,No,1,True
26130,35649,The highway was blocked after a landslide hit ...,"Chakwa bridge, 9 kilomters short of Doda",Doda|,|last evening,Yes,1,True
26131,35649,He said that the work to open the highway was ...,"Chakwa bridge, 9 kilomters short of Doda",|,today|evening,No,1,True


In [5]:
# The per-document counters are repeated on every sentence row, so reduce to
# one row per document before summarising.
describe_df = data[['id', 'number_of_pos_sent', 'contain_pos_sent']].drop_duplicates()
print(f"Distribution of number of positive sentence each document has: \n{describe_df['number_of_pos_sent'].value_counts().sort_index()}\n")
print(f"How many documents contain gold place name: \n{describe_df['contain_pos_sent'].value_counts()}")

Distribution of number of positive sentence each document has: 
1     2498
2       74
3       29
4       11
5        6
6        6
7        2
8        1
9        4
10       3
11       1
12       1
13       4
18       1
19       1
Name: number_of_pos_sent, dtype: int64

How many documents contain gold place name: 
True    2642
Name: contain_pos_sent, dtype: int64


## 2. Train the Model

In [6]:
def train(data):
    """Fit the pos sentence classifier and evaluate it on a held-out split.

    `data` must provide the `text` and `pos_sentence` columns. 20% of the
    rows are held out (random_state=123); a TF-IDF (unigrams + bigrams) ->
    class-balanced logistic regression pipeline is fitted on the rest.

    Returns the fitted pipeline and the classification report (a string)
    computed on the held-out rows.
    """
    train_part, test_part = train_test_split(data, test_size=0.20, random_state=123)

    vectorizer = TfidfVectorizer(ngram_range=(1,2))
    classifier = LogisticRegression(max_iter=2000, class_weight='balanced')
    model = make_pipeline(vectorizer, classifier)
    model.fit(train_part["text"], train_part["pos_sentence"])

    held_out_pred = model.predict(test_part["text"])
    test_scores = classification_report(test_part["pos_sentence"], held_out_pred)

    return model, test_scores

In [7]:
# Fit the sentence classifier on the labelled sentences; `test_scores` is the
# classification report for the held-out 20% split.
model, test_scores = train(data)

In [8]:
# print() is needed here: the report is a pre-formatted multi-line string.
print(test_scores)

              precision    recall  f1-score   support

          No       0.93      0.83      0.87      3871
         Yes       0.34      0.58      0.43       600

    accuracy                           0.79      4471
   macro avg       0.63      0.70      0.65      4471
weighted avg       0.85      0.79      0.81      4471



## 3. Prediction

In [9]:
def get_distance(p1, p2):
    """Geodesic distance between two points in kilometres, rounded to 3 decimals.

    Returns None when either point is missing (None or otherwise falsy).
    """
    if not (p1 and p2):
        return None
    kilometres = geopy.distance.geodesic(p1, p2).km
    return round(kilometres, 3)

In [10]:
def get_outlier_idx(centroid, points):
    """
    Find the point that lies farthest from the centroid.

    Parameters:
        centroid: a tuple of centroid;
        points: a list of tuples
    Return:
        the index of the point that should be removed (the first one in
        case of a tie)
    """
    from_centroid = []
    for candidate in points:
        from_centroid.append(get_distance(centroid, candidate))
    # max() with a key keeps the first of several equally distant points
    return max(range(len(from_centroid)), key=from_centroid.__getitem__)

In [11]:
def get_smallest_region_idx(locs):
    """
    Find the location whose bounding box has the shortest diagonal.

    Parameters
    ----------
    locs : list of dictionary
        one dictionary per location entity in the positive sentence; the
        'northeast' and 'southwest' entries (bounding-box corners) are the
        ones read here

    Returns
    ----------
        an integer indicating the index of the location entity
        that has the smallest region (the first one in case of a tie)
    """
    diagonals = []
    for region in locs:
        diagonals.append(get_distance(region['northeast'], region['southwest']))
    # min() with a key keeps the first of several equally small regions
    return min(range(len(diagonals)), key=diagonals.__getitem__)

In [12]:
def _geocode_locations(names):
    """Geocode location names, keeping every name paired with its own result.

    Returns a list of ``(name, geo)`` tuples for the names that could be
    geocoded; ``geo`` is a dict with 'lat', 'lng', 'northeast', 'southwest'.
    Names whose lookup fails or comes back incomplete are left out.
    """
    candidates = []
    for name in names:
        geocoded = geocoder.arcgis(name).json
        if not geocoded:
            continue
        bbox = geocoded.get('bbox') or {}
        lat, lng = geocoded.get('lat'), geocoded.get('lng')
        if lat is None or lng is None or 'northeast' not in bbox or 'southwest' not in bbox:
            # incomplete result: treat it as a failed lookup instead of
            # raising KeyError further down
            continue
        candidates.append((name, {
            'lat': lat,
            'lng': lng,
            'northeast': bbox['northeast'],
            'southwest': bbox['southwest'],
        }))
    return candidates


def _select_location(candidates):
    """Pick the most specific ``(name, geo)`` candidate, or None if there is none.

    With more than two candidates the one farthest from the centroid is
    discarded first; among the rest the smallest bounding box wins.
    """
    if not candidates:
        return None
    if len(candidates) == 1:
        return candidates[0]
    if len(candidates) > 2:
        # remove the farthest outlier
        points = [(geo['lat'], geo['lng']) for _, geo in candidates]
        centroid = (
            np.mean([lat for lat, _ in points]),
            np.mean([lng for _, lng in points]),
        )
        outlier = get_outlier_idx(centroid, points)
        candidates = candidates[:outlier] + candidates[outlier + 1:]
    smallest = get_smallest_region_idx([geo for _, geo in candidates])
    return candidates[smallest]


def predict(df, model):   # df=example, model=best_model
    """Get the most likely locations, latitude, longitude based on pred model

    For every document, the sentence with the highest predicted probability
    among the sentences that have at least one extracted location is chosen.
    Its location entities are geocoded and the most specific one is reported.

    Parameters
    ----------
    df:
        a data frame containing document ID (id) and tokenized sentences (text) for each document,
        extracted location entities (GPE, LOC), and extracted date entities (DATE, TIME)
    model:
        the prediction model (logistic model trained on NASA dataset)

    Returns
    -------
        a data frame indexed by document id with locations (all entities of
        the chosen sentence), location (the most likely one), latitude,
        longitude, diameter_km (bounding-box diagonal). Values are missing
        for documents without usable locations.
    """
    # Fresh frame with a unique 0..n-1 index, so an idxmax label always
    # identifies exactly one row.
    df = merge_locs_dates(df).reset_index(drop=True)

    # Column 1 is the second entry of model.classes_ -- 'Yes' for a model
    # fitted on 'No'/'Yes' labels (assumes the sorted class order; verify if
    # the labels change). assign() avoids in-place mutation, so no pandas
    # warning option has to be switched off globally.
    df = df.assign(predict_proba=model.predict_proba(df['text'])[:, 1])

    result = {'locations': {},
              'location': {},
              'latitude': {},
              'longitude': {},
              'diameter_km': {}}

    for doc_id, doc_rows in df.groupby('id'):
        # "|" means both GPE and LOC were empty for that sentence
        with_locs = doc_rows.loc[doc_rows['locations'] != "|", 'predict_proba']

        locations, chosen = None, None
        if with_locs.empty:
            print(f"All locations in document {doc_id} are empty!")
        else:
            locations = doc_rows.at[with_locs.idxmax(), 'locations']
            names = list(filter(None, locations.split("|")))
            chosen = _select_location(_geocode_locations(names))
            if chosen is None:
                print(f"Locations in document {doc_id} cannot be geocoded!")

        if chosen is None:
            location, lat, lng, ne, sw = None, None, None, None, None
        else:
            location, geo = chosen
            lat, lng = geo['lat'], geo['lng']
            ne, sw = geo['northeast'], geo['southwest']

        # store the locations, latitude, longitude in result dict
        result['locations'][doc_id] = locations
        result['location'][doc_id] = location
        result['latitude'][doc_id], result['longitude'][doc_id] = lat, lng
        result['diameter_km'][doc_id] = get_distance(ne, sw)

    return pd.DataFrame(result)

In [13]:
# Predict on the test sample file. predict() geocodes the extracted locations
# through geocoder.arcgis, so this cell needs network access.
example = pd.read_csv('../data/test_examples_100.csv', index_col=0)
results = predict(example, model) 
results

All locations in document 76 are empty!
All locations in document 94 are empty!


Unnamed: 0,locations,location,latitude,longitude,diameter_km
0,Cortez|Telluride|,Telluride,37.935800,-107.848050,1.416
1,Lillooet|British Columbia|the Fraser River,Lillooet,50.683333,-121.933330,14.497
3,Kavalapara|Kerala|Malappuram|,Kavalapara,10.768050,76.302740,3.111
4,Baglung|,Baglung,28.271890,83.589760,5.921
5,Vancouver|,Vancouver,49.261636,-123.113350,21.267
...,...,...,...,...,...
100,Gunma Prefecture|Tomioka|,Tomioka,36.259624,138.889437,45.978
101,Sorsogon|Albay|Catanduanes|Northern Samar|East...,Sorsogon,12.984820,123.997150,34.399
102,Palawan|Caraga|Visayas|Davao Region,Caraga,7.331090,126.562110,39.382
103,West Seattle|Alki Beach|Seattle|Bellevue|Wilbu...,Alki Beach,47.592020,-122.382380,1.342


In [14]:
# Save the predictions; the frame's index (the document id) is written as the first column.
results.to_csv('test_locations_v2.csv')