# Location Positive/Negative Sentence Classification
- Building a logistic regression model to detect whether a sentence contains the gold location description keywords
- Train and predict on the processed NASA example (through Badr's model) `new_filter_train_set.csv`

In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV, cross_validate
from sklearn.dummy import DummyClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)
from collections import defaultdict
import geocoder
import numpy as np
import geopy.distance
from scipy.spatial.distance import euclidean

## 1. Data Preprocessing

In [2]:
nasa = pd.read_csv("../data/nasa_global_landslide_catalog_point.csv")
nasa = nasa.reset_index()
nasa = nasa.rename(columns={"index": "id"})  # id: index in NASA dataset

label = pd.read_csv('../data/new_filter_train_set.csv', index_col=0)

df = pd.merge(label, nasa, how='left', on='id')

In [3]:
new_df = df[['id', 'text', 'location_description', 'GPE', 'LOC', 'DATE', 'TIME']]
new_df = new_df.fillna('')  # replace NAN with empty string
new_df['locations'] = new_df[['GPE', 'LOC']].agg('|'.join, axis=1)  # join GPE and LOC by |
new_df['dates'] = new_df[['DATE','TIME']].agg('|'.join, axis=1)  # join DATE and TIME by |
new_df = new_df.drop(columns=['GPE', 'LOC', 'DATE', 'TIME'])  # keep only the joined column
new_df

Unnamed: 0,id,text,location_description,locations,dates
0,0,Reported By: | Edited By: |Source: ANI |Update...,Dabhol village of Maharashtra's Ratnagiri dist...,|,"Aug 04 , 2015|11:39 AM IST"
1,0,At least 12 people are feared to be buried und...,Dabhol village of Maharashtra's Ratnagiri dist...,Dabhol|Maharashtra|Ratnagiri|,|
2,0,The landslide is said to have taken place at 3...,Dabhol village of Maharashtra's Ratnagiri dist...,|,|3:30 am
3,0,Ratnagiri (Maharashtra): Landslide occurred at...,Dabhol village of Maharashtra's Ratnagiri dist...,Ratnagiri|Maharashtra|,|3 : 30 am today
4,0,"pic.twitter.com/MOcmTxnAZX June 22, 2015",Dabhol village of Maharashtra's Ratnagiri dist...,|,"June 22 , 2015|"
...,...,...,...,...,...
26128,35646,"Meanwhile, the Kanchanjunga Academy Higher Sec...",Phidim-Raanke section of the Mechi Highway at ...,Phidim-1|,|
26129,35649,Follow us on landslide blocks doda batote high...,"Chakwa bridge, 9 kilomters short of Doda",Jammu|,|
26130,35649,The highway was blocked after a landslide hit ...,"Chakwa bridge, 9 kilomters short of Doda",Doda|,|last evening
26131,35649,He said that the work to open the highway was ...,"Chakwa bridge, 9 kilomters short of Doda",|,today|evening


In [4]:
print(f"total number of documnets: {np.count_nonzero(new_df['id'].unique())}")

total number of documnets: 3208


- Get the POS/NEG label for each sentence
    - First do exact match: if `location_description` appears in `text`, mark the sentence as a positive sentence (`pos_setence=Yes`), otherwise, mark the sentence as a negative sentence (`pos_setence=No`).
    - Then do partial match: if one or more location entities in the sentence appear in the gold label (`location_description`), mark the sentence as a positive sentence (`pos_setence=Yes`), otherwise, mark the sentence as a negative sentence (`pos_setence=No`).

In [5]:
data = new_df.to_dict()  # transform dataframe into dictionary

data['pos_sentence'] = {}
n = defaultdict(int)
data['number_of_pos_sent'] = {}
data['contain_pos_sent'] = {}

# exact match
# iterate over each sentence in the data
for i in range(len(data['text'])):
    if data['location_description'][i] in data['text'][i]:
        data['pos_sentence'][i] = 'Yes'
        n[data['id'][i]] += 1
    else:
        data['pos_sentence'][i] = 'No'
        
# partial match
for i in range(len(data['text'])):
    if n[data['id'][i]] == 0:
        locs = list(filter(None, data['locations'][i].split("|"))) 
        if any(loc in data['location_description'][i] for loc in locs):
            data['pos_sentence'][i] = 'Yes'
            n[data['id'][i]] += 1
        else:
            data['pos_sentence'][i] = 'No'

# count how many pos_sentence each document has
# count how many documents contain gold place name, how many doesn't
for i in range(len(data['text'])):
    if n[data['id'][i]] == 0:
        data['number_of_pos_sent'][i] = 0
        data['contain_pos_sent'][i] = False
    else:
        data['number_of_pos_sent'][i] = n[data['id'][i]]
        data['contain_pos_sent'][i] = True

In [6]:
data_df = pd.DataFrame(data)
data_df

Unnamed: 0,id,text,location_description,locations,dates,pos_sentence,number_of_pos_sent,contain_pos_sent
0,0,Reported By: | Edited By: |Source: ANI |Update...,Dabhol village of Maharashtra's Ratnagiri dist...,|,"Aug 04 , 2015|11:39 AM IST",No,1,True
1,0,At least 12 people are feared to be buried und...,Dabhol village of Maharashtra's Ratnagiri dist...,Dabhol|Maharashtra|Ratnagiri|,|,Yes,1,True
2,0,The landslide is said to have taken place at 3...,Dabhol village of Maharashtra's Ratnagiri dist...,|,|3:30 am,No,1,True
3,0,Ratnagiri (Maharashtra): Landslide occurred at...,Dabhol village of Maharashtra's Ratnagiri dist...,Ratnagiri|Maharashtra|,|3 : 30 am today,No,1,True
4,0,"pic.twitter.com/MOcmTxnAZX June 22, 2015",Dabhol village of Maharashtra's Ratnagiri dist...,|,"June 22 , 2015|",No,1,True
...,...,...,...,...,...,...,...,...
26128,35646,"Meanwhile, the Kanchanjunga Academy Higher Sec...",Phidim-Raanke section of the Mechi Highway at ...,Phidim-1|,|,No,1,True
26129,35649,Follow us on landslide blocks doda batote high...,"Chakwa bridge, 9 kilomters short of Doda",Jammu|,|,No,1,True
26130,35649,The highway was blocked after a landslide hit ...,"Chakwa bridge, 9 kilomters short of Doda",Doda|,|last evening,Yes,1,True
26131,35649,He said that the work to open the highway was ...,"Chakwa bridge, 9 kilomters short of Doda",|,today|evening,No,1,True


In [7]:
describe_df = data_df[['id', 'number_of_pos_sent', 'contain_pos_sent']].drop_duplicates()
print(f"Distribution of number of positive sentence each document has: \n{describe_df['number_of_pos_sent'].value_counts().sort_index()}\n")
print(f"How many documents contain gold place name: \n{describe_df['contain_pos_sent'].value_counts()}")

Distribution of number of positive sentence each document has: 
0      567
1     2498
2       74
3       29
4       11
5        6
6        6
7        2
8        1
9        4
10       3
11       1
12       1
13       4
18       1
19       1
Name: number_of_pos_sent, dtype: int64

How many documents contain gold place name: 
True     2642
False     567
Name: contain_pos_sent, dtype: int64


In [8]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26133 entries, 0 to 26132
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    26133 non-null  int64 
 1   text                  26133 non-null  object
 2   location_description  26133 non-null  object
 3   locations             26133 non-null  object
 4   dates                 26133 non-null  object
 5   pos_sentence          26133 non-null  object
 6   number_of_pos_sent    26133 non-null  int64 
 7   contain_pos_sent      26133 non-null  bool  
dtypes: bool(1), int64(2), object(5)
memory usage: 1.6+ MB


In [9]:
data_df.shape

(26133, 8)

In [10]:
data_df['pos_sentence'].value_counts()

No     23090
Yes     3043
Name: pos_sentence, dtype: int64

In [11]:
data = data_df[data_df['contain_pos_sent']]  # delete documents that doesn't have gold location_description in the text

## 2. Prepration for Model

In [12]:
df_train, df_test = train_test_split(data, test_size=0.20, shuffle=False, random_state=123)

X_train, y_train = df_train["text"], df_train["pos_sentence"]  # change text into masked_text and try again
X_test, y_test = df_test["text"], df_test["pos_sentence"]

## 3. Train the best model

In [13]:
best_model = make_pipeline(TfidfVectorizer(ngram_range=(1,2)), LogisticRegression(max_iter=2000, class_weight='balanced'))
best_model.fit(X_train, y_train)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(ngram_range=(1, 2))),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=2000))])

In [14]:
print(classification_report(y_test, best_model.predict(X_test)))  # precision for yes is important

              precision    recall  f1-score   support

          No       0.92      0.82      0.87      3826
         Yes       0.35      0.60      0.44       645

    accuracy                           0.78      4471
   macro avg       0.64      0.71      0.66      4471
weighted avg       0.84      0.78      0.81      4471



## 4. Generate prediction results
### 4.1 Example preparation
- Use the test set to evaluate the prediction results

In [15]:
best_model.classes_

array(['No', 'Yes'], dtype=object)

In [16]:
best_model.predict_proba(X_test[:20])[:,1]  # probability of being a positive sentence

array([0.17006319, 0.36382099, 0.60410395, 0.70408078, 0.42688206,
       0.31807105, 0.25624586, 0.87002064, 0.58640418, 0.17677036,
       0.21567752, 0.84952536, 0.25241687, 0.52955796, 0.44922223,
       0.71060289, 0.29952627, 0.2381091 , 0.87641206, 0.61065415])

In [17]:
example = df_test  # data_df[:996]
example['number_of_pos_sent'].value_counts().sort_index()

1     4226
2       80
3       65
4       37
5       14
9        9
10      10
11      11
19      19
Name: number_of_pos_sent, dtype: int64

In [18]:
example

Unnamed: 0,id,text,location_description,locations,dates,pos_sentence,number_of_pos_sent,contain_pos_sent
20739,27116,"Imphal, September 22 2017:","near the Khongjaron Khunthak village, Tamei Ro...",Imphal|,September 22 2017|,No,1,True
20740,27116,Incessant rain in the past few days has severe...,"near the Khongjaron Khunthak village, Tamei Ro...",Noney|,the past few days|,No,1,True
20741,27116,While most places in the valley districts of t...,"near the Khongjaron Khunthak village, Tamei Ro...",Kakching|Thoubal|,|,No,1,True
20742,27116,Report reaching here said that a massive lands...,"near the Khongjaron Khunthak village, Tamei Ro...",Inrailong|Khongjaron|,|,Yes,1,True
20743,27116,Tamenglong �Khongsang road last night.,"near the Khongjaron Khunthak village, Tamei Ro...",Tamenglong|Khongsang|,|last night,No,1,True
...,...,...,...,...,...,...,...,...
26128,35646,"Meanwhile, the Kanchanjunga Academy Higher Sec...",Phidim-Raanke section of the Mechi Highway at ...,Phidim-1|,|,No,1,True
26129,35649,Follow us on landslide blocks doda batote high...,"Chakwa bridge, 9 kilomters short of Doda",Jammu|,|,No,1,True
26130,35649,The highway was blocked after a landslide hit ...,"Chakwa bridge, 9 kilomters short of Doda",Doda|,|last evening,Yes,1,True
26131,35649,He said that the work to open the highway was ...,"Chakwa bridge, 9 kilomters short of Doda",|,today|evening,No,1,True


### 4.2 Get the predicted results

In [19]:
def get_distance(p1, p2):
    """Get the geographical distance between two points"""
    if p1 and p2:
        return round(geopy.distance.geodesic(p1, p2).km, 3)
    else:
        return None

In [20]:
def get_outlier_idx(centroid, points):
    """
    Parameters: 
        centroid: a tuple of centroid;
        points: a list of tuples
    Return:
        the index of the point that should be removed
    """
    dists = [get_distance(centroid, point) for point in points]
    return dists.index(max(dists))

In [21]:
def get_smallest_region_idx(locs):
    """
    Parameters
    ----------
    locs : list of dictionary
        a list of dictionary containing latitude, longitude, 
        northeast point, southwest point for all the location 
        entities in the positive sentence
    
    Returns
    ----------
        an integer indicating the index of the location entity 
        that has the smallest region
    """
    dists = [get_distance(loc['northeast'], loc['southwest']) for loc in locs]
    return dists.index(min(dists))

In [22]:
def get_pred_location(df, model):   # df=example, model=best_model
    """Get the most likely locations, latitude, longitude based on pred model

    Parameters
    ----------
    df: 
        a data frame containing document ID (id) and tokenized sentences (text) for each document, 
        extracted location entities (locations), and extracted date entities (dates)
    model:
        the prediction model (logistic model trained on NASA dataset)

    Returns
    -------
        a data frame with locations, the most likely location, latitude, longitude, diameter
    """
    # get predict_proba
    pd.options.mode.chained_assignment = None   # silent warning message
    df['predict_proba'] = model.predict_proba(df['text'])[:, 1]

    result = {'locations': defaultdict(str),
              'location': defaultdict(str),
              'latitude': defaultdict(float),
              'longitude': defaultdict(float),
              'diameter_km': defaultdict(float)}

    # get a dict of idxmax for each document
    idx_max = df.groupby('id')['predict_proba'].idxmax().to_dict()

    data = df.to_dict()
    for i, idx in idx_max.items():  # i: index of the document; idx: index of the df
        # ensure the `locations` column of the `idxmax` row is not empty 
        current_proba = df.query('id == @i')['predict_proba']
        while data['locations'][idx] == "|":
            try:
                current_proba = current_proba.drop(idx)  # drop the current idxmax
                idx = current_proba.idxmax()  # get the idxmax of the rest
            except ValueError:
                print(f"All locations in document {i} are empty!") 
                idx = -1  # set idx=-1 if all locations are empty
                break
        
        # store the locations, latitude, longitude in result dict
        if idx != -1:
            result['locations'][i] = data['locations'][idx]

            locs = list(filter(None, data['locations'][idx].split("|")))
            geolocs = []
            for loc in locs:
                geocoded = geocoder.arcgis(loc).json
                if geocoded:
                    geoloc = geocoded['bbox']
                    geoloc['lat'], geoloc['lng'] = geocoded['lat'], geocoded['lng']
                    geolocs.append(geoloc)
            
            if len(geolocs) > 2:
                # remove the farthest outlier
                lats, lngs = [loc['lat'] for loc in geolocs], [loc['lng'] for loc in geolocs]
                mean_lat, mean_lng = np.mean(lats), np.mean(lngs)  # get the centroid
                x = get_outlier_idx(
                    (mean_lat, mean_lng), 
                    [(lat, lng) for lat, lng in zip(lats, lngs)]
                )
                del geolocs[x]
                del locs[x]
                # get the index of location with the smallest region
                j = get_smallest_region_idx(geolocs)
                location = locs[j]
                lat, lng = geolocs[j]['lat'], geolocs[j]['lng']
                ne, sw = geolocs[j]['northeast'], geolocs[j]['southwest']
            elif len(geolocs) == 2:
                j = get_smallest_region_idx(geolocs)
                location = locs[j]
                lat, lng = geolocs[j]['lat'], geolocs[j]['lng']
                ne, sw = geolocs[j]['northeast'], geolocs[j]['southwest']
            elif len(geolocs) == 1:
                location = locs[0]
                lat, lng = geolocs[0]['lat'], geolocs[0]['lng']
                ne, sw = geolocs[0]['northeast'], geolocs[0]['southwest']
            else:
                print(f"Locations in document {i} cannot be geocoded!")
                location, lat, lng, ne, sw = None, None, None, None, None
        else:
            result['locations'][i] = None
            location, lat, lng, ne, sw = None, None, None, None, None
        
        result['location'][i] = location
        result['latitude'][i], result['longitude'][i] = lat, lng
        result['diameter_km'][i] = get_distance(ne, sw)
    
    return pd.DataFrame(result)

In [23]:
results = get_pred_location(example, best_model) 
results

All locations in document 28784 are empty!
All locations in document 33604 are empty!


Unnamed: 0,locations,location,latitude,longitude,diameter_km
27116,Purul-Lakhamai|Senapati|Paokho|,Senapati,25.26867,94.02224,6.289
27117,Kohima|,Kohima,25.66288,94.09311,13.755
27118,Kot village|Tehri|,Kot village,29.52405,81.58881,2.945
27132,West Java|Indonesia|,West Java,-6.75000,107.50000,342.130
27133,Princeton|,Princeton,40.34954,-74.66009,3.635
...,...,...,...,...,...
35610,Mishawaka|,Mishawaka,41.66017,-86.17217,15.826
35613,Tamenglong|Daniel Kamei Tamenglong|,Tamenglong,24.96538,93.49509,47.665
35633,Surigao del Sur|DAVAO CITY|Philippines|Surigao...,DAVAO CITY,7.06574,125.61080,93.480
35646,PAANCHTHAR|Phidim-Raanke|Samdin|Phidim-2|,Phidim-Raanke,27.12615,87.75943,2.974


In [24]:
results.to_dict()['locations']

{27116: 'Purul-Lakhamai|Senapati|Paokho|',
 27117: 'Kohima|',
 27118: 'Kot village|Tehri|',
 27132: 'West Java|Indonesia|',
 27133: 'Princeton|',
 27151: '|the Rio Pita',
 27179: 'Gopeshwar|Chamoli|Farkhet|Ghat|Chamoli|',
 27186: 'Malyana|Shimla|',
 27187: 'West Bengal|',
 27189: 'Chittakunnu|Thrissur|',
 27191: 'Luopu Village|Sichuan|',
 27192: 'Cornwall|',
 27193: 'Lochaber|',
 27194: 'Sungai Mapai|Kanowit|',
 27211: 'Hoàng Phú|Nha Trang City|',
 27216: 'Hiroshima|',
 27219: 'Florence|North Carolina|Ashe County|',
 27220: 'Mae La Oon|Ward 1|',
 27223: 'Folkestone|Warren|Kent|',
 27607: 'Sogod|Mt. Panjongon',
 27608: 'Pedreira|Sao Paulo|',
 27610: 'Kangding|China|Sichuan Province|',
 27627: 'China|Yunnan Province|',
 27634: 'Othar Nala|Kohistan|',
 27640: 'Sindhupalchowk|Sindhupalchowk|',
 27643: 'Udhampur-Katra|Udhampur|',
 27649: 'Sydney|Mudgee|Mudgee|Mudgee|',
 27650: '|Western Yukon|Haines Junction|Destruction Bay|Kluane Lake',
 27651: 'Guba|Chichi|',
 27656: 'Rudraprayag|Uttarakh

In [25]:
results.size

2775

### 4.3 Evaluate the results

In [26]:
nasa = pd.read_csv("../data/nasa_global_landslide_catalog_point.csv")
nasa = nasa.rename(columns={"latitude": "gold_latitude", "longitude": "gold_longitude"})  # id: index in NASA dataset

In [32]:
eval_df = results.join(nasa[['location_description', 'location_accuracy', 'gold_latitude', 'gold_longitude']])
eval_df

Unnamed: 0,locations,location,latitude,longitude,diameter_km,location_description,location_accuracy,gold_latitude,gold_longitude
27116,Purul-Lakhamai|Senapati|Paokho|,Senapati,25.26867,94.02224,6.289,"near the Khongjaron Khunthak village, Tamei Ro...",10km,24.938948,93.496837
27117,Kohima|,Kohima,25.66288,94.09311,13.755,"P.R. Hill, Kohima, Nagaland 797005, India.",1km,25.658708,94.098536
27118,Kot village|Tehri|,Kot village,29.52405,81.58881,2.945,"SH-8, Tehri Sub-District, Tehri Garhwal, Uttar...",1km,30.359485,78.377428
27132,West Java|Indonesia|,West Java,-6.75000,107.50000,342.130,"Cipelah, Rancabali, Bandung, West Java, Indone...",5km,-7.190355,107.289860
27133,Princeton|,Princeton,40.34954,-74.66009,3.635,"Mountainside, off of Highway 5A, 8 miles from ...",1km,49.526485,-120.521235
...,...,...,...,...,...,...,...,...,...
35610,Mishawaka|,Mishawaka,41.66017,-86.17217,15.826,On County Road 27 -- The Stove Prairie Road --...,5km,40.655800,-105.374200
35613,Tamenglong|Daniel Kamei Tamenglong|,Tamenglong,24.96538,93.49509,47.665,Tamenglong Ward No-IV,50km,24.981800,93.514200
35633,Surigao del Sur|DAVAO CITY|Philippines|Surigao...,DAVAO CITY,7.06574,125.61080,93.480,"Balibadon, Surigao del Sur",5km,9.234000,126.169600
35646,PAANCHTHAR|Phidim-Raanke|Samdin|Phidim-2|,Phidim-Raanke,27.12615,87.75943,2.974,Phidim-Raanke section of the Mechi Highway at ...,10km,27.100500,87.783100


In [34]:
eval_df['location_accuracy'].value_counts()

5km        167
1km        137
10km        76
25km        70
exact       49
50km        32
unknown     24
Name: location_accuracy, dtype: int64

In [50]:
def get_distance2(pred_lat, pred_lng, gold_lat, gold_lng):
    if pd.isnull(pred_lat):
        return None
    else:
        return round(geopy.distance.geodesic((pred_lat, pred_lng), (gold_lat, gold_lng)).km, 3)

def get_correct1(pred_lat, pred_lng, gold_lat, gold_lng):
    """return a boolean that indicates whether the prediction result is correct"""
    if pd.isnull(pred_lat):
        return False
    else:
        if round(pred_lat) == round(gold_lat) or round(pred_lng) == round(gold_lng):
            return True
        else:
            return False

def get_correct2(accuracy, distance):
    output = False
    if accuracy == 'exact' and distance <= 20:
        output = True
    if accuracy == 'unknown' and distance <= 100:
        output = True
    if accuracy.endswith('km'):
        if distance <= float(accuracy[:-2]):
            output = True 
    return output

eval_df = eval_df.assign(distance_km=eval_df.apply(lambda x: get_distance2(x.latitude, x.longitude, x.gold_latitude, x.gold_longitude), axis=1))
eval_df = eval_df.assign(correct1=eval_df.apply(lambda x: get_correct1(x.latitude, x.longitude, x.gold_latitude, x.gold_longitude), axis=1))
eval_df = eval_df.assign(correct2=eval_df.apply(lambda x: get_correct2(x.location_accuracy, x.distance_km), axis=1))
    
eval_df

Unnamed: 0,locations,location,latitude,longitude,diameter_km,location_description,location_accuracy,gold_latitude,gold_longitude,distance_km,correct1,correct2
27116,Purul-Lakhamai|Senapati|Paokho|,Senapati,25.26867,94.02224,6.289,"near the Khongjaron Khunthak village, Tamei Ro...",10km,24.938948,93.496837,64.362,True,False
27117,Kohima|,Kohima,25.66288,94.09311,13.755,"P.R. Hill, Kohima, Nagaland 797005, India.",1km,25.658708,94.098536,0.714,True,True
27118,Kot village|Tehri|,Kot village,29.52405,81.58881,2.945,"SH-8, Tehri Sub-District, Tehri Garhwal, Uttar...",1km,30.359485,78.377428,323.556,True,False
27132,West Java|Indonesia|,West Java,-6.75000,107.50000,342.130,"Cipelah, Rancabali, Bandung, West Java, Indone...",5km,-7.190355,107.289860,53.952,True,False
27133,Princeton|,Princeton,40.34954,-74.66009,3.635,"Mountainside, off of Highway 5A, 8 miles from ...",1km,49.526485,-120.521235,3698.089,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
35610,Mishawaka|,Mishawaka,41.66017,-86.17217,15.826,On County Road 27 -- The Stove Prairie Road --...,5km,40.655800,-105.374200,1612.212,False,False
35613,Tamenglong|Daniel Kamei Tamenglong|,Tamenglong,24.96538,93.49509,47.665,Tamenglong Ward No-IV,50km,24.981800,93.514200,2.652,True,True
35633,Surigao del Sur|DAVAO CITY|Philippines|Surigao...,DAVAO CITY,7.06574,125.61080,93.480,"Balibadon, Surigao del Sur",5km,9.234000,126.169600,247.582,True,False
35646,PAANCHTHAR|Phidim-Raanke|Samdin|Phidim-2|,Phidim-Raanke,27.12615,87.75943,2.974,Phidim-Raanke section of the Mechi Highway at ...,10km,27.100500,87.783100,3.686,True,True


In [51]:
print(f"Prediction accuracy according to location_accuracy: {eval_df['correct2'].sum()/len(eval_df)}")

Prediction accuracy according to location_accuracy: 0.25765765765765763


In [52]:
print(f"lat/lng match accuracy: {eval_df['correct1'].sum()/len(eval_df)}")

lat/lng match accuracy: 0.6252252252252253


In [53]:
column = eval_df['distance_km']
print(f"Ratio of result within 200 km to the gold: {column[column < 200].count()/len(eval_df)}")

Ratio of result within 200 km to the gold: 0.6108108108108108


In [54]:
len(eval_df)

555