In [1]:
# 2. Event matching: geo, time, airport name (save # of tweets to calculate confidence) 

In [2]:
from enum import Enum
import pandas as pd
from math import sin, cos, sqrt, atan2, radians
EVENT_TYPE = Enum('EVENT_TYPE', 'TERRORIST_ATTACK LABOUR_STRIKE')

EVENT_TYPE.LABOUR_STRIKE

<EVENT_TYPE.LABOUR_STRIKE: 2>

In [3]:
class Mention:
    datetime_happened = None
    datetime_reported = None
    longitude = None
    lattitude = None
    type = None
    # Please use standard system: SVO, SCO, TSE
    airport_id = None
    city_name = None
    country_name = None
    raw_description = None
    def __init__(self, type, airport_id, datetime_reported, datetime_happened=None, lattitude=None, longitude=None, city_name=None, country_name=None, raw_description=None):
        self.datetime_happened = datetime_happened
        self.datetime_reported = datetime_reported
        self.longitude = longitude
        self.lattitude = lattitude
        self.type = type
        self.airport_id = airport_id
        self.city_name = None if city_name is None else city_name.lower()
        self.country_name = None if country_name is None else country_name.lower()
        self.raw_description = None

In [4]:
class Event(Mention):
    mentions = []
    def __init__(self, m:Mention):
        super().__init__(m.type, m.airport_id, m.datetime_reported, m.datetime_happened, m.lattitude, m.longitude, m.city_name, m.country_name, None)
        self.mentions.append(m)
        
    def get_confidence():
        pass

In [5]:
airport_db = pd.read_csv('airports.csv')

In [6]:
airport_db.head()

Unnamed: 0,airport_full,city,country,iata,icao,latitude,longitude
0,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998
1,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001
2,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005
3,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977
4,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001


In [7]:
airport_db["city"] = airport_db["city"].str.lower()
airport_db["country"] = airport_db["country"].str.lower()

In [8]:
# How many cities have only one airport?
airport_names = {(str(a[1]['city']) + ' ' + str(a[1]['country'])):a[0] for a in airport_db.iterrows()}
airport_freq =  pd.DataFrame(list(airport_names.keys()), index=list(airport_names.values()))[0].value_counts()
one_airport_cities_i = [airport_names[a] for a in airport_freq.keys() if airport_freq[a] == 1]

In [37]:
airport_names

{'horovice czech republic': 1534,
 'cucuta colombia': 2576,
 'nazca peru': 4681,
 'bentonville united states': 4061,
 'mykonos greece': 1432,
 'abilene united states': 3502,
 'pecos united states': 6337,
 'khovd mongolia': 4977,
 'goetsenhoven belgium': 6300,
 'beni congo (kinshasa)': 6812,
 'escanaba united states': 5151,
 'brainerd united states': 5241,
 'warburton community australia': 6052,
 'wittmundhafen germany': 759,
 'aspen united states': 5268,
 'marfa united states': 6633,
 'reno united states': 3589,
 'aasiaat greenland': 3765,
 'scott city united states': 6736,
 'shapaja poland': 667,
 'pudasjarvi finland': 445,
 'maamigili maldives': 6291,
 'muir united states': 3577,
 'el-tor egypt': 1109,
 'gizo solomon islands': 4099,
 'perth united kingdom': 6203,
 'wyndham australia': 6110,
 'shark bay australia': 4930,
 'son-la vietnam': 5703,
 'lakeba island fiji': 4515,
 'chinchilla australia': 6354,
 'chapada dos guimaraes brazil': 6803,
 'nuernberg germany': 343,
 'neuchatel swi

In [10]:
# How many unique cities?
airport_city_freq =  airport_db['city'].value_counts()
city_one_airport_names = [a for a in airport_city_freq.keys() if airport_city_freq[a] == 1]

In [11]:
def geo_to_airport(lat, lon):
    distance_epsilon = 10 # km
    lat = radians(lat)
    lon = radians(lon)
    for airport in airport_db.iterrows():
        airport = airport[1]
        lat2 = radians(airport['latitude'])
        lon2 = radians(airport['longitude'])
        dlon = lon2 - lon
        dlat = lat2 - lat
        a = sin(dlat / 2)**2 + cos(lat) * cos(lat2) * sin(dlon / 2)**2
        c = 2 * atan2(sqrt(a), sqrt(1 - a))
        # Approximate earth radius is 6373.0
        distance = 6373.0 * c
        if distance < distance_epsilon:
            return airport.iata
    return None

In [12]:
def fill_missing_data(mention:Mention):
    if mention.datetime_happened is None:
        mention.datetime_happened = mention.datetime_reported
    if mention.airport_id is None:
        if mention.lattitude is not None and mention.longitude is not None:
            #Reverse geocodes
            mention.airport_id = geo_to_airport(mention.lattitude, mention.longitude)
        if mention.airport_id is None and mention.city_name is not None and mention.country_name is not None:
            if airport_names[mention.city_name + ' ' + mention.country_name] in one_airport_cities_i:
                # One airport in the city
                mention.airport_id = airport_db.loc[airport_names[mention.city_name + ' ' + mention.country_name]].iata
        
        if mention.airport_id is None and mention.city_name is not None and mention.city_name in city_one_airport_names:
            mention.airport_id = airport_db.loc[airport_db[airport_db['city'] == mention.city_name].index[0]].iata
            print(mention.airport_id)
    return mention

In [13]:
class Flights:
    events = []
    def add_event(self, mention:Mention):
        if mention.longitude is None and mention.lattitude is None and mention.airport_id is None and mention.city_name is None and mention.country_name is None:
            print("Not enough data in Mention! Not considering this.")
            return
        
        mention = fill_missing_data(mention)
        if len(self.events) == 0:
            print("Adding a new event_")
            self.events.append(Event(mention))
            return
        
        # Merge with some event
        TIME_EPSILON_SECONDS = 24*60*60
        
        similar_events = list(filter(
            lambda e: e.type == mention.type and
                (((e.airport_id is not None and 
                   mention.airport_id is not None and 
                   e.airport_id == mention.airport_id) or 
                (e.city_name is not None and 
                 mention.city_name is not None and
                 e.country_name is not None and mention.country_name is not None and
                 e.city_name == mention.city_name and e.country_name == mention.country_name)) and 
                abs(e.datetime_happened - mention.datetime_happened) < TIME_EPSILON_SECONDS), self.events))
        print("Found %d similar events" % len(similar_events))
        
        if len(similar_events) == 0:
            print("Adding a new event")
            self.events.append(Event(mention))
            return
        if len(similar_events) == 1:
            print("Merging with one similar event")
            similar_events[0].mentions.append(mention)
            return
        
        # weight similar events
        weights = [0 for _ in range(len(similar_events))]
        for i in range(len(similar_events)):
            e = similar_events[i]
            w = 0
            if e.airport_id == mention.airport_id:
                w += 1
            if e.city_name == mention.city_name and e.country_name == mention.country_name:
                w += 0.5
            
            w += 1/(10*abs(e.datetime_happened - mention.datetime_happened))
            weights[i] = w
        max_w = max(weights)
        indexes = [i for i, x in enumerate(weights) if x == max_w]
        if len(indexes) > 1:
            print("Warning, there are %d similar events for merging, selecting the first one" % len(indexes))
        
        similar_events[indexes[0]].mentions.append(mention)
        similar_events[indexes[0]].datetime_happened = sum([m.datetime_happened for m in similar_events[indexes[0]].mentions]) / float(len(l))
        print("Merged")

In [14]:
def test():
    f = Flights()
    mentions = [
                # This event is added as an initial one
                Mention(EVENT_TYPE.TERRORIST_ATTACK, 'SCO', 1511616649+23*60+60),
                # Another event that is not merged
                Mention(EVENT_TYPE.TERRORIST_ATTACK, 'TSE', 1511616609+5*60),
                # Time shift is OK, but type is different
                Mention(EVENT_TYPE.LABOUR_STRIKE, 'SCO', 1511616669+5*60),
                # No airport id provided
                Mention(EVENT_TYPE.TERRORIST_ATTACK, None, 1511616669 , None, 43.856578, 51.086457),
                # Too late tweet, no merge
                Mention(EVENT_TYPE.TERRORIST_ATTACK, 'SCO', 1511919749+23*60+60),
                # No geo, but city and contry are specified
                Mention(EVENT_TYPE.TERRORIST_ATTACK, None, 1511919749+23*60+60, city_name='Aktau', country_name='Kazakhstan'),
                # Only city provided!
                Mention(EVENT_TYPE.TERRORIST_ATTACK, None, 1511919749+23*60+60, city_name='Aktau'),]
    for m in mentions:
        print("=======")
        f.add_event(m)
        print("Total events:", len(f.events))
test()

Adding a new event_
Total events: 1
Found 0 similar events
Adding a new event
Total events: 2
Found 0 similar events
Adding a new event
Total events: 3
Found 1 similar events
Merging with one similar event
Total events: 3
Found 0 similar events
Adding a new event
Total events: 4
Found 1 similar events
Merging with one similar event
Total events: 4
SCO
Found 1 similar events
Merging with one similar event
Total events: 4


In [None]:
geo_to_airport(60.2979109, 25.0196232)