In [57]:
import os
import re
import string
import requests
import json
import pandas as pd
import numpy as np
import us
from pathlib import Path

from lib.lookups.geography_lookups import non_iso_3166_country_names, can_province_names, can_province_abbrev
from iso3166 import countries

paths = {
    'raw' : r'data/raw_nuforc.csv'
}

def get_state_name(x):
    try:
        x = str(us.states.lookup(x))
    except TypeError:
        x = np.nan
    return x

def get_country(name):  
    if type(name) == float:
        return None
#   Case when full string corresponds with country name.
    if search_country_name_lookup(name) != None:
        return search_country_name_lookup(name)
    
#   Case when string contains brackets and/or commas; then the partial string before first bracket and first comma
#   is the country name.
    partial_name = name
    if '(' in name:
        partial_name = partial_name[:partial_name.index('(')].strip()
        if '(' in name and not name.endswith(')'): 
            partial_name = partial_name + ')'
    if ',' in name:
        partial_name = partial_name.split(',')[0]
    if partial_name != name:
        if search_country_name_lookup(partial_name) != None:
            return search_country_name_lookup(partial_name)

#    Case when string contains country name in last bracket.
    try:
        # Regex returns all data in brackets.
        regex = re.compile(r'\((.*?)\)')
        mo = regex.findall(name)
        name_regex = mo[-1]
    #   If final bracket contains commas, take value after last comma.
        if ',' in name_regex:
            name_regex = name_regex.split(',')[-1].strip()
    #       Remove punctuation except forward slash.
        name_regex = name_regex.translate(str.maketrans('', '', string.punctuation.replace('/', '')))
        if search_country_name_lookup(name_regex) != None:
            return search_country_name_lookup(name_regex)
    except IndexError:
        return None
    
df = pd.read_csv(paths['raw'])\
.reset_index()\
.rename(columns={
    'state': 'state_abbrev',
    'city': 'original_location', 
    'shape': 'original_shape',
    'duration': 'original_duration'})

def save_debugs(df):
    try:
        locations = df.copy()
        locations\
        .sort_values(by='city', ascending=False)\
        [['original_location', 'state_abbrev', 'city', 'state']].to_excel(Path('temp') / 'locations.xlsx')
        print("Debugs saved successfuly.")
    except:
        print("Debugs failed.")

    
def search_country_name(name, lookup=geography_lookups.non_iso_3166_country_names):
    if type(name) == float:
        return None
    try:
        return lookup[name.lower()]
    except KeyError:
        try:
            return countries.get(name)[0]
        except KeyError:
            return None
        
def extract_city(location):
    if not isinstance(location, str):
        return None
    try:
        #       If name contains brackets, look at everything preceding brackets.
        regex = re.compile(r'.+?(?=\()')
        mo = regex.findall(location)
        s = mo[0]
        if '/' in s:
            return s.split('/')[0]
        elif ',' in s:
            return s.split(',')[0]
        else:
            return s
    except IndexError:
        #       If name doesn't contain brackets, take it as is and check if it's a country.
        if '/' in location:
            return location.split('/')[0]
        elif ',' in location:
            return location.split(',')[0]
        elif search_country_name(location) is not None:
            return None
        else:
            return location
                
def get_state(state_abbrev):
    if not isinstance(state_abbrev, str):
        return None
    
    state = us.states.lookup(state_abbrev)
    if state is None:
        state = can_province_names.get(state_abbrev)
    return state
    
df['city'] = df['original_location'].apply(extract_city)
df['state'] = df['state_abbrev'].apply(get_state)

In [58]:
df[df['state'] == 'lol']

Unnamed: 0,index,datetime,original_location,state_abbrev,original_shape,original_duration,summary,posted,city,state


In [23]:
for key, value in can_province_names.items():
    print(us.states.lookup(value), key)

None AB
None BC
None MB
None NB
None NL
None NS
None NT
None NU
None ON
None PE
None QC
None SK
None YT


In [38]:
x = us.states.lookup("YT")
type(x)

NoneType

In [59]:
df.iloc[75261]

index                                                            75261
datetime                                                 8/23/08 18:00
original_location                                   Wolfville (Canada)
state_abbrev                                                        NS
original_shape                                                  Sphere
original_duration                                                15min
summary              Orange metal sphere 200+' up reflecting sunlig...
posted                                                        10/31/08
city                                                        Wolfville 
state                                                      Nova Scotia
Name: 75261, dtype: object

# Debugs

In [60]:
save_debugs(df)

Debugs saved successfuly.


# Getting country data

In [5]:
# Case 1; state known, taking USA as country for granted
df = df\
.assign(state_name=df['state_abbrev'].apply(get_state_name))

df['city'] = df['original_location'].apply(get_city)
df.loc[~df['state_name'].isna(), ['country']] = 'USA'

In [10]:
# Case 2; state unknown, taking country name from last parentheses
df.loc[df['state_name'].isna(), ['country']] = df['original_location'].apply(get_country)
df['country'] = df['country'].fillna(value='unspecified')
df[~df['country'].isin(['USA', 'unspecified'])].to_csv('data/world.csv', index=False)


In [7]:
df[~df['state_abbrev'].isna()]

Unnamed: 0,index,datetime,original_location,state_abbrev,original_shape,original_duration,summary,posted
0,0,10/6/20 14:40,Alexander,NC,Cylinder,10 seconds,Translucent cylindrical silent aircraft Seen n...,11/5/20
1,1,10/6/20 11:50,Abilene,TX,Other,15 seconds,Craft with curved wings like an S shape (for t...,11/5/20
2,2,10/6/20 06:40,Tulsa,OK,Circle,4 minutes,Light came from southwest moving east northeas...,11/5/20
3,3,10/6/20 06:15,Farmers Branch,TX,Light,3-4 seconds,Fast moving object,11/5/20
4,4,10/6/20 04:30,Hubbard,OR,Flash,5-10 minutes,This morning I was sitting in my bedroom watch...,11/5/20
...,...,...,...,...,...,...,...,...
94416,94416,3/29/61 00:00,New York City (Manhattan),NY,Cross,unknown,"Strange, cross-shaped, object witnessed over N...",2/4/13
94417,94417,4/5/00 20:00,Baton Rouge,LA,Light,15 seconds,UFO report communicated by Thomas Jefferson,2/8/11
94418,94418,6/30/90 21:00,Carlisle,NY,Fireball,less than 1 minute,"Slow Moving Fireball, stench of burning sulpher.",8/7/07
94421,94421,9/1/39 20:00,Muddy River (Brookline)(Boston),MA,Changing,2-3 hours,Lost time at sight of swift moving flaming obj...,6/25/20


# Geocoding

In [3]:
world = pd.read_csv('data/world.csv', index_col=0)
world

Unnamed: 0_level_0,datetime,original_location,state_abbrev,original_shape,original_duration,summary,posted,city,state_name,country
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
58,10/1/20 06:30,Aberdeen (UK/Scotland),,Unknown,30 seconds,I was driving to work this morning and i notic...,11/5/20,Aberdeen,,Scotland
72,9/30/20 19:25,Salinas Victoria (Mexico),,Other,3 minutes,An object in the shape of a crescent moon was ...,11/5/20,Salinas Victoria,,Mexico
110,9/27/20,Brighton (UK/England),,Changing,1 hour,Stunning interdimensional craft with ET's inside,11/5/20,Brighton,,England
215,9/19/20 00:26,Tulum (Mexico),,Circle,15 minutes,Fireball Cylinder Shaped Craft over Tulum. Ss...,11/5/20,Tulum,,Mexico
343,9/9/20 00:30,Skopje (Macedonia),,Light,10 seconds,Flying object over Skopje,11/5/20,Skopje,,Macedonia
...,...,...,...,...,...,...,...,...,...,...
94401,10/13/17,Fatima (Portugal),,Disk,,"Miracle of the SunFrom Wikipedia, the free enc...",3/23/11,Fatima,,Portugal
94411,10/24/86 23:00,Maracaibo (outside of) (Venezuela),,Light,Minutes??,"Family members, and local vegetation, severely...",9/24/12,Maracaibo,,Venezuela
94420,12/11/62 21:00,"Lulworth, Dorsetshire (near) (UK/England)",,,>1 minute,Reported in a London paper in 1762: a bright l...,5/15/06,Lulworth,,England
94422,4/14/61 08:00,Nurnburg (Germany),,Cylinder,30 nins,I would think that Hanz Glaser had better thin...,8/19/12,Nurnburg,,Germany


In [6]:
len(world.query('country == "Canada"')

38