In [1]:
#Import packages
import pandas as pd
import numpy as np
import os
import time
import geocoder
from geopy.geocoders import Nominatim

In [2]:
#Read in BF locations data
BF_Locations = pd.read_csv(os.path.join("https://download.data.world/s/a5gdzsmxenwcwqmnbjaaizbfk4uea3"))
BF_Reports = pd.read_csv(os.path.join("https://download.data.world/s/td52m5guv2cpruwjo6pr2cd3fbk5cx"))

In [3]:
#Read in UFO data (may take a minute due to size)
UFO = pd.read_csv(os.path.join("https://download.data.world/s/lobqhnvklhbppb4ayke4b23lsupg57"))

In [4]:
#Change column name in BF locations dataframe
BF_Locations = BF_Locations.rename(columns={'number': 'report_number'})

In [5]:
#Merge two BF datasets
BF = pd.merge(BF_Locations, BF_Reports, how='left', left_on='report_number', right_on='report_number')

In [6]:
#Split BF 'title' column by report number and report
BF[['report_number2', 'report']] = BF.pop('title').str.split(pat=':', n=1, expand=True)

In [7]:
#Reformatting BF timestamp column
BF['timestamp'] = pd.to_datetime(BF['timestamp'], format = "%Y-%m-%dT%H:%M:%S.%f")

In [8]:
#Splitting BF timestamp column
BF['report_date'] = BF['timestamp'].dt.date
BF['time'] = BF['timestamp'].dt.time

In [9]:
#Removing unnecessary columns in BF dataframe
BF = BF.drop(['timestamp', 'year', 'season', 'report_class', 
              'month', 'date', 'a_g_references', 'time', 'report_number2'], axis=1)

In [10]:
#Changing dates to datetime format
BF['report_date'] = pd.to_datetime(BF['report_date'], errors='coerce')

In [11]:
#Due to the size of the UFO dataframe, I will be using only the first 1000 rows, to match the amount of rows in the BF dataframe.
UFO = UFO[:1000]

In [12]:
#Drop unnecessary columns
UFO = UFO.drop(['stats', 'report_link', 'text'], axis = 1)

In [13]:
#Remove all countries other than USA
UFO = UFO.loc[UFO["country"] == "USA"]

In [14]:
#Split 'date_time' column
UFO[['date', 'time']] = UFO.pop('date_time').str.split(pat=' ', n=1, expand=True)

In [15]:
#Changing dates to datetime format
UFO['date'] = pd.to_datetime(UFO['date'], errors='coerce')

In [16]:
#Dropping null values
UFO = UFO[UFO['date'].notna()]
UFO = UFO[UFO['city'].notna()]
UFO = UFO[UFO['state'].notna()]

In [17]:
#Capitalizing city names
UFO.city = UFO.city.str.title()

In [29]:
#Replacing any incorrect city or state names. 
UFO = UFO.replace({'city':{
    'St Augustine Pass Las Cruces':'San Augustin',
    'Kentucky (Somewhere In North Central Ky On I-65)': 'Covington',
    'Airmont, (Formerly Tallman)': 'Tallman',
    'Panther Creek State Park, Close To Morristown, Tennessee': 'Morristown',
    'Washington Dc (Suitland, Md)': 'Suitland',
    'Saguaro Lake (Mesa)': 'Mesa',
    'Grand Rapids - Godwin Heights': 'Grand Rapids',
    'Terre Haute (Just E Of; On Interstate 70)': 'Terre Haute',
    'Near Jemez': 'Jemez',
    ' Eldorado Hills-Placerville (Hwy 50': 'Placerville',
    'Odessa (Starkey Ranch)': 'Odessa',
    'Staton Island': 'New York City',
    'Zoar (Small Town In America)': 'Zoar',
    'Lordsburg And Silver City (Between)': 'Lordsburg',
    'Swayze': 'Swayzee',
    'Brainerd/Gull Lake': 'Brainerd',
    'Chrystal River': 'Crystal River',
    'Two Or Three Miles Northwest Of Detroit Metro Airport.': 'Detroit',
    'Seaford, Long Island': 'Long Island',
    'Bosie': 'Boise',
    'Seiverville': 'Sevierville',
    'Towsan': 'Towson',
    'Small Town Off The I-40 Not Sure Which)': 'Null',
    'Unkone': 'NaN',
    'Unsure Exactly': 'NaN',
    'Port St. Lucie': 'Port Saint Lucie',
    'De Funiak Springs': 'DeFuniak Springs',
    'South Ozone Park, Queens, Nyc': 'New York City',
    'St. Louis': 'Saint Louis',
    'Forest Hills (Queens; Nyc)': 'New York City',
    'Huntsville Alabama': 'Huntsville',
    'Idaho National Forest': 'NaN',
    'Ft Worth': 'Fort Worth',
    'Sellersville/Perkasie, Pa': 'Sellersville',
    'Mt. Vernon': 'Mount Vernon',
    'Rio Ranco': 'Rio Rancho',
    'Forest Hills (Queens)': 'New York City',
    'Lauderdale By The Sea Florida': 'Fort Lauderdale',
    'Newport Rhode Island': 'Newport',
    'Daytona Beach Shores': 'Daytona Beach',
    'Ft Pierce': 'Fort Pierce',
    'Road Above Stinson Beach': 'Stinson Beach',
    'Near The Wisconsin Dells': 'NaN',
    'South Of Zortman': 'NaN',
    'Pompano Beach Pompano': 'Pompano Beach',
    'Lakewood/Edgewater': 'Lakewood',
    'Carrabelle Florida': 'Carrabelle',
    'Interstate 80': 'NaN',
    'Blaine-Outside City Limits': 'Blaine',
    'Batesville And Floral': 'Batesville',
    'Clarkston, Washington': 'Clarkston',
    'Port St Lucie': 'Port Saint Lucie',
    'Between Faster,Al And Tuscaloosa, Al': 'Tuscaloosa',
    'St. Johns Du Quoin Address': 'Saint Johns',
    'St. George': 'Saint George',
    '21-Minutes, Flight Time, North Of Phoenix': 'Phoenix',
    'Boston Area': 'Boston',
    'Near Muhlenburg Airport': 'NaN',
    'Taylorsville, Salt Lake, Utah': 'Salt Lake City',
    'Orem, Provo': 'Orem',
    'N Las Vegas': 'Las Vegas',
    'Lafollette And Powell': 'LaFollette',
    'St. Joseph': 'Saint Joseph',
    'Rosalia, St. John, Colfax Area Of The Palouse': 'Rosalia',
    'Wentzville,': 'Wentzville',
    'Columbus, Ohio And Surrounding Areas': 'Columbus',
    'Monterey Bay/Fort Ord Rifle Range': 'Monterey Bay',
    'Woodberry Forest Preparatory School, Va': 'NaN',
    'Los Angeles (Over I-10, On Most Rural Bayou Stretch)': 'Los Angeles',
    'California Valley (Cattle Country, Not Too Distant Airbase)': 'California Valley',
    'Bayfield (Near, Lake Superior, South Shore)': 'Bayfield',
    'Between Tacoma And Gig Harbor, Wa': 'NaN',
    '20,000 Feet Over Trout Lake, Wa Looking Northwest': 'NaN',
    'Thomasville (Usaf Long Range Radar Facility)': 'Thomasville',
    'Myrtle Beach, Horry County South Carolina': 'Myrtle Beach',
    'Olyimpa': 'Olympia',
    'Vancounver': 'Vancouver',
    'Mckinelyville': 'Mckinleyville'
}})

UFO = UFO.replace({'state':{
    'Washington, DC': 'MD',
    'Cornwall': 'NaN',
    '\nMA': 'MA'
}})

In [30]:
#Entries that are too vague, or incorrectly labeled as U.S.A
UFO = UFO.drop(labels=[44, 65, 141, 536, 595, 106, 200, 449, 546, 616, 740, 906, 990, 972], axis=0)

KeyError: '[44, 65, 141, 536, 595, 106, 200, 449, 546, 616, 740, 906, 990, 972] not found in axis'

In [31]:
#Connecting to API for geocoding
geolocator = Nominatim(user_agent="MyApp")
query= 'Louisville, Ky'
results = geolocator.geocode(query)
print(results)

Louisville, Jefferson County, Kentucky, United States


In [32]:
#Geocoding (can take up to 7 minutes)
lat_list= []
long_list= []

incorrect_cities = []

for index, row in UFO.iterrows():
    
    city = row['city']
    state = row['state']
    query = str(city) + ',' + str(state)
    
    results = geolocator.geocode(query)
    
    if results:
        lat = results.latitude
        long = results.longitude
    
    else:
        incorrect_cities.append(query)
    
    lat_list.append(lat)
    long_list.append(long)
    
    time.sleep(0.1)

UFO['latitude'] = lat_list
UFO['longitude'] = long_list

In [33]:
#Checking for any incorrect cities
incorrect_cities

[]

In [34]:
#OMG IT WORKED
UFO

Unnamed: 0,city,state,country,shape,duration,summary,posted,date,time,latitude,longitude
0,Sisters,OR,USA,Cigar,4 minutes,Long and narrow illuminated craft flying high ...,5/20/21,2021-05-15,22:36,44.290948,-121.549252
1,Sarasota,FL,USA,Other,2 minutes,String of lights,5/20/21,2021-05-10,22:00,27.336581,-82.530855
2,Cleveland,TN,USA,Light,10 minutes,30+ lights in a line.,5/20/21,2021-05-05,22:35,35.159518,-84.876611
3,Galway,NY,USA,Circle,5 mins,I was standing at my kitchen sink and looked u...,8/20/21,2020-10-16,18:25,43.018686,-74.031516
4,Downingtown,PA,USA,Circle,1 hour,Multiple slow moving craft seen all over the s...,7/31/21,2021-06-18,01:00,40.006496,-75.703274
...,...,...,...,...,...,...,...,...,...,...,...
994,Haysville,KS,USA,Teardrop,5 minutes lights showing,We saw the actual ufo after she spotted a ligh...,4/22/22,2022-03-09,23:30,37.564462,-97.352267
996,Maple Shade,NJ,USA,Sphere,20 seconds,Thought it was the North Star. Same brightness...,11/21/98,1998-05-29,03:00,39.952613,-74.992391
997,Phoenix,AZ,USA,Oval,3 hours,"Oval plastic looking craft, translucent, refle...",11/21/98,1998-05-29,05:00,33.448437,-112.074141
998,Boston,MA,USA,Disk,unkown,encounter with approximatley seven saucers eac...,11/21/98,1998-06-01,14:00,42.355433,-71.060511


In [35]:
#Converting to exccel
BF.to_excel(r'C:\Users\geens\Desktop\Data\BF.xlsx', index=False, header=True)

In [36]:
#Converting to excel
UFO.to_excel(r'C:\Users\geens\Desktop\Data\UFO.xlsx', index=False, header=True)