In [3]:
#Import packages
import pandas as pd
import numpy as np
import os
import geocoder
from geopy.geocoders import Nominatim

In [4]:
#Read in BF locations data
BF_Locations = pd.read_csv(os.path.join("https://download.data.world/s/a5gdzsmxenwcwqmnbjaaizbfk4uea3"))
BF_Reports = pd.read_csv(os.path.join("https://download.data.world/s/td52m5guv2cpruwjo6pr2cd3fbk5cx"))

In [5]:
#Read in UFO data
UFO = pd.read_csv(os.path.join("https://download.data.world/s/lobqhnvklhbppb4ayke4b23lsupg57"))

In [6]:
#Change column name in BF locations dataframe
BF_Locations = BF_Locations.rename(columns={'number': 'report_number'})

In [7]:
#Merge two BF datasets
BF = pd.merge(BF_Locations, BF_Reports, how='left', left_on='report_number', right_on='report_number')

In [8]:
#Split 'title' column by report number and report
BF[['report_number2', 'report']] = BF.pop('title').str.split(pat=':', n=1, expand=True)

In [9]:
#Reformatting timestamp column
BF['timestamp'] = pd.to_datetime(BF['timestamp'], format = "%Y-%m-%dT%H:%M:%S.%f")

In [10]:
#Splitting timestamp column
BF['report_date'] = BF['timestamp'].dt.date
BF['time'] = BF['timestamp'].dt.time

In [11]:
#Removing unnecessary columns
BF = BF.drop(['timestamp', 'year', 'season', 'report_class', 
              'month', 'date', 'a_g_references', 'time', 'report_number2'], axis=1)

In [12]:
#Drop unnecessary columns
UFO = UFO.drop(['stats', 'report_link', 'text'], axis = 1)

In [13]:
#Remove all countries other than USA
UFO = UFO.loc[UFO["country"] == "USA"]

In [14]:
#Split 'date_time' column
UFO[['date', 'time']] = UFO.pop('date_time').str.split(pat=' ', n=1, expand=True)

In [15]:
#Changing dates to datetime format
BF['report_date'] = pd.to_datetime(BF['report_date'], errors='coerce')
UFO['date'] = pd.to_datetime(UFO['date'], errors='coerce')

In [16]:
#Dropping null values
UFO = UFO[UFO['date'].notna()]

In [17]:
UFO.head()

Unnamed: 0,city,state,country,shape,duration,summary,posted,date,time
0,Sisters,OR,USA,Cigar,4 minutes,Long and narrow illuminated craft flying high ...,5/20/21,2021-05-15,22:36
1,Sarasota,FL,USA,Other,2 minutes,String of lights,5/20/21,2021-05-10,22:00
2,Cleveland,TN,USA,Light,10 minutes,30+ lights in a line.,5/20/21,2021-05-05,22:35
3,Galway,NY,USA,Circle,5 mins,I was standing at my kitchen sink and looked u...,8/20/21,2020-10-16,18:25
4,Downingtown,PA,USA,Circle,1 hour,Multiple slow moving craft seen all over the s...,7/31/21,2021-06-18,01:00


In [18]:
UFO.dtypes

city                object
state               object
country             object
shape               object
duration            object
summary             object
posted              object
date        datetime64[ns]
time                object
dtype: object

In [24]:
geolocator = Nominatim(user_agent="MyApp")
query= 'Louisville, Ky'
results = geolocator.geocode(query)
print(results)

Louisville, Jefferson County, Kentucky, United States


In [27]:
lat_list= []
long_list= []

for index, row in UFO.iterrows():
    
    city = row['city']
    state = row['state']
    query = str(city) + ',' + str(state)
    
    results = geolocator.geocode(query)
    lat = results.latitude
    long = results.longitude
    
    lat_list.append(lat)
    long_list.append(long)

UFO['latitude'] = lat_list
UFO['longitude'] = long_list

AttributeError: 'NoneType' object has no attribute 'latitude'

In [42]:
#Due to the size of the dataframe, I am only mapping reports starting on 1/1/1970
UFO = UFO[~(UFO['date'] < '1/1/1970')]
BF = BF[~(BF['report_date'] < '1970-01-01')]

In [43]:
#Converting to exccel
BF.to_excel(r'C:\Users\geens\Desktop\Data\BF.xlsx', index=False, header=True)

In [44]:
#Converting to excel
UFO.to_excel(r'C:\Users\geens\Desktop\Data\UFO.xlsx', index=False, header=True)