In [1]:
import pandas as pd
import datetime
import folium

# Data loading

In [2]:
# Load the raw sightings table from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='ufo.csv')

# First of all, let's explore our dataframe

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,datetime,city,state,country,shape,duration,total_time,comments,date_posted,latitude,longitude,year,distance
0,0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700.0,45 minutes,This event took place in early fall around 194...,4/27/2004,29.883056,-97.941111,2004,1242.667772
1,1,10/10/1949 21:00,lackland afb,tx,,light,7200.0,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082,2005,1325.486319
2,2,10/10/1955 17:00,chester (uk/england),,gb,circle,20.0,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667,2008,6515.416577
3,3,10/10/1956 21:00,edna,tx,us,circle,20.0,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.978333,-96.645833,2004,1211.971352
4,4,10/10/1960 20:00,kaneohe,hi,us,light,900.0,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.418056,-157.803611,2004,6960.923396


In [4]:
# Work on an independent copy so the raw frame `df` stays untouched.
ufo = df.copy(deep=True)

In [5]:
# (rows, columns) of the working copy before any filtering.
ufo.shape

(78509, 14)

In [6]:
# Count missing values per column.
missing_mask = ufo.isna()
missing_mask.sum(axis=0)

Unnamed: 0        0
datetime          0
city              0
state          5795
country        9562
shape          1910
duration          0
total_time        0
comments         14
date_posted       0
latitude          0
longitude         0
year              0
distance          0
dtype: int64

In [7]:
# Column dtypes and non-null counts; 'deep' also measures the memory held by object (string) columns.
ufo.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78509 entries, 0 to 78508
Data columns (total 14 columns):
Unnamed: 0     78509 non-null int64
datetime       78509 non-null object
city           78509 non-null object
state          72714 non-null object
country        68947 non-null object
shape          76599 non-null object
duration       78509 non-null float64
total_time     78509 non-null object
comments       78495 non-null object
date_posted    78509 non-null object
latitude       78509 non-null float64
longitude      78509 non-null float64
year           78509 non-null int64
distance       78509 non-null float64
dtypes: float64(4), int64(2), object(8)
memory usage: 47.6 MB


In [8]:
# Summary statistics for every column, numeric and non-numeric alike.
ufo.describe(include='all')

Unnamed: 0.1,Unnamed: 0,datetime,city,state,country,shape,duration,total_time,comments,date_posted,latitude,longitude,year,distance
count,78509.0,78509,78509,72714,68947,76599,78509.0,78509,78495,78509,78509.0,78509.0,78509.0,78509.0
unique,,68186,19613,67,5,29,,8230,78179,317,,,,
top,,7/4/2010 22:00,seattle,ca,us,light,,5 minutes,Fireball,12/12/2009,,,,
freq,,36,525,9655,63399,16203,,4595,11,1476,,,,
mean,40190.587461,,,,,,9196.769,,,,38.166745,-86.780908,2007.003312,2128.202038
std,23196.723816,,,,,,627371.4,,,,10.584565,40.154765,4.559303,2332.983361
min,0.0,,,,,,0.001,,,,-82.862752,-176.658056,1998.0,12.911623
25%,20108.0,,,,,,30.0,,,,34.092222,-112.435,2003.0,827.662263
50%,40190.0,,,,,,180.0,,,,39.628333,-88.083333,2007.0,1289.726145
75%,60298.0,,,,,,600.0,,,,42.898611,-78.456667,2011.0,2835.084692


In [9]:
# List the column labels so we can decide which ones to drop next.
ufo.columns

Index(['Unnamed: 0', 'datetime', 'city', 'state', 'country', 'shape',
       'duration', 'total_time', 'comments', 'date_posted', 'latitude',
       'longitude', 'year', 'distance'],
      dtype='object')

# Now we drop the columns we do not need

In [10]:
# Columns that play no part in locating the device.
unused_columns = [
    'Unnamed: 0',
    'shape',
    'duration',
    'total_time',
    'comments',
    'date_posted',
    'year',
]
ufo = ufo.drop(columns=unused_columns)
ufo.head(n=5)

Unnamed: 0,datetime,city,state,country,latitude,longitude,distance
0,10/10/1949 20:30,san marcos,tx,us,29.883056,-97.941111,1242.667772
1,10/10/1949 21:00,lackland afb,tx,,29.38421,-98.581082,1325.486319
2,10/10/1955 17:00,chester (uk/england),,gb,53.2,-2.916667,6515.416577
3,10/10/1956 21:00,edna,tx,us,28.978333,-96.645833,1211.971352
4,10/10/1960 20:00,kaneohe,hi,us,21.418056,-157.803611,6960.923396


# "We know that the device has been active since 2004 in one city in the USA". Let's drop the rows for other countries.

In [11]:
# Distinct country codes present in the data (including missing values).
ufo['country'].unique()

array(['us', nan, 'gb', 'ca', 'au', 'de'], dtype=object)

In [12]:
# Keep only the sightings recorded in the USA.
is_us = ufo['country'] == 'us'
ufo = ufo.loc[is_us]

In [13]:
# (rows, columns) left after the country filter.
ufo.shape

(63399, 7)

# "Our only hope is to replicate a device that can block all alien technology in a radius of 300km." We therefore keep only the sightings whose distance lies between 300 and 400 km.

In [14]:
# Keep sightings whose distance lies in the closed interval [300, 400].
distance = ufo['distance']
in_band = (distance >= 300) & (distance <= 400)
ufo = ufo.loc[in_band]

In [15]:
# (rows, columns) left after the distance filter.
ufo.shape

(2151, 7)

# We convert the "datetime" column to a proper datetime type, which is easier to work with. To filter by date, we also set it as the index


In [16]:
# Parse the "datetime" strings and make them the index.
#
# Some rows use "24:00", which strptime's %H rejects. It means midnight at the
# END of the given date, so we parse it as 0:00 and then move those rows forward
# one day (plain replacement alone would shift them a day too early).
raw_datetime = ufo['datetime']
ends_at_midnight = raw_datetime.str.contains('24:00', regex=False, na=False)

parsed_datetime = pd.to_datetime(
    raw_datetime.str.replace('24:00', '0:00', regex=False),
    format='%m/%d/%Y %H:%M',
)
parsed_datetime = parsed_datetime.where(
    ~ends_at_midnight,
    parsed_datetime + pd.Timedelta(days=1),
)

# `ufo` is a filtered slice of an earlier frame: build a new frame with assign()
# instead of writing into it (avoids SettingWithCopyWarning) and skip inplace=True.
ufo = ufo.assign(datetime=parsed_datetime).set_index('datetime')

In [17]:
# Preview the frame now that it is indexed by sighting time.
ufo.head(n=5)

Unnamed: 0_level_0,city,state,country,latitude,longitude,distance
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1968-10-10 19:00:00,brevard,nc,us,35.233333,-82.734444,393.707403
1970-10-10 19:00:00,manchester,ky,us,37.153611,-83.761944,302.476051
1972-10-10 19:00:00,harlan county,ky,us,36.843056,-83.321944,330.895925
2000-10-10 03:00:00,perryville,mo,us,37.724167,-89.861111,312.173928
2000-10-10 21:30:00,florence,ky,us,38.998889,-84.626667,372.329684


# "We know that the device has been active since 2004". So let's drop the earlier years.

In [18]:
# Keep sightings from 2004-01-01 onwards.
# The datetime index is not sorted, and label slicing (`.loc['2004-01-01':]`) on a
# non-monotonic DatetimeIndex raises KeyError in pandas >= 2.0 when the key is not
# an exact index entry. A boolean mask works on any index order and keeps row order.
since_2004 = ufo.index >= pd.Timestamp('2004-01-01')
ufo = ufo.loc[since_2004]

In [19]:
# Preview the rows that remain after the date filter.
ufo.head(n=5)

Unnamed: 0_level_0,city,state,country,latitude,longitude,distance
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-10-10 04:18:00,terre haute,in,us,39.466667,-87.413889,366.596468
2006-10-10 12:37:00,blairsville,ga,us,34.876111,-83.958333,305.862395
2007-10-10 01:00:00,stockbridge,ga,us,33.544167,-84.233889,382.729821
2007-10-10 20:30:00,conyers,ga,us,33.6675,-84.017778,385.562497
2008-10-10 21:30:00,cincinnati,oh,us,39.161944,-84.456944,395.54763


In [20]:
# (rows, columns) left after the date filter.
ufo.shape

(1465, 6)

In [21]:
# Index label (timestamp) of the sighting with the smallest distance.
ufo['distance'].idxmin()

Timestamp('2007-02-05 12:30:00')

In [22]:
# Show the row(s) of the sighting with the smallest distance.
# Rows must be selected through .loc: `ufo['<timestamp string>']` is a column
# lookup in pandas >= 2.0 and raises KeyError. The label comes from idxmin()
# rather than a hard-coded timestamp, and wrapping it in a list keeps the
# result a DataFrame.
ufo.loc[[ufo['distance'].idxmin()]]

Unnamed: 0_level_0,city,state,country,latitude,longitude,distance
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2007-02-05 12:30:00,new tazewell,tn,us,36.4425,-83.599722,300.016444


# Map representation

In [23]:
# Centre the map on the sighting with the smallest distance instead of
# hard-coding its coordinates.
closest = ufo.loc[[ufo['distance'].idxmin()]].iloc[0]
center = [float(closest['latitude']), float(closest['longitude'])]

# `tile=` is not a folium.Map parameter (the real one is `tiles`, and it names a
# tileset, not a place), so the default tiles are used and the place name goes
# on a marker popup. The variable is not called `map`, which would shadow the builtin.
ufo_map = folium.Map(location=center, zoom_start=12)
folium.Marker(location=center, popup=str(closest['city']).title()).add_to(ufo_map)
ufo_map