In [1]:
from datetime import timedelta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import reverse_geocoder as rg
from funpymodeling.exploratory import freq_tbl, status, profiling_num, cat_vars, num_vars
%matplotlib inline

In [2]:
# Load the scrubbed sightings export into the main working frame.
# low_memory=False has pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
df_aliens = pd.read_csv(
    '../raw_data/scrubbed.csv',
    low_memory=False,
)

# Cleaning the locations portion of the original alien scrubbed dataset

## Reading and cleaning

In [3]:
# Preview the first rows of the raw sightings frame.
df_aliens.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [4]:
# Per-column summary from funpymodeling: NaN/zero counts and shares, unique values and dtype.
status(df_aliens)

Unnamed: 0,variable,q_nan,p_nan,q_zeros,p_zeros,unique,type
0,datetime,0,0.0,0,0.0,69586,object
1,city,0,0.0,0,0.0,19900,object
2,state,5797,0.072163,0,0.0,67,object
3,country,9670,0.120375,0,0.0,5,object
4,shape,1932,0.02405,0,0.0,29,object
5,duration (seconds),0,0.0,0,0.0,537,object
6,duration (hours/min),0,0.0,0,0.0,8349,object
7,comments,15,0.000187,0,0.0,79997,object
8,date posted,0,0.0,0,0.0,317,object
9,latitude,0,0.0,0,0.0,18445,object


In [5]:
# Column dtypes and non-null counts (note that latitude is reported as object, not float).
df_aliens.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80332 entries, 0 to 80331
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   datetime              80332 non-null  object 
 1   city                  80332 non-null  object 
 2   state                 74535 non-null  object 
 3   country               70662 non-null  object 
 4   shape                 78400 non-null  object 
 5   duration (seconds)    80332 non-null  object 
 6   duration (hours/min)  80332 non-null  object 
 7   comments              80317 non-null  object 
 8   date posted           80332 non-null  object 
 9   latitude              80332 non-null  object 
 10  longitude             80332 non-null  float64
dtypes: float64(1), object(10)
memory usage: 6.7+ MB


In [6]:
# Find the rows whose latitude text contains a letter, i.e. is not a plain number

letters_in_latitude = df_aliens['latitude'].str.contains('[A-Za-z]', na=False)
df_aliens[letters_in_latitude]

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
43782,5/22/1974 05:30,mescalero indian reservation,nm,,rectangle,180,two hours,Huge rectangular object emmitting intense whit...,4/18/2012,33q.200088,-105.624152


In [8]:
# Keep the offending rows in X and read off the index label of the first one
bad_latitude = df_aliens['latitude'].str.contains('[A-Za-z]', na=False)
X = df_aliens.loc[bad_latitude]
X.index[0]

43782

In [None]:
# Removing rows with malformed latitude values and converting the coordinates to float

# Identify rows whose latitude cannot be parsed as a number (e.g. '33q.200088').
# Using a mask instead of a hard-coded index label keeps this cell safe to re-run:
# after reset_index the old label would point at a different, valid row.
latitude_is_numeric = pd.to_numeric(df_aliens['latitude'], errors='coerce').notna()
df_aliens = df_aliens.loc[latitude_is_numeric].reset_index(drop=True)

df_aliens['latitude'] = df_aliens['latitude'].astype(float)
# The raw column is addressed as 'longitude ' (with a trailing space) throughout this
# notebook; a float copy is stored under the clean name 'longitude'.
df_aliens['longitude'] = df_aliens['longitude '].astype(float)

## Reverse geocode

In [None]:
# reverse_geocoder is already imported as rg in the notebook's import cell,
# so it is not re-imported here.
# Pair latitude and longitude in row order and resolve all pairs in a single search call.
coordinates = list(zip(df_aliens['latitude'], df_aliens['longitude']))
results = rg.search(coordinates)

In [None]:
# Spot-check the original location fields of the row labelled 1
spot_check_columns = ['latitude', 'longitude ', 'city', 'state', 'country']
df_aliens.loc[1, spot_check_columns]

In [None]:
# Turn the geocoder results into a frame and copy three of its fields onto df_aliens.
# The assignment aligns on index labels, so it presumes df_aliens carries the same
# 0..n-1 index as results_df.
results_df = pd.DataFrame(results)
geocoder_to_sightings = (
    ('cc', 'country_c'),
    ('name', 'city_c'),
    ('admin1', 'state_c'),
)
for geocoder_field, sightings_column in geocoder_to_sightings:
    df_aliens[sightings_column] = results_df[geocoder_field]

In [None]:
# Original location fields next to their reverse-geocoded counterparts
comparison_columns = ['country', 'country_c', 'state', 'state_c', 'city', 'city_c']
df_aliens[comparison_columns]

In [None]:
# Index labels of rows whose state_c is an empty string
# (NaN entries are excluded by the notnull test, so only '' is detected)
state_is_blank = df_aliens['state_c'].notnull() & (df_aliens['state_c'] == '')
df_aliens[state_is_blank].index

In [None]:
# Where state_c is empty, fall back to the geocoded city name:
# '' is first turned into NaN so that fillna can pick up city_c for those rows.
df_aliens['state_c'] = (
    df_aliens['state_c']
    .replace({'': np.nan})
    .fillna(df_aliens['city_c'])
)

In [None]:
# Index labels of rows whose country_c is an empty string (NaN entries are excluded)
# author's note: not a problem!
country_is_blank = df_aliens['country_c'].notnull() & (df_aliens['country_c'] == '')
df_aliens[country_is_blank].index

In [None]:
# Index labels of rows whose city_c is an empty string (NaN entries are excluded)
# author's note: not a problem!
city_is_blank = df_aliens['city_c'].notnull() & (df_aliens['city_c'] == '')
df_aliens[city_is_blank].index

## Drop the original city, state and country values

In [None]:
# Drop the raw 'longitude ' column (trailing space), the free-text comments and the
# original city/state/country fields; the float 'longitude' and the *_c columns remain.
superseded_columns = ['longitude ', 'comments', 'city', 'state', 'country']
df_aliens_filtered = df_aliens.drop(columns=superseded_columns)

In [None]:
# Restrict to sightings whose reverse-geocoded country code is 'US'
in_us = df_aliens_filtered['country_c'] == 'US'
df_aliens_filtered = df_aliens_filtered[in_us]

## End product

In [None]:
# Preview the cleaned, US-only sightings frame.
df_aliens_filtered.head()

# Cleaning the worldcities location/population dataset - NOT IN USE

In [None]:
# Load the world cities location/population table
df_loc = pd.read_csv(
    '../raw_data/worldcities.csv',
    low_memory=False,
)

In [None]:
# Preview the first rows of the world cities table.
df_loc.head()

In [None]:
# Column dtypes and non-null counts of the world cities table.
df_loc.info()

In [None]:
# NOTE(review): repeats the df_loc.info() call from the preceding cell.
df_loc.info()

In [None]:
# Index labels of rows whose iso2 is an empty string (NaN entries are excluded)
# author's note: not a problem!
iso2_is_blank = df_loc['iso2'].notnull() & (df_loc['iso2'] == '')
df_loc[iso2_is_blank].index

In [None]:
# Index labels of rows whose population equals an empty string (NaN entries are excluded)
# author's note: not a problem!
population_is_blank = df_loc['population'].notnull() & (df_loc['population'] == '')
df_loc[population_is_blank].index

In [None]:
# Dropping columns lat, lng, city_ascii, country, iso3, admin_name, capital, id
# (the city column itself is kept)
unused_loc_columns = ['lat', 'lng', 'city_ascii', 'country', 'iso3', 'admin_name', 'capital', 'id']
df_loc_filtered = df_loc.drop(columns=unused_loc_columns)

In [None]:
# Rename the ISO-2 country code column to country_p, then preview the trimmed table.
# NOTE(review): df_loc is read from worldcities.csv and no country filter is applied
# in the visible cells, so this frame does not appear to be US-only.
df_loc_filtered = df_loc_filtered.rename(columns={'iso2': 'country_p'})
df_loc_filtered.head()

In [None]:
# (rows, columns) of the trimmed world cities table.
df_loc_filtered.shape

# Cleaning the US cities database

## Reading and cleaning

In [None]:
# Load the US cities table
df_us_cities = pd.read_csv(
    '../raw_data/uscities.csv',
    low_memory=False,
)

# Disabled column projection/rename, kept for reference:
#df_us_cities= df_us_cities[['State', 'City', 'Population']].rename(columns = {'State': 'state_c','City':'city_c'})
df_us_cities.head()

# Plotting

In [None]:
import folium
from folium.plugins import HeatMap

# Heat map of sightings overlaid with population-scaled circles for cities above 100,000 people.
# The map is centred on the coordinates of the first sighting row (San Marcos, TX).
center_location = 29.8830556, -97.9411111
m = folium.Map(location=center_location, control_scale=True, zoom_start=3)

# .assign returns a new frame, so the helper column is not written into a slice of
# df_aliens_filtered (which would raise SettingWithCopyWarning).
location_df = df_aliens_filtered[['latitude', 'longitude']].assign(count=1)
population_df = df_us_cities[['lat', 'lng', 'population']]
population_df = population_df[population_df['population'] > 100000].sort_values(by=['population'])

# One [latitude, longitude, number_of_sightings] entry per distinct coordinate pair.
heatmap_data = location_df.groupby(['latitude', 'longitude']).sum().reset_index().values.tolist()
gradient = {0.2: 'blue', 0.4: 'lime', 0.6: 'orange', 1: 'red'}
HeatMap(data=heatmap_data, radius=5, gradient=gradient, max_zoom=13).add_to(m)

# itertuples walks the rows once instead of building a Series per .iloc lookup.
for city in population_df.itertuples(index=False):
    folium.Circle(
        location=[city.lat, city.lng],
        radius=float(city.population) * (1 / 100),  # circle radius proportional to population
        color='crimson',
        fill=True,
        fill_color='crimson',
    ).add_to(m)

m

In [None]:
# Markers for the first 1000 sightings plus population-scaled circles for cities
# above 10,000 people (at most the 10,000 smallest of them, given the ascending sort).
center_location = 29.8830556, -97.9411111
m = folium.Map(location=center_location, control_scale=True, zoom_start=3)

location_df = df_aliens_filtered[['latitude', 'longitude', 'city_c']]
population_df = df_us_cities[['lat', 'lng', 'population']]
population_df = population_df[population_df['population'] > 10000].sort_values(by=['population'])

# itertuples walks the rows once instead of building a Series per .iloc lookup.
for sighting in location_df.head(1000).itertuples(index=False):
    folium.Marker(
        location=[sighting.latitude, sighting.longitude],
        popup=sighting.city_c,
    ).add_to(m)


for city in population_df.head(10000).itertuples(index=False):
    folium.Circle(
        location=[city.lat, city.lng],
        radius=float(city.population) * (1 / 50),  # circle radius proportional to population
        color='crimson',
        fill=True,
        fill_color='crimson',
    ).add_to(m)

m

In [None]:
# Heat map of sightings with a marker on every city flagged military == True.
center_location = 29.8830556, -97.9411111
m = folium.Map(location=center_location, control_scale=True, zoom_start=3)

# .assign returns a new frame, so the helper column is not written into a slice of
# df_aliens_filtered (which would raise SettingWithCopyWarning).
location_df = df_aliens_filtered[['latitude', 'longitude']].assign(count=1)
population_df = df_us_cities[['lat', 'lng', 'population', 'city', 'military']]
population_df = population_df[population_df['military'] == True]

# One [latitude, longitude, number_of_sightings] entry per distinct coordinate pair.
heatmap_data = location_df.groupby(['latitude', 'longitude']).sum().reset_index().values.tolist()
gradient = {0.2: 'blue', 0.4: 'lime', 0.6: 'orange', 1: 'red'}
HeatMap(data=heatmap_data, radius=5, gradient=gradient, max_zoom=13).add_to(m)

# itertuples walks the rows once instead of building a Series per .iloc lookup.
for city in population_df.itertuples(index=False):
    folium.Marker(
        location=[city.lat, city.lng],
        popup=city.city,
    ).add_to(m)

m

In [None]:
# Preview the US-only sightings frame again before building the merge inputs.
df_aliens_filtered.head()

In [None]:
# Flag every sighting row with sightinghs=True. assign() builds a new frame, which avoids
# the SettingWithCopyWarning raised when a column is set on the filtered slice.
df_aliens_filtered = df_aliens_filtered.assign(sightinghs=True)

# Sightings side of the merge: geocoded city (renamed to 'city'), coordinates and the flag.
df_aliens_3 = df_aliens_filtered.rename(columns={'city_c': 'city'})
df_aliens_3 = df_aliens_3[['city', 'latitude', 'longitude', 'sightinghs']]

# Cities side of the merge: name, military flag and population.
df_cities = df_us_cities[['city', 'military', 'population']]

In [None]:
# Outer-join cities and sightings on the city name.
# NOTE(review): the join key is the name only, so same-named cities would be matched
# with each other's sightings — confirm whether state should be part of the key.
merged = pd.merge(df_cities, df_aliens_3, on='city', how='outer')

# NOTE(review): at this point sightinghs holds only True (matched sighting) or NaN
# (city without sighting), so this filter keeps every row — confirm the intent.
merged = merged[merged["sightinghs"] != 0]

# Turn the flag into a numeric indicator: 1 where a sighting matched, 0 otherwise.
# The original discarded the fillna result and read from an undefined name `df`.
merged['sightinghs'] = merged['sightinghs'].eq(True).astype(int)

In [None]:
# Pairwise correlation of the numeric/boolean columns only. The frame also holds the text
# column 'city', on which newer pandas versions raise instead of silently skipping it.
merged.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Pairwise scatter/distribution grid of the merged frame.
sns.pairplot(merged)
# seaborn no longer re-exports pyplot as sns.plt; use the plt imported in the first cell.
plt.show()