# WeatherPy
----

#### Note
* Instructions have been included for each segment. You do not have to follow them exactly, but they are included to help you think through the steps.

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
import math
from scipy.stats import linregress

# Import API keys
from api_keys import weather_api_key
from api_keys import g_key

# Incorporated citipy to determine city based on latitude and longitude
from citipy import citipy

# Output File (CSV)
output_data_file = "output_data/cities.csv"

# Range of latitudes and longitudes
lat_range = (-90, 90)
lng_range = (-180, 180)

## Generate Cities List

In [2]:
# List for holding lat_lngs and cities
lat_lngs = []
success_lats = []
success_lngs = []
cities = []
city_names = []
countries = []
city_country = []

# Create a set of random lat and lng combinations
lats = np.random.uniform(lat_range[0], lat_range[1], size=3500)
lngs = np.random.uniform(lng_range[0], lng_range[1], size=3500)
lat_lngs = zip(lats, lngs)

# Identify nearest city for each lat, lng combination
for lat_lng in lat_lngs:
    city = citipy.nearest_city(lat_lng[0], lat_lng[1]) #.city_name
    
    # If the city is unique, then add it to the cities list
    if city not in cities:
        cities.append(city)
        city_names.append(city.city_name)
        countries.append(city.country_code)
        success_lats.append(lat_lng[0])
        success_lngs.append(lat_lng[1])

# city_country = zip(city_names, countries)
# city_country = list(map(list, city_country))


# Print the city count to confirm sufficient count
print(f"The number randomly generated list of cities = {len(cities)}")
print(f"The number randomly generated list of city names = {len(city_names)}")
print(f"The number randomly generated list of countries = {len(countries)}")
print(f"The number randomly generated list of latitude coordinates = {len(success_lats)}")
print(f"The number randomly generated list of longitude coordinates = {len(success_lngs)}")
# print(f"The number randomly generated list of city-country names = {len(city_country)}")
print()
print(f"The number randomly generated list of unique countries = {len(list(set(countries)))}")

# print(success_lats)

The number randomly generated list of cities = 1125
The number randomly generated list of city names = 1125
The number randomly generated list of countries = 1125
The number randomly generated list of latitude coordinates = 1125
The number randomly generated list of longitude coordinates = 1125

The number randomly generated list of unique countries = 150


In [3]:
cities_data = {"City": city_names, "Country": countries, "Search Lats": success_lats, "Search Lngs": success_lngs}
cities_df = pd.DataFrame(cities_data)
cities_df["City-Country"] = cities_df[["City", "Country"]].apply(lambda x: ', '.join(x[x.notnull()]), axis = 1)
cities_df = cities_df[["City-Country", "City", "Country", "Search Lats", "Search Lngs"]]

cities_df.head()

Unnamed: 0,City-Country,City,Country,Search Lats,Search Lngs
0,"punta arenas, cl",punta arenas,cl,-86.101579,-97.374532
1,"taolanaro, mg",taolanaro,mg,-52.88888,58.505838
2,"ha tinh, vn",ha tinh,vn,18.989508,106.487704
3,"longyearbyen, sj",longyearbyen,sj,81.758946,21.492339
4,"burnie, au",burnie,au,-40.588201,144.139729


In [4]:
len(cities_df)

1125

In [5]:
# In the starter code example, 1,500 sets of randomly chosen latitude and longitude 
# yielded 635 unique city names.  That means that 58% of randomly chosen lat-lng 
# coordinates were duplicates and were rejected, if I read the documentation correctly.

In [6]:
# Since 2/3 of the surface area of the globe is water, there is a likelihood that 2/3 
# of the choices made by randomly selecting geocoordinates will be somewhere other than 
# on land, which means that at least 2/3 of the cities selected by using random 
# coordinates will be clustered on shorelines.  Bottom line, the real task is to randomly
# select geocoordinates only for the 1/3 of the planet surface that is land.

# My solution is to reject any cities that are greater than 15 miles from the randomly
# chosen geocoordinates.  This requires me to measure the distance from the randomly 
# chosen geocoordinates to the nearest city selected.  I may need to play with the method
# further to eyeball the best maximum distance.  With a highly iterative method, I could
# perhaps get maximum approximate equal distribution between cities to minimize any 
# clustering tendencies. But I'm going to settle for an eyeball check on top of a
# pretty decent methodology.

# This methodology is likely also to skew away from cities in more remote locations.
# That said, our formula is still likely to be fairly representative.

In [7]:
# List of cities for which will will try to retrieve geocoordinates
cities_input = cities_df["City-Country"].tolist()

# ****NOTE IN LINE BELOW THAT head() IN FRONT OF .tolist() AFFECTS SIZE OF LISTS BUT IS USEFUL FOR TEST PURPOSES****
# cities_input = cities_df["City-Country"].head(30).tolist()

cities_input

['punta arenas, cl',
 'taolanaro, mg',
 'ha tinh, vn',
 'longyearbyen, sj',
 'burnie, au',
 'basco, ph',
 'mataura, pf',
 'kuching, my',
 'cape town, za',
 'rikitea, pf',
 'nizhneyansk, ru',
 'new norfolk, au',
 'butaritari, ki',
 'mozarlandia, br',
 'tuktoyaktuk, ca',
 'hermanus, za',
 'comodoro rivadavia, ar',
 'castro, cl',
 'hilo, us',
 'auki, sb',
 'vaini, to',
 'sechura, pe',
 'albany, au',
 'sioux lookout, ca',
 'port hedland, au',
 'nikolskoye, ru',
 'busselton, au',
 'tasiilaq, gl',
 'yulara, au',
 'safaga, eg',
 'shimoda, jp',
 'inverell, au',
 'kapaa, us',
 'narsaq, gl',
 'pisco, pe',
 'baleshwar, in',
 'noumea, nc',
 'hamilton, bm',
 'pacifica, us',
 'chokurdakh, ru',
 'prieska, za',
 'torbay, ca',
 'san cristobal, ec',
 'itarema, br',
 'dwarka, in',
 'kavieng, pg',
 'bethel, us',
 'mahebourg, mu',
 'clyde river, ca',
 'tumannyy, ru',
 'kedrovyy, ru',
 'barentsburg, sj',
 'qaanaaq, gl',
 'hobart, au',
 'miraflores, co',
 'vaitupu, wf',
 'puerto ayora, ec',
 'andros town, bs

In [8]:
# Set empty lists to hold characters height and mass
target_cities = []
target_lats = []
target_lngs = []

# Loop through each character
for city1 in cities_input:
    
   # Try to extract latitude and longitude
    try:
        # Set url for API
        target_url = f'https://maps.googleapis.com/maps/api/geocode/json?address={city1}&key={g_key}'
        
        # Run a request to endpoint and convert result to json
        geo_data = requests.get(target_url).json()
        
        # Pause a few seconds to allow for processing delays
        time.sleep(1) 
        
        # Append returned latitude, longitude, and city
        target_lats.append(geo_data["results"][0]["geometry"]["location"]["lat"])
        target_lngs.append(geo_data["results"][0]["geometry"]["location"]["lng"])
        target_cities.append(city1) # THE ORDER OF THIS APPEND IS CRITICAL!!!
        
        # Printed as Check
        # print(f"{city1} found! Appending longitude and latitude")
        
    # Handle exceptions for cities whose geocoordiantes are not returned in the Google API
    except:
        # Append null values
        print(f"City {city1} not found")
        pass 

City constitucion, mx not found
City bumba, cd not found
City seda, lv not found
City bur gabo, so not found
City esso, ru not found
City bondo, cd not found
City sosnovskiy, ru not found
City mao, td not found
City ati, td not found
City roma, au not found
City lolua, tv not found
City laguna, br not found
City puebloviejo, ec not found
City naze, jp not found
City sola, vu not found
City yandoon, mm not found
City khani, ru not found
City dauriya, ru not found
City senno, by not found
City labuhan, id not found
City aden, ye not found
City mon, in not found
City latung, ph not found
City vostok, ru not found
City providencia, mx not found
City saint-georges, gf not found
City harer, et not found
City wajid, so not found
City pimentel, pe not found
City fare, pf not found
City bargal, so not found
City ibra, om not found
City silvan, tr not found
City lapi, ph not found
City kirakira, sb not found
City dicabisagan, ph not found
City mumford, gh not found
City kleck, by not found
City 

In [9]:
print(f"Original Input Cities = {len(cities_input)}")
print(f"Cities Not Found = {len(cities_input) - len(target_cities)}")
print()
print(f"Target Cities = {len(target_cities)}")
print(f"Target Latitutdes = {len(target_lats)}")
print(f"Target Longitudes = {len(target_lngs)}")

Original Input Cities = 1125
Cities Not Found = 57

Target Cities = 1068
Target Latitutdes = 1068
Target Longitudes = 1068


In [10]:
# Populate new dataframe with extracted with successfully extracted city-county, latitude and longitude data
cities_data2 = {"City-Country": target_cities, "Actual Lats": target_lats, "Actual Lngs": target_lngs}
cities_df2 = pd.DataFrame(cities_data2)

cities_df2.head()

Unnamed: 0,City-Country,Actual Lats,Actual Lngs
0,"punta arenas, cl",-53.163833,-70.917068
1,"taolanaro, mg",-25.022531,46.985369
2,"ha tinh, vn",18.294378,105.674525
3,"longyearbyen, sj",78.223172,15.626723
4,"burnie, au",-41.052858,145.905217


In [11]:
len(cities_df2)

1068

In [12]:
# Proof no duplicates were entered
cities_dedupe_df = cities_df.drop_duplicates(subset="City-Country")
cities_dedupe_df2 = cities_df2.drop_duplicates(subset="City-Country")

print(f"Number of records removed by deduplicaiton of Nearest City Search DF = {len(cities_df) - len(cities_dedupe_df)}")
print(f"Number of records removed by deduplicaiton of City Geocoordinates DF = {len(cities_df2) - len(cities_dedupe_df2)}")

Number of records removed by deduplicaiton of Nearest City Search DF = 0
Number of records removed by deduplicaiton of City Geocoordinates DF = 0


In [13]:
df1 = cities_df
df2 = cities_df2
df = pd.merge(df1, df2, on="City-Country")

# Alternative syntax
# df = df1.merge(df2, on="City-Country")

df.head()

Unnamed: 0,City-Country,City,Country,Search Lats,Search Lngs,Actual Lats,Actual Lngs
0,"punta arenas, cl",punta arenas,cl,-86.101579,-97.374532,-53.163833,-70.917068
1,"taolanaro, mg",taolanaro,mg,-52.88888,58.505838,-25.022531,46.985369
2,"ha tinh, vn",ha tinh,vn,18.989508,106.487704,18.294378,105.674525
3,"longyearbyen, sj",longyearbyen,sj,81.758946,21.492339,78.223172,15.626723
4,"burnie, au",burnie,au,-40.588201,144.139729,-41.052858,145.905217


In [14]:
# Inner merge successful
len(df)

1068

In [15]:
df.dtypes

City-Country     object
City             object
Country          object
Search Lats     float64
Search Lngs     float64
Actual Lats     float64
Actual Lngs     float64
dtype: object

# The trigonometry underlying the Haversine Formula
![Trigonometry Explanation](../Images/trig_1.jpg)

In [16]:
df_z = df.head(2)
df_z

Unnamed: 0,City-Country,City,Country,Search Lats,Search Lngs,Actual Lats,Actual Lngs
0,"punta arenas, cl",punta arenas,cl,-86.101579,-97.374532,-53.163833,-70.917068
1,"taolanaro, mg",taolanaro,mg,-52.88888,58.505838,-25.022531,46.985369


In [17]:
# Create a list to capture distance calculations
distance_list = []

# radius of the Earth, 3,958.8 mi (6373.0 km)
# presumably a blend of polar and equatorial radii
R = 3958.8

for index, row in df.iterrows():
    
    # coordinates - Used to search for Nearest City
    lat1 = math.radians(row["Search Lats"])
    lon1 = math.radians(row["Search Lngs"])
    
    # coordinates - Actual for City Selected
    lat2 = math.radians(row["Actual Lats"])
    lon2 = math.radians(row["Search Lngs"])
    
    # change in coordinates
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    
    # Haversine formula
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    
    distance = R * c
    
    distance_list.append(distance)

# print(distance_list)

In [18]:
df["Distance (mi)"] = distance_list
df.head()

Unnamed: 0,City-Country,City,Country,Search Lats,Search Lngs,Actual Lats,Actual Lngs,Distance (mi)
0,"punta arenas, cl",punta arenas,cl,-86.101579,-97.374532,-53.163833,-70.917068,2275.803709
1,"taolanaro, mg",taolanaro,mg,-52.88888,58.505838,-25.022531,46.985369,1925.400171
2,"ha tinh, vn",ha tinh,vn,18.989508,106.487704,18.294378,105.674525,48.029436
3,"longyearbyen, sj",longyearbyen,sj,81.758946,21.492339,78.223172,15.626723,244.301118
4,"burnie, au",burnie,au,-40.588201,144.139729,-41.052858,145.905217,32.105098


In [19]:
# Couunt the number of cities selected are in the combined data set
len(df)

1068

In [23]:
# Count the number of cities selected are within 50 miles
len(df.loc[df["Distance (mi)"] <= 60])

513

In [24]:
# The original search for nearest city involved 1,500 random geocoordinates.  This yielded just over 
# 200 cities that were less than or equal to 60 miles from the original random geocoordinates.  As a 
# result, we # expanded our random geocoordinates set by a factor of 2.5x or higher to increase the 
# odds of selecting at least 500 target cities that were within the maximum 60 mile search radius. So, 
# we reran the next time with 4,000 random geocoordinates.

# That involved a roughly hour-long series of API requests!  I cannot imagine there's isn't a better
# way.  I haven't sufficiently researched the Google API documentation to learn how to batch my
# requests and perhaps save time.

In [27]:
# Remove rows that have Distance values greater than 60
cities_60mi_df = df[~(df["Distance (mi)"] > 60)]
cities_60mi_df.head()

Unnamed: 0,City-Country,City,Country,Search Lats,Search Lngs,Actual Lats,Actual Lngs,Distance (mi)
2,"ha tinh, vn",ha tinh,vn,18.989508,106.487704,18.294378,105.674525,48.029436
4,"burnie, au",burnie,au,-40.588201,144.139729,-41.052858,145.905217,32.105098
13,"mozarlandia, br",mozarlandia,br,-15.105019,-50.989055,-14.746144,-50.571775,24.796092
29,"safaga, eg",safaga,eg,26.685563,35.413631,26.750017,33.935976,4.453396
31,"inverell, au",inverell,au,-29.679041,151.993691,-29.762751,151.110282,5.783863


### Perform API Calls
* Perform a weather check on each city using a series of successive API calls.
* Include a print log of each city as it'sbeing processed (with the city number and city name).


### Convert Raw Data to DataFrame
* Export the city data into a .csv.
* Display the DataFrame

## Inspect the data and remove the cities where the humidity > 100%.
----
Skip this step if there are no cities that have humidity > 100%. 

In [None]:
#  Get the indices of cities that have humidity over 100%.


In [None]:
# Make a new DataFrame equal to the city data to drop all humidity outliers by index.
# Passing "inplace=False" will make a copy of the city_data DataFrame, which we call "clean_city_data".


## Plotting the Data
* Use proper labeling of the plots using plot titles (including date of analysis) and axes labels.
* Save the plotted figures as .pngs.

## Latitude vs. Temperature Plot

## Latitude vs. Humidity Plot

## Latitude vs. Cloudiness Plot

## Latitude vs. Wind Speed Plot

## Linear Regression

####  Northern Hemisphere - Max Temp vs. Latitude Linear Regression

####  Southern Hemisphere - Max Temp vs. Latitude Linear Regression

####  Northern Hemisphere - Humidity (%) vs. Latitude Linear Regression

####  Southern Hemisphere - Humidity (%) vs. Latitude Linear Regression

####  Northern Hemisphere - Cloudiness (%) vs. Latitude Linear Regression

####  Southern Hemisphere - Cloudiness (%) vs. Latitude Linear Regression

####  Northern Hemisphere - Wind Speed (mph) vs. Latitude Linear Regression

####  Southern Hemisphere - Wind Speed (mph) vs. Latitude Linear Regression