# WeatherPy
----

#### Note
* Instructions have been included for each segment. You do not have to follow them exactly, but they are included to help you think through the steps.

In [1]:
# Dependencies and Setup
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
import calendar
import math
from datetime import date
from scipy.stats import linregress

# Import API keys
from api_keys import weather_api_key
from api_keys import g_key

# Incorporated citipy to determine city based on latitude and longitude
from citipy import citipy

# Output File (CSV)
output_data_file = "../output_data/cities.csv"

# Range of latitudes and longitudes
lat_range = (-90, 90)
lng_range = (-180, 180)

## Generate Cities List

In [2]:
# List for holding lat_lngs and cities
lat_lngs = []
success_lats = []
success_lngs = []
cities = []
city_names = []
countries = []
city_country = []

# Create a set of random lat and lng combinations
lats = np.random.uniform(lat_range[0], lat_range[1], size=3500)
lngs = np.random.uniform(lng_range[0], lng_range[1], size=3500)
lat_lngs = zip(lats, lngs)

# Identify nearest city for each lat, lng combination
for lat_lng in lat_lngs:
    city = citipy.nearest_city(lat_lng[0], lat_lng[1]) #.city_name
    
    # If the city is unique, then add it to the cities list
    if city not in cities:
        cities.append(city)
        city_names.append(city.city_name)
        countries.append(city.country_code)
        success_lats.append(lat_lng[0])
        success_lngs.append(lat_lng[1])

# Print the city count to confirm sufficient count
print(f"The number randomly generated list of cities = {len(cities)}")
print(f"The number randomly generated list of city names = {len(city_names)}")
print(f"The number randomly generated list of countries = {len(countries)}")
print(f"The number randomly generated list of latitude coordinates = {len(success_lats)}")
print(f"The number randomly generated list of longitude coordinates = {len(success_lngs)}")
# print(f"The number randomly generated list of city-country names = {len(city_country)}")
print()
print(f"The number randomly generated list of unique countries = {len(list(set(countries)))}")

# print(success_lats)

The number randomly generated list of cities = 1147
The number randomly generated list of city names = 1147
The number randomly generated list of countries = 1147
The number randomly generated list of latitude coordinates = 1147
The number randomly generated list of longitude coordinates = 1147

The number randomly generated list of unique countries = 154


In [3]:
cities_data = {"City": city_names, "Country": countries, "Search Lats": success_lats, "Search Lngs": success_lngs}
cities_df = pd.DataFrame(cities_data)
cities_df["City-Country"] = cities_df[["City", "Country"]].apply(lambda x: ', '.join(x[x.notnull()]), axis = 1)
cities_df = cities_df[["City-Country", "City", "Country", "Search Lats", "Search Lngs"]]

cities_df.head()

Unnamed: 0,City-Country,City,Country,Search Lats,Search Lngs
0,"rikitea, pf",rikitea,pf,-48.841207,-129.272584
1,"taolanaro, mg",taolanaro,mg,-66.111554,68.31664
2,"bredasdorp, za",bredasdorp,za,-87.130722,21.477709
3,"tuktoyaktuk, ca",tuktoyaktuk,ca,89.338145,-119.150144
4,"utinga, br",utinga,br,-11.136789,-42.962918


In [4]:
len(cities_df)

1147

### In the starter code example, 1,500 sets of randomly chosen latitude and longitude yielded 635 unique city names.  That means that 58% of randomly chosen latitude-longitude coordinates were duplicates and were rejected, if we read the documentation correctly.

### Since 2/3 of the surface area of the globe is water, there is a likelihood that 2/3 of the choices made by randomly selecting geocoordinates will be somewhere other than on land, which means that at least 2/3 of the cities selected by using random coordinates will be clustered on shorelines.  Bottom line, the real task is to randomly select geocoordinates only for the 1/3 of the planet surface that is land.

### Our solution is to reject any cities that are greater than 60 miles from the randomly chosen geocoordinates.  This requires me to measure the distance from the randomly chosen geocoordinates to the nearest city selected.  I may need to play with the method further to eyeball the best maximum distance.  With a highly iterative method, I could perhaps get maximum approximate equal distribution between cities to minimize any clustering tendencies. But I'm going to settle for an eyeball check on top of a pretty decent methodology.

### We note that this methodology might skew away from cities in more remote locations. 

In [5]:
# List of cities for which will will try to retrieve geocoordinates
cities_input = cities_df["City-Country"].tolist()

# ****NOTE IN LINE BELOW THAT head() IN FRONT OF .tolist() AFFECTS SIZE OF LISTS BUT IS USEFUL FOR TEST PURPOSES****
# cities_input = cities_df["City-Country"].head(30).tolist()

cities_input

['rikitea, pf',
 'taolanaro, mg',
 'bredasdorp, za',
 'tuktoyaktuk, ca',
 'utinga, br',
 'upernavik, gl',
 'ushuaia, ar',
 'verkhnevilyuysk, ru',
 'lompoc, us',
 'inderborskiy, kz',
 'hobart, au',
 'kinsale, ie',
 'hamilton, bm',
 'bengkulu, id',
 'wodonga, au',
 'port alfred, za',
 'micomeseng, gq',
 'puerto leguizamo, co',
 'aksu, kz',
 'albany, au',
 'papara, pf',
 'luwingu, zm',
 'khatanga, ru',
 'ngunguru, nz',
 'raga, sd',
 'yellowknife, ca',
 'broome, au',
 'sao felix do xingu, br',
 'vaini, to',
 'puerto ayora, ec',
 'homer, us',
 'avarua, ck',
 'butaritari, ki',
 'waipawa, nz',
 'punta arenas, cl',
 'kodiak, us',
 'narsaq, gl',
 'mataura, pf',
 'mount gambier, au',
 'richards bay, za',
 'kapaa, us',
 'khonuu, ru',
 'castro, cl',
 'marsabit, ke',
 'dambulla, lk',
 'busselton, au',
 'chagda, ru',
 'erzin, ru',
 'chokurdakh, ru',
 'bluff, nz',
 'alofi, nu',
 'dakar, sn',
 'nicoya, cr',
 'jamestown, sh',
 'belushya guba, ru',
 'hermanus, za',
 'jumla, np',
 'velyka oleksandrivka, 

### Perform Google API Calls for City Geocoordinates
* Gather geocoordinates for each city using a series of successive Google API calls.
* Include a print log of each city not found.


In [None]:
# Set empty lists to hold characters height and mass
target_cities = []
target_lats = []
target_lngs = []

# Loop through each character
for city1 in cities_input:
    
   # Try to extract latitude and longitude
    try:
        # Set url for API
        target_url = f'https://maps.googleapis.com/maps/api/geocode/json?address={city1}&key={g_key}'
        
        # Run a request to endpoint and convert result to json
        geo_data = requests.get(target_url).json()
        
        # Pause a few seconds to allow for processing delays
        time.sleep(1) 
        
        # Append returned latitude, longitude, and city
        target_lats.append(geo_data["results"][0]["geometry"]["location"]["lat"])
        target_lngs.append(geo_data["results"][0]["geometry"]["location"]["lng"])
        target_cities.append(city1) # THE ORDER OF THIS APPEND IS CRITICAL!!!
        
        # Printed as Check
        # print(f"{city1} found! Appending longitude and latitude")
        
    # Handle exceptions for cities whose geocoordiantes are not returned in the Google API
    except:
        # Append null values
        print(f"City {city1} not found")
        pass 

City papara, pf not found
City raga, sd not found
City gat, ly not found
City tim, ru not found
City constitucion, mx not found
City daru, pg not found
City oktyabrskoye, ru not found
City mikuni, jp not found
City naze, jp not found
City soe, id not found
City komsomolskiy, ru not found
City lata, sb not found
City sur, om not found
City labuhan, id not found
City bereda, so not found
City sola, vu not found
City borba, pt not found
City mutis, co not found
City mercedes, uy not found
City biltine, td not found
City saint-georges, gf not found
City muli, mv not found
City lapua, fi not found
City sale, au not found
City pop, uz not found
City santander, es not found
City buchanan, lr not found
City dinar, tr not found
City tokur, ru not found
City ngorongoro, tz not found
City along, in not found
City yamada, jp not found
City twentynine palms, us not found
City guider, cm not found
City mahon, es not found
City aras, no not found
City warrington, us not found
City asau, tv not found


In [None]:
# Calculate number of cities for which lat-lng data successfully extracted
# And verify lists to be merged into dataframe are of same length
print(f"Original Input Cities = {len(cities_input)}")
print(f"Cities Not Found = {len(cities_input) - len(target_cities)}")
print()
print(f"Target Cities = {len(target_cities)}")
print(f"Target Latitutdes = {len(target_lats)}")
print(f"Target Longitudes = {len(target_lngs)}")

In [None]:
# Populate new dataframe with extracted with successfully extracted city-county, latitude and longitude data
cities_dict = {"City-Country": target_cities, "Actual Lats": target_lats, "Actual Lngs": target_lngs}
cities_df2 = pd.DataFrame(cities_dict)

cities_df2.head()

In [None]:
len(cities_df2)

In [None]:
# Proof no duplicates were entered
cities_dedupe_df = cities_df.drop_duplicates(subset="City-Country")
cities_dedupe_df2 = cities_df2.drop_duplicates(subset="City-Country")

print(f"Number of records removed by deduplicaiton of Nearest City Search DF = {len(cities_df) - len(cities_dedupe_df)}")
print(f"Number of records removed by deduplicaiton of City Geocoordinates DF = {len(cities_df2) - len(cities_dedupe_df2)}")

In [None]:
# Perform inner merge on the 2 dataframes
df1 = cities_df
df2 = cities_df2
df = pd.merge(df1, df2, on="City-Country")

# Alternative syntax
# df = df1.merge(df2, on="City-Country")

df.head()

In [None]:
# Inner merge successful
len(df)

In [None]:
df.dtypes

### Calculate distance using the trigonometric Haversine Formula
![Trigonometry Explanation](../Images/trig_1.jpg)

In [None]:
# Create a list to capture distance calculations
distance_list = []

# radius of the Earth, 3,958.8 mi (6373.0 km)
# presumably a blend of polar and equatorial radii
R = 3958.8

for index, row in df.iterrows():
    
    # coordinates - Used to search for Nearest City
    lat1 = math.radians(row["Search Lats"])
    lon1 = math.radians(row["Search Lngs"])
    
    # coordinates - Actual for City Selected
    lat2 = math.radians(row["Actual Lats"])
    lon2 = math.radians(row["Search Lngs"])
    
    # change in coordinates
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    
    # Haversine formula
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    
    distance = R * c
    
    distance_list.append(distance)

# print(distance_list)

In [None]:
# For convenience
df["Distance (mi)"] = distance_list
df.head()

In [None]:
# Couunt the number of cities selected are in the combined data set
len(df)

In [None]:
# Count the number of cities selected are within 60 miles
len(df.loc[df["Distance (mi)"] <= 60])

### The original search for the nearest city involved 1,500 random geocoordinates.  This yielded just over 200 cities that were less than or equal to 60 miles from the original random geocoordinates.  As a result, we # expanded our random geocoordinates set by a factor of 2x or higher to increase the odds of selecting at least 500 target cities that were within the maximum 60 mile search radius. So, we reran the next time with 3,500 random geocoordinates.

In [None]:
# Remove rows that have Distance values greater than 60
cities_60mi_df = df[~(df["Distance (mi)"] > 60)]
cities_60mi_df.head()

### Perform OpenWeatherMap API calls for each city
* Perform a weather check on each city using a series of successive API calls.
* Include a print log of each city as it'sbeing processed (with the city number and city name).


In [None]:
# Save config information.
url = "http://api.openweathermap.org/data/2.5/weather?"
units = "imperial"

# Build partial query URL
query_url = f"{url}appid={weather_api_key}&units={units}&q="

In [None]:
cities_60mi_list = cities_60mi_df["City-Country"].tolist()
cities_60mi_list

In [None]:
# Create list of cities within the 60 mi parameter from which to query the OpenWeather API
cities_60mi_list = cities_60mi_df["City-Country"].tolist()

# set up lists to hold reponse info
cities_60_found_list = []
temp_max_list = []
humid_list = []
cloud_list = []
wind_list = []
time_list = []

# Loop through the list of cities and perform a request for data on each
for city_60mi in cities_60mi_list:
    
    # Try to extract maximum-temperature, humidity, cloudiness, and wind speed
    try:
        response = requests.get(query_url + city_60mi).json()
        
        # Pause a few seconds to allow for processing delays
        time.sleep(1) 
        
        temp_max_list.append(response["main"]["temp_max"])
        humid_list.append(response["main"]["humidity"])
        cloud_list.append(response["clouds"]["all"])
        wind_list.append(response["wind"]["speed"])
        timestamp_list.append(response["dt"])
        cities_60_found_list.append(city_60mi) 
        
        # Printed as Check
        print(f"{city_60mi} found! Appending maximum temperature, humidity, cloudiness, and wind speed.")

    # Handle exceptions for cities whose geocoordiantes are not returned in the Google API
    except:
        # Append null values
        print(f"City {city_60mi} not found")
        pass 

In [None]:
# Calculate number of cities for which weather data successfully extracted
# And verify lists to be merged into dataframe are of same length
print(f"Original Input Cities = {len(cities_60mi_list)}")
print(f"Cities Not Found = {len(cities_60mi_list) - len(cities_60_found_list)}")
print()
print(f"Target Cities = {len(cities_60_found_list)}")
print(f"Target Latitutdes = {len(temp_max_list)}")
print(f"Target Longitudes = {len(humid_list)}")
print(f"Target Longitudes = {len(cloud_list)}")
print(f"Target Longitudes = {len(wind_list)}")
print(f"Target Longitudes = {len(timestamp_list)}")

### Convert Raw Data to DataFrame
* Export the city data into a .csv.
* Display the DataFrame

In [None]:
# Populate new dataframe with extracted with successfully extracted weather data and city-country data
city_weather_dict = {"City-Country": cities_60_found_list, "Max Temp (F)": temp_max_list, "Humidity (%)": humid_list, \
                     "Cloud Cover (%)": cloud_list, "Wind Speed (mph)": wind_list, "Timestamp": timestamp_list}
city_weather_df = pd.DataFrame(city_weather_dict)

city_weather_df.head()

In [None]:
# Add column converting timestamp to date
city_weather_df["Date"] = pd.to_datetime(city_weather_df["Timestamp"], unit="s")

city_weather_df.head()

In [None]:
# Perform inner merge on the 2 dataframes
df01 = cities_60mi_df
df02 = city_weather_df
final_merge_df = pd.merge(df01, df02, on="City-Country")

# Alternative syntax
# final_merge_df = df01.merge(df02, on="City-Country")

final_merge_df.head()

In [None]:
# Inner merge successful
final_merge_dataset_size = len(final_merge_df)
final_merge_dataset_size

In [None]:
# Create a list of conditions for determining assigning hemisphere
hemisphere_conditions = [
    (final_merge_df["Actual Lats"] > 0),
    (final_merge_df["Actual Lats"] < 0),
    (final_merge_df["Actual Lats"] == 0)
    ]

# create a list of the values we want to assign for each condition
hemishere_values = ["Northern", "Southern", "Equator"]

In [None]:
final_merge_df["Hemisphere"] = np.select(hemisphere_conditions, hemishere_values)
final_merge_df.head()

In [None]:
hemisphere_df = pd.DataFrame(final_merge_df["Hemisphere"].value_counts())
hemisphere_df["Percent (%)"] = hemisphere_df["Hemisphere"] / final_merge_dataset_size *100
hemisphere_df

### The Northern Hemisphere has 68% of the Earth's land by area, while the Southern Hemisphere has 32%.  How close is our random distribution of cities to the disposition of landmass?

In [None]:
print(f"{round(hemisphere_df.iloc[0, 1], 1)}% of our randomly selected cities were in the Northern Hemisphere.")
print(f"That's {round(abs(68 - hemisphere_df.iloc[0, 1]))}% points variance from the known percentage landmass in the Northern Hemisphere.")

### Export dataframe with city name, country code, geocoordinates, search distance, and weather data to csv file
* We have included the original search geocoordinates that selected nearest cities and the actual city geocoordinates.

In [None]:
final_merge_df.to_csv(output_data_file, index = False)

## Inspect the data and remove the cities where the humidity > 100%.
----
Skip this step if there are no cities that have humidity > 100%. 

In [None]:
# Inspect for any rows with humidity > 100%
humid_over_100 = len(final_merge_df.loc[final_merge_df["Humidity (%)"] > 100])
print(f"Number of cities reporting humiidty > 100% = {humid_over_100}")
print(f'''
      100% is the maximum possible humidity measurement.
      So measurement above 100% representes an error.
      Any record that does so should be removed from the dataset.
     ''')

In [None]:
# Filter out rows with humidity levels > 100% (or skip if not present)
cw_filter_df1 = final_merge_df.loc[final_merge_df["Humidity (%)"] <= 100]
# cw_filter_df1.head()

In [None]:
len(cw_filter_df1)

In [None]:
#  Get the indices of cities that have humidity over 100%.
index_over_100_humid = final_merge_df.index[final_merge_df["Humidity (%)"] > 100].tolist()
index_over_100_humid

In [None]:
# ALTERNATIVE METHOD TO TASK PERFORMED 2 CELLS ABOVE
# Make a new DataFrame equal to the city data to drop all humidity outliers by index.
# Passing "inplace=False" will make a copy of the city_data DataFrame, which we call "clean_city_data".

cw_filter_df2 = final_merge_df.drop(index=index_over_100_humid)
# cw_filter_df2.head()

In [None]:
len(cw_filter_df2)


## Plotting the Data
* Use proper labeling of the plots using plot titles (including date of analysis) and axes labels.
* Save the plotted figures as .pngs.

In [None]:
# For convenience
dfx = cw_filter_df1

# Create separate dataframes for Northern and Southern Hemispheres
dfn = dfx.loc[dfx["Hemisphere"] == "Northern"]
dfs = dfx.loc[dfx["Hemisphere"] == "Southern"]

In [None]:
dfn.head()

## Latitude vs. Temperature Plot

In [None]:
ax1 = dfx.plot.scatter(x='Actual Lats',
                      y='Max Temp (F)',
                      title=f"City Latitude vs. Max Temperature ({date.today()})\n",
                      grid=True, 
                      c='DarkBlue')

# Set the x scale because otherwise it goes into weird negative numbers
ax1.set_xlim((-80, 80))

# Set the x-axis label
ax1.set_xlabel("Latitude (degrees)")

## Latitude vs. Humidity Plot

In [None]:
ax2 = dfx.plot.scatter(x='Actual Lats',
                      y='Humidity (%)',
                      title=f"City Latitude vs. Humidity ({date.today()})\n",
                      grid=True, 
                      c='DarkBlue')

# Set the x scale because otherwise it goes into weird negative numbers
ax2.set_xlim((-80, 80))

# Set the x-axis label
ax2.set_xlabel("Latitude (degrees)")

## Latitude vs. Cloudiness Plot

In [None]:
ax3 = dfx.plot.scatter(x='Actual Lats',
                      y='Cloud Cover (%)',
                      title=f"City Latitude vs. Cloudiness ({date.today()})\n",
                      grid=True, 
                      c='DarkBlue')

# Set the x scale because otherwise it goes into weird negative numbers
ax3.set_xlim((-80, 80))

# Set the x-axis label
ax3.set_xlabel("Latitude (degrees)")

## Latitude vs. Wind Speed Plot

In [None]:
ax4 = dfx.plot.scatter(x='Actual Lats',
                      y='Wind Speed (mph)',
                      title=f"City Latitude vs. Wind Speed ({date.today()})\n",
                      grid=True, 
                      c='DarkBlue')

# Set the x scale because otherwise it goes into weird negative numbers
ax4.set_xlim((-80, 80))

# Set the x-axis label
ax4.set_xlabel("Latitude (degrees)")

## Linear Regression

### Regression formula explanation
![Trigonometry Explanation](../Images/slopegraphlabel2.jpg)

####  Calculate Regression Line Coordinates - Northern Hemisphere

In [None]:
# Northern Hemisphere - Max Temperature vs. Latitude Linear Regression
# The polyfit function from numpy performs a least squares polynomial fit over the data that it is given. 
# We want a linear regression over the data in columns Yr and Tmax so we pass these as parameters. 
# The final parameter is the degree of the polynomial. For linear regression the degree is 1.
# Calculate y-axis coordinates and insert as column TLregr (North Hemisphere Temperature v Latitutde regression line)

d = np.polyfit(dfn['Actual Lats'],dfn['Max Temp (F)'],1)
f = np.poly1d(d)
dfn.insert(15,'TLregr',f(dfn['Actual Lats']))

# Calculate regression formula
TLn_m = d[0]
TLn_b = d[1]
TLn_rl = f"y = {round(TLn_m, 1)}x + {round(TLn_b, 1)}"
print(TLn_rl)

In [None]:
# Northern Hemisphere - Humidity vs. Latitude Linear Regression
# The polyfit function from numpy performs a least squares polynomial fit over the data that it is given. 
# We want a linear regression over the data in columns Yr and Tmax so we pass these as parameters. 
# The final parameter is the degree of the polynomial. For linear regression the degree is 1.
# Calculate y-axis coordinates and insert as column TLregr (North Hemisphere Temperature v Latitutde regression line)

d = np.polyfit(dfn['Actual Lats'],dfn['Humidity (%)'],1)
f = np.poly1d(d)
dfn.insert(16,'HLregr',f(dfn['Actual Lats']))

# Calculate regression formula
HLn_m = d[0]
HLn_b = d[1]
HLn_rl = f"y = {round(HLn_m, 1)}x + {round(HLn_b, 1)}"
print(HLn_rl)

In [None]:
# Northern Hemisphere - Cloudiness vs. Latitude Linear Regression
# The polyfit function from numpy performs a least squares polynomial fit over the data that it is given. 
# We want a linear regression over the data in columns Yr and Tmax so we pass these as parameters. 
# The final parameter is the degree of the polynomial. For linear regression the degree is 1.
# Calculate y-axis coordinates and insert as column TLregr (North Hemisphere Temperature v Latitutde regression line)

d = np.polyfit(dfn['Actual Lats'],dfn['Cloud Cover (%)'],1)
f = np.poly1d(d)
dfn.insert(17,'CLregr',f(dfn['Actual Lats']))

# Calculate regression formula
CLn_m = d[0]
CLn_b = d[1]
CLn_rl = f"y = {round(CLn_m, 1)}x + {round(CLn_b, 1)}"
print(CLn_rl)

In [None]:
# Northern Hemisphere - Wind Speed vs. Latitude Linear Regression
# The polyfit function from numpy performs a least squares polynomial fit over the data that it is given. 
# We want a linear regression over the data in columns Yr and Tmax so we pass these as parameters. 
# The final parameter is the degree of the polynomial. For linear regression the degree is 1.
# Calculate y-axis coordinates and insert as column TLregr (North Hemisphere Temperature v Latitutde regression line)

d = np.polyfit(dfn['Actual Lats'],dfn['Wind Speed (mph)'],1)
f = np.poly1d(d)
dfn.insert(18,'WLregr',f(dfn['Actual Lats']))

# Calculate regression formula
WLn_m = d[0]
WLn_b = d[1]
WLn_rl = f"y = {round(WLn_m, 1)}x + {round(WLn_b, 1)}"
print(WLn_rl)

In [None]:
dfn.head()

####  Calculate Regression Line Coordinates - Southern Hemisphere

In [None]:
# Southern Hemisphere - Max Temperature vs. Latitude Linear Regression
# The polyfit function from numpy performs a least squares polynomial fit over the data that it is given. 
# We want a linear regression over the data in columns Yr and Tmax so we pass these as parameters. 
# The final parameter is the degree of the polynomial. For linear regression the degree is 1.
# Calculate y-axis coordinates and insert as column TLregr (North Hemisphere Temperature v Latitutde regression line)

d = np.polyfit(dfs['Actual Lats'],dfs['Max Temp (F)'],1)
f = np.poly1d(d)
dfs.insert(15,'TLregr',f(dfs['Actual Lats']))

# Calculate regression formula
TLs_m = d[0]
TLs_b = d[1]
TLs_rl = f"y = {round(TLs_m, 1)}x + {round(TLs_b, 1)}"
print(TLs_rl)

In [None]:
# Southern Hemisphere - Humidity vs. Latitude Linear Regression
# The polyfit function from numpy performs a least squares polynomial fit over the data that it is given. 
# We want a linear regression over the data in columns Yr and Tmax so we pass these as parameters. 
# The final parameter is the degree of the polynomial. For linear regression the degree is 1.
# Calculate y-axis coordinates and insert as column TLregr (North Hemisphere Temperature v Latitutde regression line)

d = np.polyfit(dfs['Actual Lats'],dfs['Humidity (%)'],1)
f = np.poly1d(d)
dfs.insert(16,'HLregr',f(dfs['Actual Lats']))

# Calculate regression formula
HLs_m = d[0]
HLs_b = d[1]
HLs_rl = f"y = {round(HLs_m, 1)}x + {round(HLs_b, 1)}"
print(HLs_rl)

In [None]:
# Southern Hemisphere - Cloudiness vs. Latitude Linear Regression
# The polyfit function from numpy performs a least squares polynomial fit over the data that it is given. 
# We want a linear regression over the data in columns Yr and Tmax so we pass these as parameters. 
# The final parameter is the degree of the polynomial. For linear regression the degree is 1.
# Calculate y-axis coordinates and insert as column TLregr (North Hemisphere Temperature v Latitutde regression line)

d = np.polyfit(dfs['Actual Lats'],dfs['Cloud Cover (%)'],1)
f = np.poly1d(d)
dfs.insert(17,'CLregr',f(dfs['Actual Lats']))

# Calculate regression formula
CLs_m = d[0]
CLs_b = d[1]
CLs_rl = f"y = {round(CLs_m, 1)}x + {round(CLs_b, 1)}"
print(CLs_rl)

In [None]:
# Southern Hemisphere - Wind Speed vs. Latitude Linear Regression
# The polyfit function from numpy performs a least squares polynomial fit over the data that it is given. 
# We want a linear regression over the data in columns Yr and Tmax so we pass these as parameters. 
# The final parameter is the degree of the polynomial. For linear regression the degree is 1.
# Calculate y-axis coordinates and insert as column TLregr (North Hemisphere Temperature v Latitutde regression line)

d = np.polyfit(dfs['Actual Lats'],dfs['Wind Speed (mph)'],1)
f = np.poly1d(d)
dfs.insert(18,'WLregr',f(dfs['Actual Lats']))

# Calculate regression formula
WLs_m = d[0]
WLs_b = d[1]
WLs_rl = f"y = {round(WLs_m, 1)}x + {round(WLs_b, 1)}"
print(WLs_rl)

In [None]:
dfs.head()

####  Northern Hemisphere - Max Temp vs. Latitude Linear Regression

In [None]:
a = f"legend={TLn_rl}"

# Plot scatter
ax = dfn.plot.scatter(x='Actual Lats',
                      y='Max Temp (F)',
                      title=f"City Latitude vs. Max Temperature ({date.today()})\n Northern Hemisphere",
                      c='DarkBlue')

# Plot regression line, with a grid, omitting legend (important)
dfn.plot(x='Actual Lats', y='TLregr', color='Red', legend=False, grid=True, ax=ax)

# Set the x-axis label and regression line formula as annotation
ax.set_xlabel("Latitude (degrees)")
ax.annotate(f'{TLn_rl}', xy= (0, 0))

####  Southern Hemisphere - Max Temp vs. Latitude Linear Regression

In [None]:
# Plot scatter
ax = dfs.plot.scatter(x='Actual Lats',
                      y='Max Temp (F)',
                      title=f"City Latitude vs. Max Temperature ({date.today()})\n Southern Hemisphere",
                      c='DarkBlue')

# Plot regression line, with a grid, omitting legend (important)
dfs.plot(x='Actual Lats', y='TLregr', color='Red', legend=False, grid=True, ax=ax)

# Set the x-axis label and regression line formula as annotation
ax.set_xlabel("Latitude (degrees)")
ax.annotate(f'{TLs_rl}', xy= (-50, 80))

####  Northern Hemisphere - Humidity (%) vs. Latitude Linear Regression

In [None]:
# Plot scatter
ax = dfn.plot.scatter(x='Actual Lats',
                      y='Humidity (%)',
                      title=f"City Latitude vs. Humidity ({date.today()})\n Northern Hemisphere",
                      c='DarkBlue')

# Plot regression line, with a grid, omitting legend (important)
dfn.plot(x='Actual Lats', y='HLregr', color='Red', legend=False, grid=True, ax=ax)

# Set the x-axis label and regression line formula as annotation
ax.set_xlabel("Latitude (degrees)")
ax.annotate(f'{HLn_rl}', xy= (50, 30))

####  Southern Hemisphere - Humidity (%) vs. Latitude Linear Regression

In [None]:
# Plot scatter
ax = dfs.plot.scatter(x='Actual Lats',
                      y='Humidity (%)',
                      title=f"City Latitude vs. Humidity ({date.today()})\n Southern Hemisphere",
                      c='DarkBlue')

# Plot regression line, with a grid, omitting legend (important)
dfs.plot(x='Actual Lats', y='HLregr', color='Red', legend=False, grid=True, ax=ax)

# Set the x-axis label and regression line formula as annotation
ax.set_xlabel("Latitude (degrees)")
ax.annotate(f'{HLs_rl}', xy= (-45, 40))

####  Northern Hemisphere - Cloudiness (%) vs. Latitude Linear Regression

In [None]:
# Plot scatter
ax = dfn.plot.scatter(x='Actual Lats',
                      y='Cloud Cover (%)',
                      title=f"City Latitude vs. Cloudiness ({date.today()})\n Northern Hemisphere",
                      c='DarkBlue')

# Plot regression line, with a grid, omitting legend (important)
dfn.plot(x='Actual Lats', y='CLregr', color='Red', legend=False, grid=True, ax=ax)

# Set the x-axis label and regression line formula as annotation
ax.set_xlabel("Latitude (degrees)")
ax.annotate(f'{CLn_rl}', xy= (25, 25))

####  Southern Hemisphere - Cloudiness (%) vs. Latitude Linear Regression

In [None]:
# Plot scatter
ax = dfs.plot.scatter(x='Actual Lats',
                      y='Cloud Cover (%)',
                      title=f"City Latitude vs. Cloudiness ({date.today()})\n Southern Hemisphere",
                      c='DarkBlue')

# Plot regression line, with a grid, omitting legend (important)
dfs.plot(x='Actual Lats', y='CLregr', color='Red', legend=False, grid=True, ax=ax)

# Set the x-axis label and regression line formula as annotation
ax.set_xlabel("Latitude (degrees)")
ax.annotate(f'{CLs_rl}', xy= (-45, 50))

####  Northern Hemisphere - Wind Speed (mph) vs. Latitude Linear Regression

In [None]:
# Plot scatter
ax = dfn.plot.scatter(x='Actual Lats',
                      y='Wind Speed (mph)',
                      title=f"City Latitude vs. Wind Speed ({date.today()})\n Northern Hemisphere",
                      c='DarkBlue')

# Plot regression line, with a grid, omitting legend (important)
dfn.plot(x='Actual Lats', y='WLregr', color='Red', legend=False, grid=True, ax=ax)

# Set the x-axis label and regression line formula as annotation
ax.set_xlabel("Latitude (degrees)")
ax.annotate(f'{WLn_rl}', xy= (5, 27))

####  Southern Hemisphere - Wind Speed (mph) vs. Latitude Linear Regression

In [None]:
# Plot scatter
ax = dfs.plot.scatter(x='Actual Lats',
                      y='Wind Speed (mph)',
                      title=f"City Latitude vs. Wind Speed ({date.today()})\n Southern Hemisphere",
                      c='DarkBlue')

# Plot regression line, with a grid, omitting legend (important)
dfs.plot(x='Actual Lats', y='WLregr', color='Red', legend=False, grid=True, ax=ax)

# Set the x-axis label and regression line formula as annotation
ax.set_xlabel("Latitude (degrees)")
ax.annotate(f'{CLn_rl}', xy= (-45, 12.5))