In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import scipy.stats as st
import pandas as pd
import numpy as np
import requests
import gmaps
import os
import datetime

In [2]:
# Load the per-country land temperature CSV into a DataFrame and preview its last rows
land_temp_by_country_csv = "../../ProjectOne_datasets/GlobalLandTemperaturesByCountry.csv"
land_temp_by_country_df = pd.read_csv(filepath_or_buffer=land_temp_by_country_csv)
land_temp_by_country_df.tail(n=5)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
577457,2013-05-01,19.059,1.022,Zimbabwe
577458,2013-06-01,17.613,0.473,Zimbabwe
577459,2013-07-01,17.0,0.453,Zimbabwe
577460,2013-08-01,19.759,0.717,Zimbabwe
577461,2013-09-01,,,Zimbabwe


In [3]:
# Load the Population Growth by Country dataset
population_growth_country_csv = "../../ProjectOne_datasets/population-by-country.csv"

# Rename the source columns to the short names used in the rest of the notebook
# ("Entity" -> "Country", matching the column name in the temperature data).
# The rename is chained onto the read so the cell has a single assignment; the
# former mid-cell `.tail()` call was removed because its result was never
# displayed (only a cell's last expression is rendered).
population_growth_country_df = pd.read_csv(population_growth_country_csv).rename(
    columns={
        "Population by Country (Clio Infra (2016))": "Population",
        "Entity": "Country",
    }
)
population_growth_country_df.head()

Unnamed: 0,Country,Code,Year,Population
0,Afghanistan,AFG,1500,2000000.0
1,Afghanistan,AFG,1600,2500000.0
2,Afghanistan,AFG,1700,2500000.0
3,Afghanistan,AFG,1800,3280000.0
4,Afghanistan,AFG,1820,3280000.0


In [4]:
# Report how many rows the raw temperature dataset contains
print(f"Country Data Size: {land_temp_by_country_df.shape[0]}")


Country Data Size: 577462


In [5]:
# Cleanup Datasets
# Drop AverageTemperatureUncertainty. errors="ignore" keeps this cell re-runnable:
# without it, a second execution raises KeyError because the column is already gone.
land_temp_by_country_df = land_temp_by_country_df.drop(
    columns=["AverageTemperatureUncertainty"], errors="ignore"
)

# Remove rows with null temperatures
land_temp_by_country_df = land_temp_by_country_df.dropna(subset=["AverageTemperature"])

print(f"Country Data Size: {len(land_temp_by_country_df)}")

Country Data Size: 544811


In [6]:
# Keep only the observations dated 1900-01-01 or later.
# `dt` is compared as text against the cutoff; the previewed rows show it in
# YYYY-MM-DD form, for which text ordering matches chronological ordering.
cent_land_temp_by_country_df = land_temp_by_country_df[
    land_temp_by_country_df["dt"].ge("1900-01-01")
]

print(f"Country Size: {cent_land_temp_by_country_df.shape[0]}")


Country Size: 328818


In [7]:
# Inspect the column dtypes of the 1900-onward frame; the output below lists `dt` as object rather than a datetime dtype
cent_land_temp_by_country_df.dtypes

dt                     object
AverageTemperature    float64
Country                object
dtype: object

In [8]:
# USA Data for Surface Temperatures
# Filter to United States rows. The mask is built from the same frame it indexes
# (it previously came from land_temp_by_country_df and only worked through
# pandas' index alignment of the boolean mask).
usa_temp_rows_df = cent_land_temp_by_country_df.loc[
    cent_land_temp_by_country_df["Country"] == "United States"
]

# Decade bucket of each observation, e.g. 1987 -> 1980
usa_decade = (pd.to_datetime(usa_temp_rows_df["dt"]).dt.year // 10) * 10

# Average only the numeric temperature column per decade. Calling .mean() on the
# whole grouped frame would also try to average the string columns (dt, Country),
# which raises TypeError on pandas >= 2.0. Naming the grouping key "Decade" and
# the result "Land Temperature" yields the final column names directly, and
# reset_index() returns a new frame instead of mutating in place.
usa_land_temp_by_country_df = (
    usa_temp_rows_df.groupby(usa_decade.rename("Decade"))["AverageTemperature"]
    .mean()
    .rename("Land Temperature")
    .reset_index()
    .assign(Country="United States")
)

# Exclude the 2010 bucket.
# NOTE(review): presumably because that decade is incomplete in the source data
# (the file's last previewed rows are from 2013) - confirm.
usa_land_temp_by_country_df = usa_land_temp_by_country_df.loc[
    usa_land_temp_by_country_df["Decade"] != 2010
]
usa_land_temp_by_country_df

Unnamed: 0,Decade,Land Temperature,Country
0,1900,8.448225,United States
1,1910,8.46025,United States
2,1920,8.75525,United States
3,1930,9.086992,United States
4,1940,8.924092,United States
5,1950,8.913233,United States
6,1960,8.721917,United States
7,1970,8.696925,United States
8,1980,9.104308,United States
9,1990,9.317558,United States


In [9]:
# USA Data for Population Growth
# Select the United States rows from 1900 onward with one combined mask, then
# relabel the Year column as Decade
usa_population_growth_country_df = population_growth_country_df.loc[
    (population_growth_country_df["Country"] == "United States")
    & (population_growth_country_df["Year"] >= 1900)
].rename(columns={"Year": "Decade"})
usa_population_growth_country_df

Unnamed: 0,Country,Code,Decade,Population
3081,United States,USA,1900,76391000.0
3082,United States,USA,1910,92767000.0
3083,United States,USA,1920,106881000.0
3084,United States,USA,1930,123668000.0
3085,United States,USA,1940,132637000.0
3086,United States,USA,1950,152271000.0
3087,United States,USA,1960,180671000.0
3088,United States,USA,1970,205052000.0
3089,United States,USA,1980,227726463.0
3090,United States,USA,1990,250131894.0


In [10]:
# Final cleansed and merged dataset.
# Both inputs carry a Country column, so the merge suffixes them _x/_y:
# keep one copy under the plain name, and discard the duplicate along with Code.
final_df = (
    usa_land_temp_by_country_df
    .merge(usa_population_growth_country_df, on="Decade")
    .drop(columns=["Code", "Country_y"])
    .rename(columns={"Country_x": "Country"})
)

final_df

Unnamed: 0,Decade,Land Temperature,Country,Population
0,1900,8.448225,United States,76391000.0
1,1910,8.46025,United States,92767000.0
2,1920,8.75525,United States,106881000.0
3,1930,9.086992,United States,123668000.0
4,1940,8.924092,United States,132637000.0
5,1950,8.913233,United States,152271000.0
6,1960,8.721917,United States,180671000.0
7,1970,8.696925,United States,205052000.0
8,1980,9.104308,United States,227726463.0
9,1990,9.317558,United States,250131894.0


In [11]:
# Reorder the columns for ease of readability
column_names = ["Country", "Decade", "Land Temperature", "Population"]
final_df = final_df.reindex(column_names, axis="columns")
final_df

Unnamed: 0,Country,Decade,Land Temperature,Population
0,United States,1900,8.448225,76391000.0
1,United States,1910,8.46025,92767000.0
2,United States,1920,8.75525,106881000.0
3,United States,1930,9.086992,123668000.0
4,United States,1940,8.924092,132637000.0
5,United States,1950,8.913233,152271000.0
6,United States,1960,8.721917,180671000.0
7,United States,1970,8.696925,205052000.0
8,United States,1980,9.104308,227726463.0
9,United States,1990,9.317558,250131894.0


In [12]:
# Push the remade DataFrame to a new CSV file.
# Create the output folder first so the write does not fail when it is missing.
os.makedirs("../Output", exist_ok=True)
final_df.to_csv("../Output/USA_Dataset.csv",
                encoding="utf-8", index=False, header=True)