In [3]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import scipy.stats as st
import pandas as pd
import numpy as np
import requests
import gmaps
import os
import datetime

In [4]:
# Read country file and store into Pandas data frame
land_temp_by_country_csv = "../../ProjectOne_datasets/GlobalLandTemperaturesByCountry.csv"
land_temp_by_country_df = pd.read_csv(land_temp_by_country_csv)
land_temp_by_country_df.tail()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
577457,2013-05-01,19.059,1.022,Zimbabwe
577458,2013-06-01,17.613,0.473,Zimbabwe
577459,2013-07-01,17.0,0.453,Zimbabwe
577460,2013-08-01,19.759,0.717,Zimbabwe
577461,2013-09-01,,,Zimbabwe


In [5]:
#Load Population Growth by Country dataset
population_growth_country_csv = "../../ProjectOne_datasets/population-by-country.csv"
population_growth_country_df = pd.read_csv(population_growth_country_csv)
population_growth_country_df.tail()

#Rename Column for World Population data
population_growth_country_df = population_growth_country_df.rename(columns={"Population by Country (Clio Infra (2016))":"Population",
                                                                           "Entity":"Country"})
population_growth_country_df.head()

Unnamed: 0,Country,Code,Year,Population
0,Afghanistan,AFG,1500,2000000.0
1,Afghanistan,AFG,1600,2500000.0
2,Afghanistan,AFG,1700,2500000.0
3,Afghanistan,AFG,1800,3280000.0
4,Afghanistan,AFG,1820,3280000.0


In [6]:
#Load Population Growth by Country dataset
co2_country_csv = "../../ProjectOne_datasets/annual-co-emissions-by-region.csv"
co2_country_df = pd.read_csv(co2_country_csv)
co2_country_df.tail()

#Rename Column for World Population data
co2_country_df = co2_country_df.rename(columns={"Entity":"Country"})
co2_country_df.head()

Unnamed: 0,Country,Code,Year,Annual CO2 emissions
0,Afghanistan,AFG,1750,0.0
1,Afghanistan,AFG,1751,0.0
2,Afghanistan,AFG,1752,0.0
3,Afghanistan,AFG,1753,0.0
4,Afghanistan,AFG,1754,0.0


In [7]:
print(f"Country Data Size: {len(land_temp_by_country_df)}")


Country Data Size: 577462


In [8]:
# Cleanup Datasets
#Drop AverageTemperatureUncertainty
land_temp_by_country_df = land_temp_by_country_df.drop(["AverageTemperatureUncertainty"], axis=1)

#Remove rows with null temperatures
land_temp_by_country_df = land_temp_by_country_df[land_temp_by_country_df["AverageTemperature"].notna()]

print(f"Country Data Size: {len(land_temp_by_country_df)}")

Country Data Size: 544811


In [9]:
#Retrieve rows for the last century
cent_land_temp_by_country_df = land_temp_by_country_df.loc[land_temp_by_country_df["dt"] >= "1900-01-01"]
clean_co2_df=co2_country_df.loc[co2_country_df['Year'] >= 1900]
co2_country_df = co2_country_df.loc[co2_country_df["Year"] >= 1900]

print(f"Country Size: {len(cent_land_temp_by_country_df)}")
clean_co2_df

Country Size: 328818


Unnamed: 0,Country,Code,Year,Annual CO2 emissions
150,Afghanistan,AFG,1900,0.0
151,Afghanistan,AFG,1901,0.0
152,Afghanistan,AFG,1902,0.0
153,Afghanistan,AFG,1903,0.0
154,Afghanistan,AFG,1904,0.0
...,...,...,...,...
63175,Zimbabwe,ZWE,2015,12170460.0
63176,Zimbabwe,ZWE,2016,10814761.0
63177,Zimbabwe,ZWE,2017,10246841.0
63178,Zimbabwe,ZWE,2018,11340575.0


In [10]:
bins = [1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010]
group_names = ["1900", "1910", "1920", "1930", "1940", "1950", "1960", "1970", "1980", "1990", "2000"]

clean_co2_df["Decade"] = pd.cut(clean_co2_df["Year"], bins, labels=group_names, include_lowest=True)

decade_co2 = clean_co2_df.groupby(['Decade', 'Country']).agg({'Annual CO2 emissions': ['mean']})
decade_co2


Unnamed: 0_level_0,Unnamed: 1_level_0,Annual CO2 emissions
Unnamed: 0_level_1,Unnamed: 1_level_1,mean
Decade,Country,Unnamed: 2_level_2
1900,Afghanistan,0.000000e+00
1900,Africa,9.998390e+06
1900,Albania,0.000000e+00
1900,Algeria,0.000000e+00
1900,Andorra,0.000000e+00
...,...,...
2000,Wallis and Futuna Islands,2.601440e+04
2000,World,2.945115e+10
2000,Yemen,1.983259e+07
2000,Zambia,2.144057e+06


In [11]:
global countries_temperatures_df 
columns = ["Decade", "Land Temperature", "Country"]
countries_temperatures_df = pd.DataFrame(columns = columns)

def get_surface_temps_by_country(country):
    global countries_temperatures_df 
    temp_df = cent_land_temp_by_country_df.loc[land_temp_by_country_df["Country"]==country]
    temp_df = temp_df.groupby((pd.DatetimeIndex(temp_df["dt"]).year//10)*10).mean()
    temp_df["Country"] = country

    #Reset index, and rename columns appropriately
    temp_df.reset_index(inplace=True)
    temp_df = temp_df.rename(columns = {'dt':'Decade'})
    temp_df = temp_df.rename(columns = {'AverageTemperature':'Land Temperature'})
    temp_df = temp_df.loc[temp_df["Decade"] != 2010]
    temp_df
    countries_temperatures_df = pd.concat([countries_temperatures_df, temp_df], axis=0)

In [12]:
global countries_population_df 
columns = ["Decade", "Population", "Country"]
countries_population_df = pd.DataFrame(columns = columns)

def get_population_by_country(country):
    global countries_population_df 
    
    temp_df = population_growth_country_df.loc[population_growth_country_df["Country"] == country]
    temp_df = temp_df.loc[temp_df["Year"] >= 1900]
    temp_df = temp_df.rename(columns = {'Year':'Decade'})
    temp_df = temp_df.drop(["Code"], axis=1)
    countries_population_df = pd.concat([countries_population_df, temp_df], axis=0)

In [13]:
# global countries_cO2_df
# columns = ["Decade", "CO2 Emissions", "Country"]
# countries_cO2_df = pd.DataFrame(columns = columns)

# def get_CO2_by_country(country):
#     global countries_cO2_df 
#     temp_df = co2_country_df.loc[co2_country_df["Entity"]==country]
#     temp_df = temp_df.groupby((temp_df["Year"]//10)*10).mean()
#     temp_df["Country"] = country

#     #Reset index, and rename columns appropriately
# #     temp_df.reset_index(inplace=True)
#     temp_df = temp_df.rename(columns = {'Entity':'Country'})
#     temp_df = temp_df.rename(columns = {'Year':'Decade'})
# #     temp_df = temp_df.rename(columns = {'AverageTemperature':'Land Temperature'})
#     temp_df = temp_df.loc[temp_df["Decade"] != 2010]
#     temp_df
#     countries_cO2_df = pd.concat([countries_cO2_df, temp_df], axis=0)

In [19]:
list_of_countries = ["United States", "China", "Brazil", "Denmark", "India"]

for country in list_of_countries:
    get_surface_temps_by_country(country)
    get_population_by_country(country)
#     get_CO2_by_country(country)

idx = pd.IndexSlice
countries_CO2_df=decade_co2.loc[idx[:,list_of_countries], :]
# countries_CO2_df.reset_index(drop=True)


# print(f"Temperature lenght: {len(countries_temperatures_df)}")
# print(f"Temperature lenght: {len(countries_population_df)}")     
countries_CO2_df


Unnamed: 0_level_0,Unnamed: 1_level_0,Annual CO2 emissions
Unnamed: 0_level_1,Unnamed: 1_level_1,mean
Decade,Country,Unnamed: 2_level_2
1900,United States,969736400.0
1910,United States,1493924000.0
1920,United States,1753823000.0
1930,United States,1556552000.0
1940,United States,2330513000.0
1950,United States,2710035000.0
1960,United States,3504613000.0
1970,United States,4650222000.0
1980,United States,4625146000.0
1990,United States,5527101000.0


In [20]:


#Final cleansed and merged datasets
final_df = pd.merge(countries_temperatures_df, countries_population_df, on=["Decade", "Country"])
# final_df.set_index(['Decade', 'Country'])
# final_df = pd.merge(final_df, countries_CO2_df, on=["Decade", "Country"])
# final_df = final_df.drop(["Code"], axis=1)
# final_df = final_df.drop(["Country_y"], axis=1)
# final_df = final_df.rename(columns = {'Country_x':'Country'})

print(f"Merged Data Set lenght: {len(final_df)}") 
final_df



Merged Data Set lenght: 880


Unnamed: 0,Decade,Land Temperature,Country,Population
0,1900,8.448225,United States,7.639100e+07
1,1900,8.448225,United States,7.639100e+07
2,1900,8.448225,United States,7.639100e+07
3,1900,8.448225,United States,7.639100e+07
4,1900,8.448225,United States,7.639100e+07
...,...,...,...,...
875,2000,24.670750,India,1.004124e+09
876,2000,24.670750,India,1.004124e+09
877,2000,24.670750,India,1.004124e+09
878,2000,24.670750,India,1.004124e+09


In [21]:
#Reorganize the column names for ease of redability
column_names = ["Country", "Decade", "Land Temperature", "Population"]
final_df = final_df.reindex(columns=column_names)
final_df

Unnamed: 0,Country,Decade,Land Temperature,Population
0,United States,1900,8.448225,7.639100e+07
1,United States,1900,8.448225,7.639100e+07
2,United States,1900,8.448225,7.639100e+07
3,United States,1900,8.448225,7.639100e+07
4,United States,1900,8.448225,7.639100e+07
...,...,...,...,...
875,India,2000,24.670750,1.004124e+09
876,India,2000,24.670750,1.004124e+09
877,India,2000,24.670750,1.004124e+09
878,India,2000,24.670750,1.004124e+09


In [None]:
# Push the remade DataFrame to a new CSV file
final_df.to_csv("../Output/USA_Dataset.csv",
                  encoding="utf-8", index=False, header=True)