Data source (EIA Open Data): https://www.eia.gov/opendata/qb.php?category=2251604

Region definitions (National Geographic): https://www.nationalgeographic.org/maps/united-states-regions/

# Imports

In [3]:
from itertools import combinations
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle 
import requests
import seaborn as sns
from scipy import stats

%matplotlib inline


# Methods / Globals / Constants

In [30]:
# Run the project's helper scripts. The -i flag executes each script in the
# notebook's interactive namespace, so the scripts can use names already
# defined here (e.g. the imports above) and whatever they define stays
# visible to later cells.
# NOTE(review): region_maker and _REGIONS, used further down, are presumably
# defined by these scripts — confirm.
%run -i "scripts//methods.py"
%run -i "scripts//global_constants_attributes_script.py"
%run -i "scripts//global_attributes_script.py"

# Data Prep & Visuals

In [33]:
# Run the data-preparation script in the notebook's namespace (-i).
# NOTE(review): df and population_df, used by the cells below, are presumably
# created by this script — confirm.
%run -i "scripts//data_prep.py"

# Explore Data

In [None]:
# Preview the first rows, then the per-column summary statistics
# (transposed so each column of df becomes one row of the summary).
display(df.head())
display(df.describe().T)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
df.info()
print('Shape =', df.shape)

In [None]:
# Two stacked panels: a histogram of Carbon_Emissions on top and a horizontal
# bar chart of Carbon_Emissions per State (ascending order) underneath.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10,20))

axes[0].hist(df['Carbon_Emissions'])
axes[0].set_title('Carbon Emissions Distribution')
axes[0].set_xlabel('Carbon Emissions')
axes[0].set_ylabel('Count')

# Sort once and reuse the result for both the bar labels and the bar widths
# instead of sorting the frame twice.
emissions_sorted = df.sort_values(by=['Carbon_Emissions'])
axes[1].barh(y=emissions_sorted['State'],
             width=emissions_sorted['Carbon_Emissions'])
axes[1].set_title('Carbon Emissions by State')
axes[1].set_xlabel('Carbon Emissions')
axes[1].set_ylabel('State')

plt.tight_layout()

In [None]:
# Configure the seaborn theme BEFORE creating the figure: sns.set() only
# affects plots drawn after it is called, so setting it last left this chart
# unstyled on a fresh "Restart & Run All".
sns.set(context='notebook',
        style='darkgrid',
        palette='deep',
        font='sans-serif',
        font_scale=5)

plt.figure(figsize=(50, 50))
# Pass x and y by keyword: newer seaborn releases only accept `data`
# positionally and raise a TypeError for positional x/y vectors.
sns.barplot(x=df.Carbon_Emissions, y=df.State, color='r')
plt.xlabel('CO2_Emissions(million metric tons)')
plt.ylabel('US States')
plt.title('Total CO2 Emissions by State')

# Assign Regions

In [None]:
def _state_to_region(state_name):
    """Return region_maker's result for one state, using the _REGIONS lookup."""
    return region_maker(state_name, _REGIONS)


# Add a 'Region' column derived from each row's 'State' value.
df['Region'] = df['State'].apply(_state_to_region)

# Hypothesis Tests

#### nCk: all "5 choose 2" pairings for the two-sample t-test
We compare each region's carbon emissions against every other region's (5 choose 2 = 10 pairwise tests).
    
    Regions: 
        south_east
        north_east 
        mid_west
        west_coast 
        south_west

In [None]:
# Carbon emissions of each region as NumPy arrays (one array per region).
se = df[df['Region'] == 'south_east']['Carbon_Emissions'].to_numpy()
ne = df[df['Region'] == 'north_east']['Carbon_Emissions'].to_numpy()
mw = df[df['Region'] == 'mid_west']['Carbon_Emissions'].to_numpy()
wc = df[df['Region'] == 'west_coast']['Carbon_Emissions'].to_numpy()
sw = df[df['Region'] == 'south_west']['Carbon_Emissions'].to_numpy()

# Map each short region code to its sample so pairs can be looked up directly,
# rather than building source code as a string and running it through exec().
region_samples = {'se': se, 'ne': ne, 'mw': mw, 'wc': wc, 'sw': sw}

regions = list(region_samples)
nCk_regions = list(combinations(regions, 2))

# One entry per unordered pair of regions:
# (result of the independent two-sample t-test, (code_a, code_b)).
hypothesis = [
    (stats.ttest_ind(region_samples[a], region_samples[b]), (a, b))
    for a, b in nCk_regions
]

In [None]:
# Show the collected results: one (t-test result, region pair) tuple per pair.
hypothesis

In [None]:
# Reshape population_df step by step. Every step depends on the layout left
# by the previous one, so the statement order matters.
# NOTE(review): the cell mutates population_df in place and then rebinds it,
# so re-running it without reloading the data will probably not reproduce the
# same result — confirm before re-executing.

# Use the row at position 2 as the column labels.
population_df.columns = population_df.iloc[2]

# Normalise the column labels: the NaN label becomes 'States' and the float
# year labels 2011.0-2019.0 become ints. The 'Census', 'Estimates Base' and
# 2010 entries map each label to itself, i.e. they leave it unchanged.
population_df.rename(columns = {np.nan: 'States',         
                                'Census': 'Census', 
                                'Estimates Base': 'Estimates Base',             
                                2010 : 2010,
                                2011.0 : 2011,
                                2012.0 : 2012,
                                2013.0 : 2013,
                                2014.0 : 2014,
                                2015.0 : 2015,           
                                2016.0 : 2016,           
                                2017.0 : 2017,           
                                2018.0 : 2018,
                                2019.0 : 2019 
                               }, 
                     inplace=True)

# Drop the rows labelled 0, 1 and 2.
# NOTE(review): presumably the title/header rows above the data — confirm.
population_df.drop(range(3), inplace=True)

# Drop the rows labelled 59 through 65.
# NOTE(review): presumably trailing footnote rows — confirm against the raw file.
population_df.drop(list(range(59, 66)), inplace=True)

# Remove every '.' character from the values in the 'States' column.
population_df['States'] = population_df['States'].map(lambda x: x.replace('.', ''))

# Transpose: the former columns become rows and vice versa.
population_df = population_df.T

# Use the first row of the transposed frame as the column labels.
population_df.columns = population_df.iloc[0]

# Move the current index into a regular column.
population_df.reset_index(inplace = True)

# Rename the column labelled 2 to 'Year'.
# NOTE(review): presumably this is the column just created by reset_index — confirm.
population_df.rename(columns = {2: 'Year'}, inplace =True)

# Drop the rows labelled 0, 1 and 2 of the transposed frame.
# NOTE(review): presumably the non-year rows (States / Census / Estimates Base) — confirm.
population_df.drop(range(3), inplace = True)

# Renumber the rows from 0; the previous index is kept as an 'index' column...
population_df.reset_index(inplace = True)

# ...which is discarded straight away.
population_df.drop(columns=['index'], inplace=True)

# Clear the name attached to the row index.
population_df.index.name = None

# Rebuild the column labels from level 0 of the column index.
population_df.columns = population_df.columns.get_level_values(0)

In [None]:
# One-way chi-square test on six observed counts. No expected frequencies are
# passed, so scipy falls back to its default for f_exp.
observed_counts = [16, 18, 16, 14, 12, 12]
stats.chisquare(observed_counts)