# Carbon Emissions Around the World with a focus on the United States

#### Before running: create your virtual environment, make sure you're in the project directory, and install the packages listed in requirements.txt.

In [32]:
# Import modules needed to run project.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math


## Read in the Data

In [33]:
# Load the raw CO2 emissions CSV into a DataFrame.
emissions_csv = 'assets/co2_emissions_kt_by_country.csv'
df = pd.read_csv(emissions_csv)

# .info() summarises the row count, column names, dtypes and non-null counts.
df.info()
# .head() previews the first five rows so the column headers and sample values are visible.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13953 entries, 0 to 13952
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   country_code  13953 non-null  object 
 1   country_name  13953 non-null  object 
 2   year          13953 non-null  int64  
 3   value         13953 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 436.2+ KB


Unnamed: 0,country_code,country_name,year,value
0,ABW,Aruba,1960,11092.675
1,ABW,Aruba,1961,11576.719
2,ABW,Aruba,1962,12713.489
3,ABW,Aruba,1963,12178.107
4,ABW,Aruba,1964,11840.743


## Clean the Data

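## TODO: consider revising this step
- #### Drop unnecessary columns for better readability and overall cleanliness. We don't need both the country name and the country code, so the country code could be dropped. (The drop is left commented out below because the country code is used later to filter out rows that are not countries.)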




In [23]:
# NOTE: this cell is left disabled. The country-code column is used further down (renamed to
# 'Country Code') to separate real countries from aggregate rows, so it must not be dropped here.
# For reference, two equivalent ways to drop a column:
#   ndf = df.drop(columns='country_code')            # returns a new frame; df itself is untouched
#   df.drop(columns='country_code', inplace=True)    # use inplace if you want to overwrite the original df
# Use .info() to check the dataframe again:
#   df.info()

- ##### The "value" column is currently reported to the thousandths place. That level of precision isn't necessary for such a simple demonstration, and dropping it won't noticeably distort the data once it's visualized later, so let's round the values to the hundredths place instead.

In [34]:
# Keep two decimal places in the emissions figures; more precision is not needed for the charts.
df = df.assign(value=df['value'].round(2))
df.head()

Unnamed: 0,country_code,country_name,year,value
0,ABW,Aruba,1960,11092.68
1,ABW,Aruba,1961,11576.72
2,ABW,Aruba,1962,12713.49
3,ABW,Aruba,1963,12178.11
4,ABW,Aruba,1964,11840.74


- #### Rename columns for clarity. The reader won't know what unit the emissions were measured in from a column simply named "value", so let's make that name more descriptive. Furthermore, I can reformat the other column names to achieve a cleaner, more professional look.

In [35]:
# .rename() takes a {old_name: new_name} mapping. Re-assigning the result to df keeps the
# renamed frame under the same variable; assign it to a new variable (e.g. df2) instead if
# the original column names should stay available.
column_labels = {
    'value': 'Carbon Emissions (in kilotons)',
    'country_name': 'Country',
    'year': 'Year',
    'country_code': 'Country Code',
}
df = df.rename(columns=column_labels)

## Analyze the data

- #### What quick insights can be gathered from this data?
    - ##### How about I try to determine which 5 countries are emitting the most carbon as of 2019?

In [41]:
# Keep only the rows for the year of interest.
latest_year = 2019
filtered_df = df[df['Year'] == latest_year]
# Rank from highest to lowest emissions. Every remaining row has the same year, so there is
# nothing to gain from sorting by 'Year' first.
sorted_df = filtered_df.sort_values(by='Carbon Emissions (in kilotons)', ascending=False)
# Show the five largest entries.
sorted_df.head()

Unnamed: 0,Country Code,Country,Year,Carbon Emissions (in kilotons)
13651,WLD,World,2019,34344006.07
5569,IBT,IDA & IBRD total,2019,22442415.57
7578,LMY,Low & middle income,2019,21909426.5
8254,MIC,Middle income,2019,21712598.06
5509,IBD,IBRD only,2019,21522419.75


- ##### Well that's odd, isn't it? I don't see a country in that list, do you? It appears that our dataframe contains additional data points beyond individual countries: aggregates such as "World" and "Middle income". What can I do about that? Can I filter the dataframe so that only countries are shown? Let's find out!

- ##### I'm going to need something that lets me filter these rows down to actual countries. A simple Google search pointed me to a helpful GitHub repository that maintains a compiled CSV file of official country codes. Fortunately, my data has a column that uses alpha-3 country codes. AHA! I've found the variable by which I can filter my data. (A link to the raw URL of the data can be found in the README. Additionally, I've copied that data to a CSV in case something happens to the URL. That can also be found in the README under "backup_country_codes.csv".)

In [37]:

# Country-code list published on GitHub (lukes/ISO-3166-Countries-with-Regional-Codes). Thanks!
country_codes_url = 'https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv'
country = pd.read_csv(country_codes_url)
# Preview the first rows to see which columns are available.
country.head()


Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,


In [43]:
# Only the alpha-3 codes are needed from the lookup table.
country_codes = country.loc[:, 'alpha-3']
# True for rows whose code appears in the official list, False for aggregates such as "World".
is_listed_country = df['Country Code'].isin(country_codes)
code_filtered_df = df.loc[is_listed_country]
# Check our work: the row count should be smaller than in the original frame.
code_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11094 entries, 0 to 13952
Data columns (total 4 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Country Code                    11094 non-null  object 
 1   Country                         11094 non-null  object 
 2   Year                            11094 non-null  int64  
 3   Carbon Emissions (in kilotons)  11094 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 433.4+ KB


- ##### Let's check our top 5 again. Our top 5 carbon-emitting countries happen to be China, USA, India, Russian Federation, and Japan.

In [44]:
# Narrow the country-only frame to 2019. NOTE: this rebinds code_filtered_df, so from this
# cell onward it holds a single year rather than the full history.
code_filtered_df = code_filtered_df[code_filtered_df['Year'] == 2019]
# Rank countries from highest to lowest emissions. Every row has the same year, so there is
# nothing to gain from sorting by 'Year' first.
new_df = code_filtered_df.sort_values(by='Carbon Emissions (in kilotons)', ascending=False)
# Show the five largest emitters.
new_df.head()

Unnamed: 0,Country Code,Country,Year,Carbon Emissions (in kilotons)
2159,CHN,China,2019,10707219.73
13268,USA,United States,2019,4817720.21
5869,IND,India,2019,2456300.05
10753,RUS,Russian Federation,2019,1703589.97
6409,JPN,Japan,2019,1081569.95


- ##### Manipulate data for easier visualization later. Now that I've determined our top 5 carbon-emitting countries, I've gone ahead and created individual dataframes for them. As you'll see later in the analysis, creating these unique dataframes will allow me to make graphs a bit more easily.

In [None]:
# Create df focused solely on USA data. If you read the line of code, then you'll see that we're creating a dataframe using data that is only associated with entries that contain "United States."
usa = df[df['Country'].str.contains('United States')]
# Use .info() to determine how much data will be accounted for in the USA-focused dataframe.
usa.info()

- ##### Now that I've shown you how to create country-specific data frames, let's go ahead and create the other four dataframes that we'll be comparing the USA to.

In [None]:
# Create df for other prominent countries for future comparisons
chn = df[df['Country'].str.contains('China')]
ind = df[df['Country'].str.contains('India')]
jpn = df[df['Country'].str.contains('Japan')]
rus = df[df['Country'].str.contains('Russian Federation')]

In [None]:
# Check whether the four comparison frames have the same number of rows and columns as the USA frame.
for country_frame in (chn, ind, jpn, rus):
    country_frame.info()


- ##### China seems to be the only divergent dataframe in terms of data output. Let's investigate to see what happened.

In [None]:
# Display the frame to see which rows the 'China' substring filter actually selected.
chn

- ##### It seems as though "China" appears in the country name of several different entries in the original dataframe, so the substring search picked up more than one of them. I can make a slight modification to the code so that it's stricter in its text selection: changing `.contains` to `.match` means the name is matched from the start of the string rather than anywhere inside it, so only rows whose country name is "China" are selected. With the code below, the chn dataframe is now in line with the other country-specific dataframes created above.


In [None]:
# str.match() anchors the pattern only at the START of the string, so a bare 'China' pattern
# would also accept any longer name that merely begins with "China". The trailing `$` anchors
# the end as well, so only the name "China" itself is kept.
chn = df[df['Country'].str.match(r'China$')]
# Confirm the row count now lines up with the other country frames.
chn.info()
chn

## Visualize the data

##### Plotting the US Two ways

In [None]:
# First way: pandas' built-in .plot implementation to visualize a line graph.
# The call is left commented out; uncomment it to render the chart.
# usa.plot.line(x ='Year', rot= 45, y= 'Carbon Emissions (in kilotons)', title = 'United States Carbon Emissions', figsize= (10, 5), grid= True)

In [None]:
# Second way: build the same graph with matplotlib's explicit figure/axes interface.
years = usa['Year']
emissions = usa['Carbon Emissions (in kilotons)']

fig, ax = plt.subplots()
ax.plot(years, emissions)
ax.set(xlabel='Year', ylabel='Emissions  (in kilotons)', title="United States Carbon Emissions")
ax.ticklabel_format(style='plain')  # plain tick labels instead of scientific notation

# Trendline: fit a degree-1 polynomial (a straight line) and evaluate it at every year.
z = np.polyfit(years, emissions, 1)
p = np.poly1d(z)
ax.plot(years, p(years))
plt.show()



In [None]:
# Comparing USA and Russia emissions with accompanying trendlines.
fig, ax = plt.subplots()
# Every artist carries its own label so the legend does not depend on the order in which
# matplotlib collects its handles, and the trendlines are identified as well.
ax.scatter(usa['Year'], usa['Carbon Emissions (in kilotons)'], label='USA')
ax.scatter(rus['Year'], rus['Carbon Emissions (in kilotons)'], label='Russia')
ax.set_xlabel('Year')
ax.set_ylabel('Emissions  (in kilotons)')
ax.set_title("Carbon Emissions: USA vs Russia")
ax.ticklabel_format(style='plain')  # plain tick labels instead of scientific notation
# Fit a straight line (degree-1 polynomial) to each country's series.
z = np.polyfit(usa['Year'], usa['Carbon Emissions (in kilotons)'], 1)
zz = np.polyfit(rus['Year'], rus['Carbon Emissions (in kilotons)'], 1)
p = np.poly1d(z)
pp = np.poly1d(zz)
ax.plot(usa['Year'], p(usa['Year']), label='USA trend')
ax.plot(rus['Year'], pp(rus['Year']), label='Russia trend')
# With labels set on the artists, legend() picks them up automatically.
ax.legend()
plt.show()

In [None]:
# Focusing on just the 1990s in the USA.
# Select on the 'Year' value (both bounds inclusive) instead of hard-coded row labels, which
# only line up with 1990-1999 for one particular ordering of the source file.
usa[usa['Year'].between(1990, 1999)]

In [None]:
# Experimenting with a focus on the 1990s trends in USA.
# Select the decade once, by 'Year' value (both bounds inclusive), instead of repeating
# hard-coded row labels that only match 1990-1999 for one particular ordering of the source file.
usa_1990s = usa[usa['Year'].between(1990, 1999)]

fig, ax = plt.subplots()
ax.plot(usa_1990s['Year'], usa_1990s['Carbon Emissions (in kilotons)'])
ax.set_xlabel('Year')
ax.set_ylabel('Emissions  (in kilotons)')
ax.set_title("United States Carbon Emissions")
ax.ticklabel_format(style='plain')  # plain tick labels instead of scientific notation
# Calculate and plot the trendline: fit a straight line and evaluate it at each year.
z = np.polyfit(usa_1990s['Year'], usa_1990s['Carbon Emissions (in kilotons)'], 1)
p = np.poly1d(z)
ax.plot(usa_1990s['Year'], p(usa_1990s['Year']))
plt.show()