In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
def show_summary(df):
    """Print a plain-text overview of a DataFrame.

    The report is written to standard output in this order: dimensions,
    column labels, per-column dtypes, per-column missing-value counts and
    per-column distinct-value counts. Each section is introduced by a rule
    made of 80 "+" characters.

    Args:
        df: The pandas DataFrame to describe.

    Returns:
        None.
    """
    rule = "+" * 80

    print(rule)
    print(f"DIMENSIONS : ({df.shape[0]}, {df.shape[1]})")

    # Heading plus a deferred computation, so each statistic is only evaluated
    # right before it is printed (same order of work as a straight-line script).
    sections = (
        ("COLUMNS: \n", lambda: df.columns.values),
        ("DATA INFO: \n", lambda: df.dtypes),
        ("MISSING VALUES: \n", lambda: df.isnull().sum()),
        ("NUMBER OF UNIQUE VALUES: \n", lambda: df.nunique()),
    )
    for heading, compute in sections:
        print(rule)
        print(heading)
        print(compute())

In [4]:
# Load the raw climate dataset from the working directory and print its
# structural summary (dimensions, columns, dtypes, missing and unique counts).
df = pd.read_csv("climate_change_data.csv")
show_summary(df)

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
DIMENSIONS : (10000, 9)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
COLUMNS: 

['Date' 'Location' 'Country' 'Temperature' 'CO2 Emissions'
 'Sea Level Rise' 'Precipitation' 'Humidity' 'Wind Speed']
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
DATA INFO: 

Date               object
Location           object
Country            object
Temperature       float64
CO2 Emissions     float64
Sea Level Rise    float64
Precipitation     float64
Humidity          float64
Wind Speed        float64
dtype: object
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
MISSING VALUES: 

Date              0
Location          0
Country           0
Temperature       0
CO2 Emissions     0
Sea Level Rise    0
Precipitation     0
Humidity          0
Wind Speed        0
dtype: int64
+++++++++++++++++++++++++++++++++++++++++++++++++++

In [5]:
# Preview the first rows of the raw frame before any cleaning.
df.head()

Unnamed: 0,Date,Location,Country,Temperature,CO2 Emissions,Sea Level Rise,Precipitation,Humidity,Wind Speed
0,2000-01-01 00:00:00.000000000,New Williamtown,Latvia,10.688986,403.118903,0.717506,13.835237,23.631256,18.492026
1,2000-01-01 20:09:43.258325832,North Rachel,South Africa,13.81443,396.663499,1.205715,40.974084,43.982946,34.2493
2,2000-01-02 16:19:26.516651665,West Williamland,French Guiana,27.323718,451.553155,-0.160783,42.697931,96.6526,34.124261
3,2000-01-03 12:29:09.774977497,South David,Vietnam,12.309581,422.404983,-0.475931,5.193341,47.467938,8.554563
4,2000-01-04 08:38:53.033303330,New Scottburgh,Moldova,13.210885,410.472999,1.135757,78.69528,61.789672,8.001164


In [6]:
# Parse the raw "Date" strings as timestamps and keep only the calendar day.
# NOTE: `.dt.date` yields plain Python `date` objects, so the column is stored
# with object dtype rather than datetime64.
df["Date"] = pd.to_datetime(df["Date"]).dt.date

In [7]:
# Unit conversions:
#   * temperature: Celsius -> Fahrenheit, stored in a new "Temperature (F)" column
#   * wind speed: scaled by the kilometre-to-mile factor
#     (source unit is presumably km/h -- TODO confirm against the dataset docs)
# The frame is rebuilt with `.assign` and the column selection in one expression,
# so `df` is only rebound once every step has succeeded. The wind-speed column is
# never rescaled in place, which avoids a double conversion if the cell is re-run
# after a failure part-way through.
KM_TO_MILES = 0.621371

df = df.assign(
    **{
        "Temperature (F)": lambda frame: (frame["Temperature"] * 9/5) + 32,
        "Wind Speed": lambda frame: frame["Wind Speed"] * KM_TO_MILES,
    }
)[["Date", "Location", "Country", "Temperature (F)", "CO2 Emissions", "Sea Level Rise", "Precipitation", "Humidity", "Wind Speed"]]

In [17]:
# Rename to snake_case labels with the unit in parentheses. An explicit
# old -> new mapping ties every label to its source column, so a change in
# column order upstream cannot silently attach a label to the wrong data.
# Labels already renamed are simply skipped, so re-running the cell is harmless.
df = df.rename(
    columns={
        "Date": "date",
        "Location": "location",
        "Country": "country",
        "Temperature (F)": "temperature(F)",
        "CO2 Emissions": "co2_emissions(PPM)",
        "Sea Level Rise": "sea_level_rise(mm)",
        "Precipitation": "precipitation(mm)",
        "Humidity": "humidity(%)",
        "Wind Speed": "wind_speed(mi/h)",
    }
)

In [18]:
# Display the converted, renamed frame (the rendered output elides the middle rows).
df

Unnamed: 0,date,location,country,temperature(F),co2_emissions(PPM),sea_level_rise(mm),precipitation(mm),humidity(%),wind_speed(mi/h)
0,2000-01-01,New Williamtown,Latvia,51.240175,403.118903,0.717506,13.835237,23.631256,11.490409
1,2000-01-01,North Rachel,South Africa,56.865975,396.663499,1.205715,40.974084,43.982946,21.281522
2,2000-01-02,West Williamland,French Guiana,81.182692,451.553155,-0.160783,42.697931,96.652600,21.203826
3,2000-01-03,South David,Vietnam,54.157245,422.404983,-0.475931,5.193341,47.467938,5.315558
4,2000-01-04,New Scottburgh,Moldova,55.779593,410.472999,1.135757,78.695280,61.789672,4.971691
...,...,...,...,...,...,...,...,...,...
9995,2022-12-27,South Elaineberg,Bhutan,59.036941,391.379537,-1.452243,93.417109,25.293814,4.058712
9996,2022-12-28,Leblancville,Congo,62.190411,346.921190,0.543616,49.882947,96.787402,26.252312
9997,2022-12-29,West Stephanie,Argentina,72.266044,466.042136,1.026704,30.659841,15.211825,11.367179
9998,2022-12-30,Port Steven,Albania,66.975536,337.899776,-0.895329,18.932275,82.774520,26.361202


TODO: try bootstrapping — resample the CO2 emissions with replacement to estimate the bias and standard error of their median.

In [10]:
from sklearn.utils import resample

In [23]:
# Bootstrap the median CO2 level.
# Each replicate resamples the column (sklearn's `resample` draws with
# replacement, same size as the input by default) and records its median.
# The spread of those medians estimates the standard error of the sample
# median; their mean minus the original median estimates its bias.
N_BOOTSTRAP = 10000   # number of bootstrap replicates
BOOTSTRAP_SEED = 42   # fixed seed so the printed statistics are reproducible

# One shared RandomState instance: passing the same *integer* seed to every
# `resample` call would reproduce the identical sample on each iteration.
bootstrap_rng = np.random.RandomState(BOOTSTRAP_SEED)

co2 = df["co2_emissions(PPM)"]
co2_median = co2.median()
results = pd.Series(
    [resample(co2, random_state=bootstrap_rng).median() for _ in range(N_BOOTSTRAP)]
)

print("Bootstrap Statistics")
print(f'Original: {co2_median}')
print(f"bias: {results.mean() - co2_median}")
print(f"std. error: {results.std()}")

Bootstrap Statistics
Original: 400.82132446979676
bias: -0.026600222936565387
std. error: 0.6333969718528314
