In [1]:
# Load the raw global-warming dataset and print its (rows, columns) shape.
import pandas as pd
df = pd.read_csv("global_warming_dataset.csv")
print (df.shape)

(100000, 26)


In [2]:
# Summarise column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 26 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Country                        100000 non-null  object 
 1   Year                           100000 non-null  int64  
 2   Temperature_Anomaly            100000 non-null  float64
 3   CO2_Emissions                  100000 non-null  float64
 4   Population                     100000 non-null  float64
 5   Forest_Area                    100000 non-null  float64
 6   GDP                            100000 non-null  float64
 7   Renewable_Energy_Usage         100000 non-null  float64
 8   Methane_Emissions              100000 non-null  float64
 9   Sea_Level_Rise                 100000 non-null  float64
 10  Arctic_Ice_Extent              100000 non-null  float64
 11  Urbanization                   100000 non-null  float64
 12  Deforestation_Rate             

In [9]:
# "Global Warming"

# Business : Where a renewable energy company could easily be adopted and is needed

## Hypothesis: Countries with high GDP growth also show an increase in CO2 emissions, especially where renewable energy adoption is low
### (GDP, CO2_Emissions, Renewable_Energy_Usage) 

# Business : Reforestation -> See where carbon offsetting via afforestation is most urgent and has the most impact

## Hypothesis: Countries with a decrease in forest area since 2000 have higher CO2 emissions per capita than countries with a stable or increasing forest area
### ( Country, Forest_Area, Deforestation_Rate , CO2_Emissions, Year, Temperature_Anomaly)
#### Countries with greater forest area experience smaller temperature anomalies over time compared to countries with less forest coverage, and succeed in reducing CO2 emissions.
### (Country, Forest_Area, Temperature_Anomaly, Year, CO2_Emissions)
#### Countries with higher forest area growth combined with strong environmental policies (Policy_Score) show better outcomes in Renewable Energy Usage and lower Air Pollution Index.
### (Country, Forest_Area, Policy_Score, Renewable_Energy_Usage, Air_Pollution_Index)
#### Countries with higher GDP are more likely to maintain or expand forest area, especially when coupled with strong environmental policy scores
### (Country, GDP, Forest_Area, Deforestation_Rate, Policy_Score)
# Business : Help insurers, policy advisors or investment companies -> Help identify climate-vulnerable economies that might need help because of those conditions

## Countries with a lot of temperature anomalies also have a stagnation in GDP per capita in recent years
### (Country, Temperature_Anomaly, GDP, Year)

In [12]:
# Number of rows recorded for each distinct value of "Country".
df["Country"].value_counts()

Country
Country_186    557
Country_48     552
Country_30     551
Country_143    551
Country_154    551
              ... 
Country_193    467
Country_165    465
Country_174    464
Country_50     463
Country_34     457
Name: count, Length: 195, dtype: int64

In [13]:
# Number of rows recorded for each distinct value of "Year".
df["Year"].value_counts()

Year
1961    881
1922    869
1957    862
1960    858
1986    857
       ... 
1969    758
1929    757
2023    751
1962    745
1953    732
Name: count, Length: 124, dtype: int64

In [14]:
# Number of distinct years present for each country.
df.groupby("Country")["Year"].nunique()


Country
Country_1      124
Country_10     122
Country_100    123
Country_101    121
Country_102    124
              ... 
Country_95     123
Country_96     122
Country_97     122
Country_98     119
Country_99     120
Name: Year, Length: 195, dtype: int64

In [23]:
# Main challenge: the same (Country, Year) pair appears on several rows whose
# values differ.
# Working assumption (not verified): the rows come from different sources, so
# each row for a given year is one source's observation.
# Count the rows per (Country, Year) pair and keep only the pairs that occur
# more than once.
repeats = df.groupby(['Country', 'Year']).size()
repeated_years = repeats.loc[repeats.gt(1)].reset_index(name='Count')


In [18]:
# Show the (Country, Year) pairs that occur more than once, with their row counts.
print(repeated_years)

          Country  Year  Count
0       Country_1  1900      4
1       Country_1  1901      2
2       Country_1  1902      3
3       Country_1  1903      4
4       Country_1  1904      3
...           ...   ...    ...
22158  Country_99  2017      2
22159  Country_99  2018      4
22160  Country_99  2021      4
22161  Country_99  2022      5
22162  Country_99  2023      3

[22163 rows x 3 columns]


In [29]:
# Every numeric column except "Year", which is used as a grouping key below.
numerical_features = df.select_dtypes(include="number").columns.drop("Year")

# Collapse the repeated (Year, Country) rows into one row holding the mean of
# each numeric column, ordered from the most recent year to the oldest.
df1 = (
    df.groupby(["Year", "Country"])[numerical_features]
    .mean()
    .reset_index()
    .sort_values(by="Year", ascending=False)
    .reset_index(drop=True)
)

In [32]:
# Check the type of the aggregation result.
type (df1)

pandas.core.frame.DataFrame

In [33]:
# Summarise column names, non-null counts and dtypes of the aggregated frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23797 entries, 0 to 23796
Data columns (total 26 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Year                           23797 non-null  int64  
 1   Country                        23797 non-null  object 
 2   Temperature_Anomaly            23797 non-null  float64
 3   CO2_Emissions                  23797 non-null  float64
 4   Population                     23797 non-null  float64
 5   Forest_Area                    23797 non-null  float64
 6   GDP                            23797 non-null  float64
 7   Renewable_Energy_Usage         23797 non-null  float64
 8   Methane_Emissions              23797 non-null  float64
 9   Sea_Level_Rise                 23797 non-null  float64
 10  Arctic_Ice_Extent              23797 non-null  float64
 11  Urbanization                   23797 non-null  float64
 12  Deforestation_Rate             23797 non-null 

In [36]:
# Number of distinct countries in the aggregated frame.
df1["Country"].nunique()

195

In [37]:
# Load the second dataset (GDP / CO2 by country) that df1 is compared against.
df_real = pd.read_csv("gdp_co2_by_country_v2.csv")

In [38]:
# Summarise column names, non-null counts and dtypes of the second dataset.
df_real.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12444 entries, 0 to 12443
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Country Name         12444 non-null  object 
 1   Country Code         12444 non-null  object 
 2   Year                 12444 non-null  int64  
 3   Population           12444 non-null  float64
 4   Pop Log              12444 non-null  float64
 5   Pop Outliers         12444 non-null  object 
 6   Pop Category         12444 non-null  object 
 7   CO2                  12444 non-null  float64
 8   CO2 %                12238 non-null  float64
 9   Cumulative CO2       12444 non-null  float64
 10  CO2 Log              12444 non-null  float64
 11  CO2 Outliers         12444 non-null  object 
 12  Emissions Category   12444 non-null  object 
 13  GDP USD              12444 non-null  float64
 14  GDP USD Log          12444 non-null  float64
 15  GDP %                12444 non-null 

In [39]:
# NOTE(review): first attempt at scaling both frames with one MinMaxScaler; the
# recorded run failed with ModuleNotFoundError because scikit-learn could not
# be imported by the kernel.
# NOTE(review): `df_fake` is not defined in any visible cell (the aggregated
# frame is named `df1`) — TODO confirm, then rename or delete this cell.
from sklearn.preprocessing import MinMaxScaler

# Columns to scale, spelled the way df_real names them.
features = ['Population', 'GDP USD', 'CO2']

scaler = MinMaxScaler()

df_fake_scaled = df_fake.copy()
df_real_scaled = df_real.copy()

# Fit the scaler on df_fake, then apply the same fitted scaler to df_real.
df_fake_scaled[features] = scaler.fit_transform(df_fake[features])
df_real_scaled[features] = scaler.transform(df_real[features])


ModuleNotFoundError: No module named 'sklearn'

In [47]:
!pip install scikit-learn




In [48]:
# NOTE(review): repeated scaling attempt; the recorded run failed with
# ModuleNotFoundError because scikit-learn could not be imported by the kernel.
# NOTE(review): `df_fake` is not defined in any visible cell (the aggregated
# frame is named `df1`) — TODO confirm, then rename or delete this cell.
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): this list mixes naming schemes — the info() listings show 'GDP'
# in the raw frame and 'CO2' in df_real — so it would need reconciling.
features = ['Population', 'GDP', 'CO2']

scaler = MinMaxScaler()

df_fake_scaled = df_fake.copy()
df_real_scaled = df_real.copy()

# Fit the scaler on df_fake, then apply the same fitted scaler to df_real.
df_fake_scaled[features] = scaler.fit_transform(df_fake[features])
df_real_scaled[features] = scaler.transform(df_real[features])


ModuleNotFoundError: No module named 'sklearn'

In [45]:
# Print the path of the Python interpreter running this kernel, to see which
# environment packages have to be installed into.
import sys
print(sys.executable)


/Users/souadmouajel/Desktop/Ironhack/lab-sessions/week-4/project-week4/proj-DA-midterm/.venv/bin/python


In [46]:
# NOTE(review): repeated scaling attempt; the recorded run failed with
# ModuleNotFoundError because scikit-learn could not be imported by the kernel.
# NOTE(review): `df_fake` is not defined in any visible cell (the aggregated
# frame is named `df1`) — TODO confirm, then rename or delete this cell.
from sklearn.preprocessing import MinMaxScaler

# Columns to scale, spelled the way df_real names them.
features = ['Population', 'GDP USD', 'CO2']

scaler = MinMaxScaler()

df_fake_scaled = df_fake.copy()
df_real_scaled = df_real.copy()

# Fit the scaler on df_fake, then apply the same fitted scaler to df_real.
df_fake_scaled[features] = scaler.fit_transform(df_fake[features])
df_real_scaled[features] = scaler.transform(df_real[features])


ModuleNotFoundError: No module named 'sklearn'

In [49]:
# Check whether scikit-learn can be imported by this kernel and which version it is.
import sklearn
print(sklearn.__version__)  # NOTE(review): expected version unclear — no version is pinned in the visible install commands

ModuleNotFoundError: No module named 'sklearn'

In [50]:
# Run pip through the interpreter reported by sys.executable so the package is
# installed into that interpreter's environment.
!{sys.executable} -m pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.0-cp312-cp312-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.0-cp312-cp312-macosx_12_0_arm64.whl (10.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading joblib-1.5.1-py3-none-any.whl (307 kB)
Downloading scipy-1.16.0-cp312-cp312-macosx_14_0_arm64.whl (20.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.8/20.8 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Install

In [51]:
# Confirm scikit-learn is now importable and report its version.
import sklearn
print(sklearn.__version__)  # the recorded run printed 1.7.0; the install is unpinned, so this may differ

1.7.0


In [54]:
# NOTE(review): this attempt uses df1's column names for both frames; the
# recorded run raised KeyError because 'GDP' and 'CO2_Emissions' were not
# columns of df_real at that point.
from sklearn.preprocessing import MinMaxScaler

features = ['Population', 'GDP', 'CO2_Emissions']

scaler = MinMaxScaler()

df_fake_scaled = df1.copy()
df_real_scaled = df_real.copy()

# Fit the scaler on df1, then apply the same fitted scaler to df_real.
df_fake_scaled[features] = scaler.fit_transform(df1[features])
df_real_scaled[features] = scaler.transform(df_real[features])

KeyError: "['GDP', 'CO2_Emissions'] not in index"

In [62]:
# Give df_real the same column names that df1 uses, so both frames can be
# addressed through one shared list of feature columns.
df_real = df_real.rename(columns={
    'GDP USD': 'GDP',
    'CO2': 'CO2_Emissions',
    'Country Name': 'Country',
})


In [63]:
from sklearn.preprocessing import MinMaxScaler

# Columns both frames are compared on.
features = ['Population', 'GDP', 'CO2_Emissions']

# Learn the per-column min/max from df1 only; the same fitted scaler is then
# applied to both frames.
scaler = MinMaxScaler()
scaler.fit(df1[features])

df_fake_scaled = df1.copy()
df_fake_scaled[features] = scaler.transform(df1[features])

df_real_scaled = df_real.copy()
df_real_scaled[features] = scaler.transform(df_real[features])

In [64]:
from scipy.spatial.distance import cdist

# For every year present in df_fake_scaled, pair each of its rows with the
# df_real_scaled row of the same year that is nearest in scaled feature space.
matches = []

for yr in df_fake_scaled['Year'].unique():
    fake_rows = df_fake_scaled['Year'] == yr
    real_rows = df_real_scaled['Year'] == yr

    # Skip years for which df_real_scaled has no rows.
    if not real_rows.any():
        continue

    # pairwise[i, j] is the distance between the i-th df_fake_scaled row and
    # the j-th df_real_scaled row of this year.
    pairwise = cdist(
        df_fake_scaled.loc[fake_rows, features],
        df_real_scaled.loc[real_rows, features],
    )
    nearest = pairwise.argmin(axis=1)

    # Read the winning country names and the output rows from the unscaled
    # frames so the result keeps the original values.
    nearest_countries = (
        df_real.loc[df_real['Year'] == yr, 'Country']
        .iloc[nearest]
        .reset_index(drop=True)
    )
    year_matches = (
        df1.loc[df1['Year'] == yr]
        .reset_index(drop=True)
        .assign(matched_country=nearest_countries)
    )
    matches.append(year_matches)

df_matched = pd.concat(matches, ignore_index=True)


In [65]:
# Preview the first matched rows.
df_matched.head()


Unnamed: 0,Year,Country,Temperature_Anomaly,CO2_Emissions,Population,Forest_Area,GDP,Renewable_Energy_Usage,Methane_Emissions,Sea_Level_Rise,...,Per_Capita_Emissions,Industrial_Activity,Air_Pollution_Index,Biodiversity_Index,Ocean_Acidification,Fossil_Fuel_Usage,Energy_Consumption_Per_Capita,Policy_Score,Average_Temperature,matched_country
0,2023,Country_99,0.401818,471812000.0,841657100.0,36.319203,5428063000000.0,42.954238,3281277.0,34.524529,...,15.085347,41.333634,200.324352,63.079314,7.843562,53.036316,2908.009948,64.82664,12.23806,United States
1,2023,Country_188,-0.57091,313596000.0,1239277000.0,30.800427,6193505000000.0,45.150983,5168892.0,26.538891,...,13.49867,54.049776,66.762735,77.087944,7.724294,46.290306,1831.464717,49.478209,13.238444,China
2,2023,Country_163,0.677952,542612600.0,752789600.0,59.404157,5607864000000.0,33.354475,3167448.0,10.025409,...,6.712261,76.799132,164.705112,62.712155,7.949439,37.29235,2903.212619,29.164099,19.949168,United States
3,2023,Country_162,0.21178,363614500.0,711769300.0,59.886803,4448401000000.0,63.694009,3445078.0,21.418411,...,13.54851,42.552386,162.115286,39.307273,8.040317,45.438518,1800.402082,30.819906,9.111106,United States
4,2023,Country_161,0.648779,594051600.0,196764600.0,66.845488,7657033000000.0,40.424885,5684342.0,31.896154,...,13.864168,60.445563,95.087722,63.356859,8.13195,23.265742,2075.861666,81.462259,18.894343,Brazil


In [67]:
# Share of rows whose "Country" value equals its "matched_country" value.
# NOTE(review): in the recorded preview "Country" holds placeholders such as
# "Country_99" while "matched_country" holds names such as "United States", so
# the two columns never coincide there and the recorded result is 0.0 —
# TODO replace with a check that actually measures match quality.
(df_matched['Country'] == df_matched['matched_country']).mean()


np.float64(0.0)