In [None]:
# Libraries
import numpy as np
import matplotlib as plt
import streamlit as st
import pandas as pd
import altair as alt
import requests

In [2]:
############################
## EMISSIONS DATASET
############################

In [61]:
# Get data "Average CO2 emissions per km from new passenger cars" with API
url = "https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data/sdg_13_31/?format=JSON&lang=en"
    
# Request the data
response = requests.get(url)


In [64]:
# Parse the JSON response
data = response.json()
    
values = data['value']
geo_labels = data['dimension']['geo']['category']['label']
geo_indices = data['dimension']['geo']['category']['index']
time_labels = data['dimension']['time']['category']['label']
time_indices = data['dimension']['time']['category']['index']    
    
# Create a sorted list of full country names based on the index
sorted_countries = [geo_labels[code] for code in sorted(geo_indices, key=geo_indices.get)]
sorted_times = sorted(time_indices, key=time_indices.get)

# Create an empty DataFrame to store the data with full country names
df = pd.DataFrame(index=sorted_countries, columns=sorted_times)

# Populate the DataFrame with the values
for index, value in values.items():
# Since the index is a single key, we need to map it to the correct country and time
    country_idx = int(index) // len(time_indices)  # Determinec which country the value belongs to
    time_idx = int(index) % len(time_indices)      # Determine which time period the value belongs to

# Get the actual country name and time label
    country_code = list(geo_indices.keys())[country_idx]
    country = geo_labels[country_code]  # Use full country name
    time = time_labels[list(time_indices.keys())[time_idx]]

# Insert the value into the correct place in the DataFrame
    df.loc[country, time] = value    


In [65]:
# Exploring the data
print(df.head())  

print(type(df))
print(str(df))

print(df.describe())

print(df.columns)

                                            2000   2001   2002   2003   2004  \
European Union - 27 countries (from 2020)    NaN    NaN    NaN    NaN    NaN   
Belgium                                    166.5  163.7  161.1  158.1  156.5   
Bulgaria                                     NaN    NaN    NaN    NaN    NaN   
Czechia                                      NaN    NaN    NaN    NaN    154   
Denmark                                    175.7  172.9    170    169  165.9   

                                            2005   2006   2007   2008   2009  \
European Union - 27 countries (from 2020)    NaN    NaN  157.5  152.8    145   
Belgium                                    155.2  153.9  152.8  147.8  142.1   
Bulgaria                                     NaN    NaN  171.6  171.5  172.1   
Czechia                                    155.3  154.2  154.2  154.4  155.5   
Denmark                                    163.7  162.5  159.8  146.4  139.1   

                                      

In [66]:
# Melting the data such that the seperate column years will become one column: "year"
df_em = pd.melt(df.reset_index(), id_vars='index', value_vars= df.iloc[1:],
                                  var_name = 'Year', value_name = 'Emissions').rename(columns={'index': 'Country'})
df_em = df_em.sort_values(by=['Country', 'Year'])
print(df_em.head())

     Country  Year Emissions
20   Austria  2000       168
50   Austria  2001     165.6
80   Austria  2002     164.4
110  Austria  2003     163.8
140  Austria  2004     161.9


In [None]:
# Exploring the trend of emissions over the years for each country

# Line plots
lst = df_em['Country'].unique()
for ls in lst:
    df_em_a = df_em[df_em['Country'] == ls]
    plt.plot('Year','Emissions',data=df_em_a)
    plt.title(f'Emissions {ls}')
    plt.xticks(rotation=75)
    plt.show()

# Bar plots
lst = df_em['Country'].unique()
for ls in lst:
    df_em_b = df_em[df_em['Country'] == ls]
    plt.bar('Year','Emissions',data=df_em_b)
    plt.title(f'Emissions {ls}')
    plt.xticks(rotation=75)
    plt.show()
    
# A peculiar trend was shown around the year 2018 for each country that had data from before and after 2018. When analysing the metadata further, it appeared a new 
# measuring system was used such that only the years 2000-2016 are comparable with each other and the years 2017 and up are comparable with each other.
# To obtain reliable insights regarding the trend, later on the data will be split for these two time periods.

In [None]:
lst = df_em['Country'].unique()
input_dropdown = alt.binding_select(options=lst, name='Country ')
selection = alt.selection_point(fields=['Country'], bind=input_dropdown)

# Trying out a interactive bar chart for just emissions
bar_chart = alt.Chart(df_em).mark_bar().encode(
    x='Year:O', 
    y='Emissions:Q',
    tooltip=['Year', 'Emissions']
).add_params(
    selection
).transform_filter(
    selection
).properties(
    title=(f'Average C02 emissions per year for selected country'),
    width=400,
    height=450
)    
    
bar_chart   

In [7]:
############################
## NEW CARS DATASET
############################

In [None]:
df2 = pd.read_csv("D:/Universiteit Utrecht - DSS/cleaned_NoEVS_data.csv")

print(df2)



                   Country    2014    2015    2016    2017     2018     2019  \
0           European Union  30.892  46.641  53.544  83.778  134.506  250.082   
1                  Belgium   1.167   1.360   2.061   2.717    3.763    8.892   
2                 Bulgaria       0      10      11      13      145      188   
3                  Czechia      58     312     210     376      731      778   
4                  Denmark   1.564   1.265   1.265     691    1.544    5.501   
5                  Germany   8.522  12.363  11.410  25.056   36.062   63.281   
6                  Estonia     331      30      32      26       83       77   
7                  Ireland     194     494     402     631    1.252    3.665   
8                   Greece      38      32      20      37       77      188   
9                    Spain   1.004   2.620   2.085   4.038    6.238   10.415   
10                  France  10.567  17.269  21.758  25.368   31.687   43.564   
11                 Croatia      47      

In [None]:
# Melting the data such that the seperate column years will become one column: "year"
df_EV = pd.melt(df2, id_vars='Country', value_vars= df2.iloc[1:],
                                  var_name = 'Year', value_name = 'Nr_of_new_EVs')
df_EV = df_EV.sort_values(by=['Country', 'Year'])
print(df_EV.head())
print(df_EV.tail())

     Country  Year Nr_of_new_EVs
37   Albania  2014             :
79   Albania  2015             :
121  Albania  2016             :
163  Albania  2017             :
205  Albania  2018             :
            Country  Year Nr_of_new_EVs
242  United Kingdom  2019             :
284  United Kingdom  2020             :
326  United Kingdom  2021       188.143
368  United Kingdom  2022       263.197
410  United Kingdom  2023             :


In [16]:
############################
## BOTH DATASETS
############################

In [122]:
# Left joining the data on emissions per country per year and data on new EVs and total new cars per country per year
df_all = df_em.merge(df_EV, how='left', on=['Country', 'Year'])
df_all

Unnamed: 0,Country,Year,Emissions,Nr_of_new_EVs
0,Austria,2000,168,
1,Austria,2001,165.6,
2,Austria,2002,164.4,
3,Austria,2003,163.8,
4,Austria,2004,161.9,
...,...,...,...,...
715,Sweden,2019,145.1,15.795
716,Sweden,2020,111.7,27.981
717,Sweden,2021,88.3,57.590
718,Sweden,2022,66.6,95.371


In [None]:
# Checking data types of dataframe
print(df_all.dtypes)

# Turning year into an integer for easier subsetting later on
df_all['Year'] = df_all['Year'].astype('int')
print(np.dtype(df_all['Year']))

# Turning Nr_of_new_EVs into an integer for easier working with the data
df_all['Nr_of_new_EVs'] = df_all['Nr_of_new_EVs'].astype('str').str.replace(".", "") # Turning it into a string and removing the thousands seperators
df_all['Nr_of_new_EVs'] = df_all['Nr_of_new_EVs'].replace(":", np.nan).astype('float').astype('Int64') # due to NaN's, turning the variable first into floats necessary
print(df_all.dtypes)

print(df_all)

Country          object
Year             object
Emissions        object
Nr_of_new_EVs    object
dtype: object
int64


In [None]:
###############################################
## VERY PROFESSIONAL TRYOUT REMOVING NA'S
################################################



# We will not replace the NA's with 0; in the plots, Nr of new EVs will simply start later on. If there is a missing value for a year while the year before and after 
# did have a value, the line will be consistent.

#n = 1     # check previous and next (1) entry

# rolling window size is (2n + 1)
#try_out = (df_all['Nr_of_new_EVs'].rolling(n * 2 + 1, min_periods=1, center=True)
#                                  .mean())

# Update into a new column `Consumption_New` for demo purpose
#df_all['Nr_of_new_EVs_New'] = df_all['Nr_of_new_EVs']    
#df_all.loc[df_all['Nr_of_new_EVs'] == 0, 'Nr_of_new_EVs_New'] = Consumption_mean

#df_all

Unnamed: 0,Country,Year,Emissions,Nr_of_new_EVs,Nr_of_new_EVs_New
0,Austria,2000,168,,
1,Austria,2001,165.6,,
2,Austria,2002,164.4,,
3,Austria,2003,163.8,,
4,Austria,2004,161.9,,
...,...,...,...,...,...
715,Sweden,2019,145.1,15.795,15.795
716,Sweden,2020,111.7,27.981,27.981
717,Sweden,2021,88.3,57.590,57.590
718,Sweden,2022,66.6,95.371,95.371


In [126]:
# Cutting the years at 2017, as the metadata states that data is only comparable for 2000 - 2016 and for 2017 - now

# Subsetting data for years 2000 - 2016
df_00_16 = df_all[df_all['Year'] <= 2016]
print(df_00_16.head)

# Subsetting data for years 2017 - now
df_17_up = df_all[df_all['Year'] >= 2017]
print(df_17_up.head)

<bound method NDFrame.head of      Country  Year Emissions  Nr_of_new_EVs
0    Austria  2000       168           <NA>
1    Austria  2001     165.6           <NA>
2    Austria  2002     164.4           <NA>
3    Austria  2003     163.8           <NA>
4    Austria  2004     161.9           <NA>
..       ...   ...       ...            ...
708   Sweden  2012     135.9           <NA>
709   Sweden  2013     133.2           <NA>
710   Sweden  2014       131           1266
711   Sweden  2015     126.3           2916
712   Sweden  2016     123.1           2993

[510 rows x 4 columns]>
<bound method NDFrame.head of      Country  Year Emissions  Nr_of_new_EVs
17   Austria  2017     146.3           5433
18   Austria  2018     149.1           6757
19   Austria  2019     152.1           9242
20   Austria  2020     135.7          15972
21   Austria  2021     116.1          33366
..       ...   ...       ...            ...
715   Sweden  2019     145.1          15795
716   Sweden  2020     111.7       

In [19]:
############################
## PLOTS PLOTS PLOTS
############################

In [None]:
# Dropdown for price charts
chart_option = st.selectbox('Choose a Period of Time', [
    '2000 - 2016',
    '2017 and up'
     ])

if chart_option == '2000 - 2016':
    base = alt.Chart(df_00_16).encode(
        alt.X('Year:O').title('Year'))

    bar_chart2 = base.mark_bar().encode(
        alt.Y('Emissions:Q').title('Emissions'),
        tooltip=['Year', 'Emissions']
    ).add_params(
        selection
    ).transform_filter(
        selection
    ).properties(
        title=(f'Average C02 emissions per km from new passenger cars for selected country'),
        width=400,
        height=450
    )    

    line_chart = base.mark_line(stroke='#57A44C', interpolate='monotone').encode(
        alt.Y('Nr_of_new_EVs').title('Nr of new EVs'),
        tooltip=['Year', 'Nr_of_new_EVs']
    ).add_params(
        selection
    ).transform_filter(
        selection
    ).properties(
        width=400,
        height=450
    )    
        
    full_chart = alt.layer(bar_chart2, line_chart).resolve_scale(
        y='independent'
    )
        
    full_chart

In [None]:
if chart_option == '2017 and up':
    base2 = alt.Chart(df_17_up).encode(
        alt.X('Year:O').title('Year'))

    bar_chart3 = base2.mark_bar().encode(
        alt.Y('Emissions:Q').title('Emissions'),
        tooltip=['Year', 'Emissions']
    ).add_params(
        selection
    ).transform_filter(
        selection
    ).properties(
        title=(f'Average C02 emissions per km from new passenger cars for selected country'),
        width=400,
        height=450
    )    

    line_chart2 = base2.mark_line(stroke='#57A44C', interpolate='monotone').encode(
        alt.Y('Nr_of_new_EVs').title('Nr of new EVs'),
        tooltip=['Year', 'Nr_of_new_EVs']
    ).add_params(
        selection
    ).transform_filter(
        selection
    ).properties(
        width=400,
        height=450
    )    
        
    full_chart2 = alt.layer(bar_chart3, line_chart2).resolve_scale(
        y='independent'
    )
        
    full_chart2