In [1]:
!pip install altair
import pandas as pd
import altair as alt



## 1) Reproduction of the graph on the left

### Step 1 : create a base plot with the "recon" file 

First we will handle the 'SPM1_1-2000_recon.csv'. 
- we can see that the first 19 rows contain a description of the data, but we are only interested in the rows that follow, which contain the actual data. 
- there are 4 columns : 
    - the year YYYY, type int 
    - the surface temperature anomaly in degrees Celsius, type float, relative to the 1850-1900 reference period
    - the 5th percentile :  the lower limit of the 90% uncertainty interval (celsius).
    - the 95th percentile : the upper limit of the 90% uncertainty interval (Celsius).

In [2]:
# Load the reconstructed series, skipping the 19 descriptive lines at the top of the file.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
recon_csv_path = '/Users/ivanleboucher/Desktop/Cours/data_viz/assessment_IPCC/SPM1_1-2000_recon.csv'
df_spm1_recon = pd.read_csv(recon_csv_path, encoding='latin1', skiprows=19)
df_spm1_recon

Unnamed: 0,1,2,3,4,Unnamed: 4
0,1995,0.65,0.46,0.79,
1,1994,0.64,0.46,0.77,
2,1993,0.62,0.45,0.76,
3,1992,0.61,0.45,0.74,
4,1991,0.59,0.44,0.71,
...,...,...,...,...,...
1986,9,0.10,-0.11,0.38,
1987,8,0.10,-0.11,0.38,
1988,7,0.09,-0.11,0.38,
1989,6,0.09,-0.12,0.37,


In [3]:
# Give the numbered CSV columns descriptive names and discard the empty trailing column.
recon_column_names = {
    "1": "Year",
    "2": "surface temperature anomaly",
    "3": "5th percentile",
    "4": "95th percentile",
}

# This cell overwrites its own input. `rename` silently skips labels that are already
# renamed, and errors="ignore" makes `drop` do the same for a column that is already
# gone, so re-running the cell no longer raises a KeyError.
df_spm1_recon = (
    df_spm1_recon
    .rename(columns=recon_column_names)
    .drop(columns=["Unnamed: 4"], errors="ignore")
)
df_spm1_recon

Unnamed: 0,Year,surface temperature anomaly,5th percentile,95th percentile
0,1995,0.65,0.46,0.79
1,1994,0.64,0.46,0.77
2,1993,0.62,0.45,0.76
3,1992,0.61,0.45,0.74
4,1991,0.59,0.44,0.71
...,...,...,...,...
1986,9,0.10,-0.11,0.38
1987,8,0.10,-0.11,0.38
1988,7,0.09,-0.11,0.38
1989,6,0.09,-0.12,0.37


In [4]:
# Quick preview of the reconstructed series, before any axis customisation.
alt.Chart(df_spm1_recon).mark_line().encode(
    alt.X('Year'),
    alt.Y('surface temperature anomaly'),
)


Several changes need to be made to the axes: 
- the x axis has to go from 1 to 2020 with values [1, 500, 1000, 1500, 1850, 2020]
- the y axis has to go from -1 to 2 with values [-1, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0]

In [5]:
# X axis of panel (a): integer-formatted years at hand-picked tick positions, no grid, no title.
recon_x_axis = alt.Axis(
    tickCount=6,
    format='d',
    values=[1, 500, 1000, 1500, 1850, 2020],
    grid=False,
    title=None,
)

# Y axis of panel (a): ticks every 0.5 from -1 to 2, with a horizontal '°C' title
# positioned explicitly through titleX / titleY.
recon_y_axis = alt.Axis(
    tickCount=7,
    values=[-1, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0],
    title='°C',
    titleY=-10,
    titleX=-10,
    titleAngle=0,
)

# Reconstructed temperature anomaly as a thin line on fixed x and y domains.
base = (
    alt.Chart(df_spm1_recon)
    .mark_line(strokeWidth=1)
    .encode(
        alt.X('Year:Q', scale=alt.Scale(domain=[1, 2020]), axis=recon_x_axis),
        alt.Y('surface temperature anomaly:Q', scale=alt.Scale(domain=[-1, 2]), axis=recon_y_axis),
    )
    .properties(width=420, height=450)
)

base

### Step 2 : Add the percentile band around the curve 

In [6]:
# Reuse the base chart (data, x encoding, size) but draw a translucent grey area
# spanning the 5th to the 95th percentile instead of the anomaly line.
percentile_band = (
    base
    .mark_area(color="gray", opacity=0.4)
    .encode(
        alt.Y("5th percentile:Q"),
        alt.Y2("95th percentile:Q"),
    )
)

base + percentile_band

### Step 3 : Add the observed data and the grey rectangle behind 

Now that we have the correct structure for the graph, we can add the observed value and the percentile columns. 

In [7]:
# Load the observed series, skipping the 15 lines at the top of the file.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
obs_csv_path = "/Users/ivanleboucher/Desktop/Cours/data_viz/assessment_IPCC/SPM1_1850-2020_obs.csv"
df_spm1_obs = pd.read_csv(obs_csv_path, encoding="latin1", skiprows=15)
df_spm1_obs

Unnamed: 0,1,2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,2016,1.09,,,
1,2015,1.06,,,
2,2014,1.03,,,
3,2013,1.00,,,
4,2012,0.98,,,
...,...,...,...,...,...
157,1859,-0.04,,,
158,1858,-0.03,,,
159,1857,-0.01,,,
160,1856,0.00,,,


In [8]:
# Name the two data columns and discard the three empty trailing columns.
obs_column_names = {
    "1": "Year",
    "2": "Global surface temperature observed",
}
obs_empty_columns = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]

# This cell overwrites its own input. `rename` silently skips labels that are already
# renamed, and errors="ignore" makes `drop` do the same for columns that are already
# gone, so re-running the cell no longer raises a KeyError.
df_spm1_obs = (
    df_spm1_obs
    .rename(columns=obs_column_names)
    .drop(columns=obs_empty_columns, errors="ignore")
)
df_spm1_obs



Unnamed: 0,Year,Global surface temperature observed
0,2016,1.09
1,2015,1.06
2,2014,1.03
3,2013,1.00
4,2012,0.98
...,...,...
157,1859,-0.04
158,1858,-0.03
159,1857,-0.01
160,1856,0.00


In [9]:
# Observed temperature series drawn as a thin black line.
observed = (
    alt.Chart(df_spm1_obs)
    .mark_line(color='black', strokeWidth=1)
    .encode(
        alt.X('Year:Q'),
        alt.Y('Global surface temperature observed:Q'),
    )
)

# Single-row frame describing a rectangle from x=1850 to x=2020 over the full
# y range (-1 to 2); rendered as a faint grey backdrop.
observed_period = pd.DataFrame({'x': [1850], 'y': [-1], 'x2': [2020], 'y2': [2]})
rectangle = (
    alt.Chart(observed_period)
    .mark_rect(fill='gray', opacity=0.1)
    .encode(
        alt.X('x:Q'),
        alt.X2('x2:Q'),
        alt.Y('y:Q'),
        alt.Y2('y2:Q'),
    )
)

base + percentile_band + observed + rectangle


### Step 4 : Add legends and comments 

In [10]:
# Free-text annotations: each row is one line of text placed at data coordinates (x, y).
# The two sentences are each split over two rows, stacked 0.1 apart on the y axis.
annotation_rows = pd.DataFrame({
    "text": [
        "Warming is unprecedented",
        "in more than 2000 years",
        "Warmest multi-century period",
        "in more than 100,000 years",
    ],
    "x": [1650, 1650, 450, 450],
    "y": [1.5, 1.4, 1.2, 1.1],
})

comments = (
    alt.Chart(annotation_rows)
    .mark_text(fontSize=12)
    .encode(x="x:Q", y="y:Q", text="text:N")
)

base + percentile_band + observed + rectangle + comments


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [11]:
# Rectangle on the left side of the plot, spanning y = 0.2 to 1.0.
# Its x extent is given in data units (-400 to -350), below the x domain used by `base`.
scale_rect_rows = pd.DataFrame({
    "y1": [0.2], "y2": [1.0], "x": [-400], "x2": [-350]
})
scale_rect = (
    alt.Chart(scale_rect_rows)
    .mark_rect(color="gray", opacity=0.3)
    .encode(y="y1:Q", y2="y2:Q", x="x:Q", x2="x2:Q")
)

# Vertical rule on the left side of the plot, covering y = -1 to 2.
# Its horizontal position is the fixed value -100 (alt.value), not the "x" data column.
scale_rule_rows = pd.DataFrame({
    "y1": [-1], "y2": [2.0], "x": [-100]
})
scale_rule = (
    alt.Chart(scale_rule_rows)
    .mark_rule(color="grey")
    .encode(y="y1:Q", y2="y2:Q", x=alt.value(-100))
)

# Tick marks along the rule, at the same fixed horizontal position.
tick_rows = pd.DataFrame({
    "y": [-1, -0.5, 0.0, 0.2, 0.5, 1.0, 1.5, 2.0]
})
ticks = (
    alt.Chart(tick_rows)
    .mark_tick(color="grey", size=10)
    .encode(y="y:Q", x=alt.value(-100))
)

# Text labels for two specific ticks (0.2 and 1.0), positioned with a fixed x value
# plus the dx/dy offsets; the "x" data column is not referenced by the encoding.
label_rows = pd.DataFrame({
    "y": [0.2, 1.0], "x": [1, 1], "label": ["0.2", "1.0"]
})
labels = (
    alt.Chart(label_rows)
    .mark_text(align='left', dx=-95, dy=-5, color="grey")
    .encode(y="y:Q", x=alt.value(1), text="label")
)

# Bold black annotation naming the observed curve.
observed_label_rows = pd.DataFrame({
    "y": [0.6], "x": [1800], "text": ["observed"]
})
scale_text_obs = (
    alt.Chart(observed_label_rows)
    .mark_text(fontSize=12, color="black", fontWeight='bold')
    .encode(y="y:Q", x="x:Q", text="text:N")
)

# Bold blue annotation naming the reconstructed curve.
reconstructed_label_rows = pd.DataFrame({
    "y": [-0.25], "x": [1000], "text": ["reconstructed"]
})
scale_text_rec = (
    alt.Chart(reconstructed_label_rows)
    .mark_text(fontSize=12, color="#1f77b4", fontWeight='bold')
    .encode(y="y:Q", x="x:Q", text="text:N")
)


In [12]:
# Layer every piece of panel (a); the order of the sum is the order of the layers.
chart_complete = (
    base
    + percentile_band
    + observed
    + rectangle
    + labels
    + scale_rect
    + scale_rule
    + ticks
    + scale_text_obs
    + scale_text_rec
    + comments
)

### Final plot 

In [13]:
# Two-line title for panel (a), anchored at the start of the chart.
panel_a_title = alt.TitleParams(
    text=[
        "(a) Change in global surface temperature (decadal average)",
        "as reconstructed (1–2000) and observed (1850–2020)",
    ],
    anchor="start",
    fontSize=12,
)
left_chart_combined = chart_complete.properties(title=panel_a_title)

left_chart_combined

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


## 2) Reproduction of the graph on the right


### Step 1 : extract the data 

When we look at the CSV file, we can see that the first 36 rows are descriptive text, so we can skip them. 
We can also give a convenient name to the column. 

In [14]:
# Load the panel (b) data, skipping the 36 descriptive lines at the top of the file.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
panel_b_csv_path = "/Users/ivanleboucher/Desktop/Cours/data_viz/assessment_IPCC/gmst_changes_model_and_obs.csv"
df_panel_b = pd.read_csv(panel_b_csv_path, skiprows=36, encoding="latin1")
df_panel_b

Unnamed: 0,1,2,3,4,5,6,7,8,Unnamed: 8,Unnamed: 9,...,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20
0,1850,-4.758907e-02,-3.548055e-01,1.897134e-01,1.460171e-02,-1.611949e-01,1.961983e-01,-0.059,,,...,,,,,,,,,,
1,1851,-5.876090e-02,-3.134450e-01,1.492833e-01,2.419514e-02,-1.329608e-01,1.799279e-01,0.049,,,...,,,,,,,,,,
2,1852,-2.935225e-03,-1.744060e-01,1.983925e-01,5.255810e-02,-2.026655e-01,2.217057e-01,0.079,,,...,,,,,,,,,,
3,1853,-5.929254e-03,-2.148021e-01,2.176045e-01,2.940635e-02,-2.664158e-01,1.789874e-01,0.046,,,...,,,,,,,,,,
4,1854,-2.661051e-02,-2.244470e-01,2.214103e-01,-9.960666e-03,-2.558413e-01,1.524845e-01,0.049,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167,2017,1.100092e+00,7.276342e-01,1.781020e+00,4.710649e-02,-2.014902e-01,3.191186e-01,1.179,,,...,,,,,,,,,,
168,2018,1.170450e+00,8.069424e-01,1.872942e+00,4.418525e-02,-2.225281e-01,3.838217e-01,1.101,,,...,,,,,,,,,,
169,2019,1.189952e+00,7.382615e-01,1.870786e+00,4.938523e-02,-1.363197e-01,3.065375e-01,1.226,,,...,,,,,,,,,,
170,2020,1.000000e+20,1.000000e+20,1.000000e+20,1.000000e+20,1.000000e+20,1.000000e+20,1.256,,,...,,,,,,,,,,


In [15]:
# Descriptive names for the numbered CSV columns.
panel_b_column_names = {
    "1": "Year",
    "2": "GSTA_Human_Natural_Mean",
    "3": "GSTA_Human_Natural_5th",
    "4": "GSTA_Human_Natural_95th",
    "5": "GSTA_Natural_Mean",
    "6": "GSTA_Natural_5th",
    "7": "GSTA_Natural_95th",
    "8": "GSTA_Observed",
}

# Columns holding simulated values. In the 2020 row they all contain 1e20, which is
# a fill value rather than a temperature and must not reach the charts.
simulated_columns = [
    "GSTA_Human_Natural_Mean",
    "GSTA_Human_Natural_5th",
    "GSTA_Human_Natural_95th",
    "GSTA_Natural_Mean",
    "GSTA_Natural_5th",
    "GSTA_Natural_95th",
]
FILL_VALUE_THRESHOLD = 1e19  # anything at or above this is treated as "missing"

# Drop the last row (index 171) because it contains NaN values. errors="ignore"
# keeps the cell re-runnable: on a second run the row is already gone, and a plain
# drop(171) would raise a KeyError.
df_panel_b = (
    df_panel_b
    .rename(columns=panel_b_column_names)
    .drop(index=171, errors="ignore")
)
df_panel_b["Year"] = df_panel_b["Year"].astype(int)

# Replace the fill values with NaN so the simulated lines and bands end at the last
# year that has real data.
df_panel_b[simulated_columns] = df_panel_b[simulated_columns].mask(
    df_panel_b[simulated_columns] >= FILL_VALUE_THRESHOLD
)
df_panel_b

Unnamed: 0,Year,GSTA_Human_Natural_Mean,GSTA_Human_Natural_5th,GSTA_Human_Natural_95th,GSTA_Natural_Mean,GSTA_Natural_5th,GSTA_Natural_95th,GSTA_Observed,Unnamed: 8,Unnamed: 9,...,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20
0,1850,-4.758907e-02,-3.548055e-01,1.897134e-01,1.460171e-02,-1.611949e-01,1.961983e-01,-0.059,,,...,,,,,,,,,,
1,1851,-5.876090e-02,-3.134450e-01,1.492833e-01,2.419514e-02,-1.329608e-01,1.799279e-01,0.049,,,...,,,,,,,,,,
2,1852,-2.935225e-03,-1.744060e-01,1.983925e-01,5.255810e-02,-2.026655e-01,2.217057e-01,0.079,,,...,,,,,,,,,,
3,1853,-5.929254e-03,-2.148021e-01,2.176045e-01,2.940635e-02,-2.664158e-01,1.789874e-01,0.046,,,...,,,,,,,,,,
4,1854,-2.661051e-02,-2.244470e-01,2.214103e-01,-9.960666e-03,-2.558413e-01,1.524845e-01,0.049,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,2016,1.057188e+00,5.073328e-01,1.744812e+00,4.893793e-02,-2.681074e-01,2.905333e-01,1.264,,,...,,,,,,,,,,
167,2017,1.100092e+00,7.276342e-01,1.781020e+00,4.710649e-02,-2.014902e-01,3.191186e-01,1.179,,,...,,,,,,,,,,
168,2018,1.170450e+00,8.069424e-01,1.872942e+00,4.418525e-02,-2.225281e-01,3.838217e-01,1.101,,,...,,,,,,,,,,
169,2019,1.189952e+00,7.382615e-01,1.870786e+00,4.938523e-02,-1.363197e-01,3.065375e-01,1.226,,,...,,,,,,,,,,


### Step 2 : create the base of the graph 

In [18]:
# X axis of panel (b): integer-formatted years at hand-picked tick positions, no grid, no title.
panel_b_x_axis = alt.Axis(
    tickCount=5,
    format='d',
    values=[1850, 1900, 1950, 2000, 2020],
    grid=False,
    title=None,
)

# Y axis of panel (b): ticks every 0.5 from -1 to 2, with a horizontal '°C' title
# positioned explicitly through titleX / titleY.
panel_b_y_axis = alt.Axis(
    tickCount=7,
    values=[-1, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0],
    title='°C',
    titleY=-10,
    titleX=-10,
    titleAngle=0,
)

# Observed series as a thin black line on fixed x and y domains.
base_right = (
    alt.Chart(df_panel_b)
    .mark_line(strokeWidth=1, color="black")
    .encode(
        alt.X('Year:Q', scale=alt.Scale(domain=[1850, 2020]), axis=panel_b_x_axis),
        alt.Y('GSTA_Observed:Q', scale=alt.Scale(domain=[-1, 2]), axis=panel_b_y_axis),
    )
    .properties(width=420, height=450)
)

base_right

### Step 3 : Add the simulated human & natural curve and the simulated natural-only (solar & volcanic) curve, each with its percentile band 

In [19]:
# Simulated human & natural: mean as an orange line ...
line2 = (
    alt.Chart(df_panel_b)
    .mark_line(color='orange', strokeWidth=1)
    .encode(
        alt.X('Year:Q'),
        alt.Y('GSTA_Human_Natural_Mean:Q'),
    )
)
# ... and the 5th-95th percentile range as a translucent orange area that reuses
# the data, x encoding and size of `base_right`.
percentile_band2 = (
    base_right
    .mark_area(color="orange", opacity=0.2)
    .encode(
        alt.Y("GSTA_Human_Natural_5th:Q"),
        alt.Y2("GSTA_Human_Natural_95th:Q"),
    )
)

# Simulated natural: mean as a blue line ...
line3 = (
    alt.Chart(df_panel_b)
    .mark_line(color='#1f77b4', strokeWidth=1)
    .encode(
        alt.X('Year:Q'),
        alt.Y('GSTA_Natural_Mean:Q'),
    )
)

# ... and its 5th-95th percentile range as a translucent blue area.
percentile_band3 = (
    base_right
    .mark_area(color="#1f77b4", opacity=0.2)
    .encode(
        alt.Y("GSTA_Natural_5th:Q"),
        alt.Y2("GSTA_Natural_95th:Q"),
    )
)


# Layer the observed line with both simulated lines and their bands.
all_curves = base_right + line2 + percentile_band2 + line3 + percentile_band3
all_curves

### Step 4 : add legends and annotations 

In [20]:
def _margin_label(text_lines, y_positions, color):
    """Build a bold, left-aligned text layer placed at x = 2022.

    Parameters
    ----------
    text_lines : list of str
        One entry per rendered line of text.
    y_positions : list of float
        y coordinate (data units) of each line; same length as ``text_lines``.
    color : str
        Text colour, matching the curve the label describes.

    Returns
    -------
    alt.Chart
        A text chart that can be layered on top of the panel (b) curves.
    """
    label_rows = pd.DataFrame({
        "y": y_positions,
        "x": [2022] * len(text_lines),
        "text": text_lines,
    })
    return (
        alt.Chart(label_rows)
        .mark_text(fontSize=12, color=color, fontWeight='bold', align="left")
        .encode(y="y:Q", x="x:Q", text="text:N")
    )


# Label for the observed curve.
obs_legend = _margin_label(["observed"], [1.3], "black")

# Label for the simulated human & natural curve (SHN), one word group per line.
SHN_legend = _margin_label(["simulated", "human &", "natural"], [1.2, 1.1, 1.0], "orange")

# Label for the simulated natural curve (SN). The second line previously read
# "natural on", a truncated "natural only".
SN_legend = _margin_label(
    ["simulated", "natural only", "(solar &", "volcanic)"],
    [0.3, 0.2, 0.1, 0.0],
    "#1f77b4",
)

right_chart_complete = all_curves + obs_legend + SHN_legend + SN_legend

right_chart_complete

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


### Final plot 

In [21]:
# Two-line title for panel (b), anchored at the start of the chart.
panel_b_title = alt.TitleParams(
    text=[
        "(b) Change in global surface temperature (annual average) as observed and",
        "simulated using human & natural and only natural factors (both 1850-2020)",
    ],
    anchor="start",
    fontSize=12,
)
right_chart = right_chart_complete.properties(title=panel_b_title)

right_chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
