<a href="https://colab.research.google.com/github/LucianaNieto/CarbonSequestration/blob/main/SOC_analysis_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab-only step: mount Google Drive at /content/drive so that the project
# folders under '/content/drive/My Drive/...' (see base_path) can be read
# from and written to by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


----
Problem definition:


Objectives:



### Libraries

In [2]:
!pip install -q pycountry
!pip install -q plotly --upgrade
!pip install -q -U kaleido

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [31]:
#Libraries

import pandas as pd
import os
import numpy as np
import re
import traceback

import plotly.express as px
import plotly.graph_objects as go

import pycountry
from pycountry import countries

from scipy import stats

### Directories

In [4]:

# Root folder of the project on the mounted Drive (keeps its trailing slash)
base_path = '/content/drive/My Drive/2024/DataScience/CarbonSequestration/01-CarbonSequestration/'

# Location of every data / figure / code sub-folder, relative to base_path
_relative_dirs = {
    'raw_data_path': '01-Data/01-RawData/',
    'processed_data_path': '01-Data/02-ProcessedData/',
    'exploratory_figures_path': '02-Figures/01-Exploratory/',
    'final_figures_path': '02-Figures/02-Finals/',
    'raw_code_path': '03-Code/01-RawCode/',
    'data_processing_code_path': '03-Code/02-DataProcessing/',
    'data_analysis_code_path': '03-Code/03-DataAnalysis/',
    'clean_scripts_path': '03-Code/04-CleanScripts/',
}

# Absolute paths, keyed by purpose, as used by the loading and saving cells
paths = {key: base_path + relative for key, relative in _relative_dirs.items()}

### Data loading

In [36]:
def load_data(file_name: str, data_dir=None) -> pd.DataFrame:
    """Read ``<file_name>.csv`` from a data directory into a DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the file without the ``.csv`` extension.
    data_dir : str or path-like, optional
        Directory that contains the file. When omitted, the processed-data
        folder ``paths['processed_data_path']`` is used, which is the
        behaviour of the original one-argument call.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    # Resolve the default lazily so the global `paths` is only needed when
    # the caller does not supply a directory.
    if data_dir is None:
        data_dir = paths['processed_data_path']
    csv_path = os.path.join(data_dir, f"{file_name}.csv")
    return pd.read_csv(csv_path)

# Load each dataset and report its column names and (rows, columns) shape

def _show_columns_and_shape(heading, frame):
    """Print a heading followed by the frame's column index and shape."""
    print(heading)
    print(frame.columns)
    print(frame.shape)


df_effects = load_data('df_effects_SOC')
_show_columns_and_shape("\nEffect-sizes Columns and Shape:", df_effects)

df_primary = load_data('df_Primarystudies_Country')
_show_columns_and_shape("\nPrimary_studies Columns and Shape:", df_primary)


Effect-sizes Columns and Shape:
Index(['id', 'land_use', 'intervention', 'sub_cat_intervention', 'details',
       'control (c)', 'treatment_authors', 'summary_effect_size', 'management',
       'method', 'species', 'region_climate', 'soil', 'depth_original',
       'depth', 'group_depth', 'depth2', 'duration', 'carbon', 'unit',
       'rateyr', 'log_scale', 'metric', 'outcome', 'sub_cat_outcome',
       'details_outcome', 'lower_ci', 'effect size', 'upper_ci', 'p_value',
       'n_paired_data', 'es_se', 'duration_standardized'],
      dtype='object')
(5172, 33)

Primary_studies Columns and Shape:
Index(['ID', 'DOI', 'Primarystudies_Country', 'intervention', 'Outcome'], dtype='object')
(22772, 5)


In [37]:
# Normalise header casing on both frames so later cells can address columns
# by lowercase name (the primary-studies file ships with 'ID', 'DOI', ...).
for _frame in (df_primary, df_effects):
    _frame.columns = _frame.columns.str.lower()

### Data exploration

In [38]:
# "Andromeda" custom colour scale: two [position, colour] stops,
# light mint at 0 and deep teal at 1
custom_color_scale = [
    [stop, colour]
    for stop, colour in ((0, '#e3fef7'), (1, '#003c43'))
]

In [39]:
# Remove exact duplicate rows. Reassigning (rather than inplace=True) keeps the
# step explicit; the surviving rows keep their original index labels.
df_primary = df_primary.drop_duplicates()
df_effects = df_effects.drop_duplicates()

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() appends a stray "None" line.
print("\nPrimary Studies Dataset Info:")
df_primary.info()

print("\nEffects on SOC Dataset Info:")
df_effects.info()


Primary Studies Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 20135 entries, 0 to 22771
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      20135 non-null  object
 1   doi                     20132 non-null  object
 2   primarystudies_country  18734 non-null  object
 3   intervention            11442 non-null  object
 4   outcome                 20125 non-null  object
dtypes: object(5)
memory usage: 943.8+ KB
None

Effects on SOC Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Data columns (total 33 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     5172 non-null   object 
 1   land_use               5172 non-null   object 
 2   intervention           5172 non-null   object 
 3   sub_cat_intervention   5172 non-null   object 
 4   d

In [40]:
# Denominator for the "countries with data" coverage figure; used both in the
# percentage and in the annotation text so the two can never disagree.
# NOTE(review): 195 is presumably the number of sovereign states - confirm the
# reference list that was intended.
TOTAL_COUNTRIES = 195

# Clean data by removing entries with multiple countries or 'global'.
# Rows with a missing country pass this filter (a NaN comparison is False) but
# are ignored by value_counts() below.
country_col = df_primary['primarystudies_country']
is_multi_country = country_col.str.contains(',', na=False, regex=False)
is_global = country_col.str.lower() == 'global'
df_primary = df_primary[~is_multi_country & ~is_global]

# Aggregate data by country (value_counts sorts by descending count)
country_counts = df_primary['primarystudies_country'].value_counts().reset_index()
country_counts.columns = ['Country', 'Counts']

# Count of unique countries in the dataset and their share of the total
unique_countries_with_data = country_counts['Country'].nunique()
percentage_with_data = (unique_countries_with_data / TOTAL_COUNTRIES) * 100

max_count = country_counts['Counts'].max()

fig = go.Figure()

# Layer 1: country fill shaded by number of entries (its colour bar is hidden;
# the bubble layer below provides the visible one)
fig.add_trace(go.Choropleth(
    locations=country_counts['Country'],
    z=country_counts['Counts'],
    locationmode='country names',
    colorscale=custom_color_scale,
    colorbar_title="Entries",
    showscale=False,
    zmin=0,
    zmax=max_count,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    text=country_counts['Counts'],
    hoverinfo='location+z'
))

# Layer 2: one bubble per country, scaled so the largest count gets size 50
fig.add_trace(go.Scattergeo(
    locationmode='country names',
    locations=country_counts['Country'],
    text=country_counts['Counts'],
    mode='markers',
    marker=dict(
        size=country_counts['Counts'] / max_count * 50,
        color=country_counts['Counts'],
        colorscale=custom_color_scale,
        showscale=True,
        colorbar=dict(title="Entry Count"),
        line_color='black'
    )
))

# Top 10 countries with entry counts, listed bottom-up so that the country
# with the most entries ends up at the top of the stacked labels
top_countries = country_counts.nlargest(10, 'Counts').iloc[::-1]
annotations = [
    dict(
        x=0.01,
        y=0.1 + i * 0.08,
        xref='paper',
        yref='paper',
        text=f"{country}: {count}",
        showarrow=False,
        align='right',
        bgcolor='rgba(255, 255, 255, 0.7)',
        borderpad=4
    ) for i, (country, count) in enumerate(zip(top_countries['Country'], top_countries['Counts']))
]

# Add an annotation for the percentage of countries with data
annotations.append(
    dict(
        x=0.01,
        y=1.05,
        xref='paper',
        yref='paper',
        text=f"Countries with data: {unique_countries_with_data}/{TOTAL_COUNTRIES} ({percentage_with_data:.2f}%)",
        showarrow=False,
        font=dict(size=14, color='#003c43')
    )
)

# Update the layout
fig.update_layout(
    title_text='Global Distribution of Study Entries by Country',
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection_type='equirectangular'
    ),
    annotations=annotations
)

fig.show()

# Save as interactive HTML and static JPG; create the target folder first so
# the writes do not fail on a project tree that lacks it.
final_figures_dir = paths['final_figures_path']
os.makedirs(final_figures_dir, exist_ok=True)
fig.write_html(os.path.join(final_figures_dir, "country_entries_map.html"))
fig.write_image(os.path.join(final_figures_dir, "country_entries_map.jpg"), format='jpg')


In [41]:
# From here on custom_color_scale is a plain [light, dark] pair of colours
custom_color_scale = ['#e3fef7','#003c43']


def _show_dark_histogram(frame, column, title):
    """Build one histogram of `column`, apply the dark theme, display it.

    Bars and all text use the light colour; plot and paper backgrounds use
    the dark teal. Returns the figure.
    """
    light = custom_color_scale[0]
    figure = px.histogram(frame, x=column,
                          title=title,
                          color_discrete_sequence=[light])
    figure.update_layout(
        plot_bgcolor='#003c43',
        paper_bgcolor='#003c43',
        title_font=dict(color=light),
        font=dict(color=light),
        xaxis=dict(title_font=dict(color=light), tickfont=dict(color=light)),
        yaxis=dict(title_font=dict(color=light), tickfont=dict(color=light)),
    )
    figure.show()
    return figure


# Plot distribution of Effect Sizes
fig1 = _show_dark_histogram(df_effects, 'effect size', 'Distribution of Effect Sizes')

# Plot distribution of Metrics
fig2 = _show_dark_histogram(df_effects, 'metric', 'Distribution of Metrics')

# Plot distribution of Land Use
fig3 = _show_dark_histogram(df_effects, 'land_use', 'Distribution of Land Use')




In [44]:
# Effect sizes strictly above this value are set aside as outliers.
EFFECT_SIZE_CUTOFF = 3

effect_size = df_effects['effect size']

# Filter the dataset for effect sizes greater than the cutoff
df_effects_above_3 = df_effects[effect_size > EFFECT_SIZE_CUTOFF]

# Get the unique metrics associated with these effect sizes
unique_metrics_above_3 = df_effects_above_3['metric'].unique()

print(f"Metrics associated with effect sizes greater than {EFFECT_SIZE_CUTOFF}:")
print(unique_metrics_above_3)

# Count occurrences of each metric
metric_counts = df_effects_above_3['metric'].value_counts()

print(f"Counts of metrics associated with effect sizes greater than {EFFECT_SIZE_CUTOFF}:")
print(metric_counts)

# Keep everything that is not an outlier. The comparison is inclusive (<=) so
# that a row sitting exactly on the cutoff is retained instead of silently
# dropping out of both subsets. Rows with a missing effect size still fail the
# comparison and are left out, as before. .copy() makes the result an
# independent frame rather than a slice of df_effects.
df_effects_lower_3 = df_effects[effect_size <= EFFECT_SIZE_CUTOFF].copy()


Metrics associated with effect sizes greater than 3:
['percent change']
Counts of metrics associated with effect sizes greater than 3:
metric
percent change    8
Name: count, dtype: int64


In [46]:
# custom_color_scale is a plain [light, dark] pair of colours in this cell
custom_color_scale = ['#e3fef7','#003c43']


def _themed_histogram(frame, column, title):
    """Build one histogram of `column`, apply the dark theme, display it.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to plot.
    column : str
        Column placed on the x axis.
    title : str
        Figure title.

    Returns
    -------
    plotly.graph_objects.Figure
        The displayed figure, so the caller can save it afterwards.
    """
    light = custom_color_scale[0]
    figure = px.histogram(frame, x=column,
                          title=title,
                          color_discrete_sequence=[light])
    figure.update_layout(
        plot_bgcolor='#003c43',
        paper_bgcolor='#003c43',
        title_font=dict(color=light),
        font=dict(color=light),
        xaxis=dict(title_font=dict(color=light), tickfont=dict(color=light)),
        yaxis=dict(title_font=dict(color=light), tickfont=dict(color=light)),
    )
    figure.show()
    return figure


# Plot distribution of Effect Sizes
fig1 = _themed_histogram(df_effects_lower_3, 'effect size', 'Distribution of Effect Sizes')

# Plot distribution of Metrics
fig2 = _themed_histogram(df_effects_lower_3, 'metric', 'Distribution of Metrics')

# Plot distribution of Land Use
fig3 = _themed_histogram(df_effects_lower_3, 'land_use', 'Distribution of Land Use')

# Save figures as HTML and JPG. The target folder is created first so the
# writes do not fail on a project tree that lacks it.
final_figures_dir = paths['final_figures_path']
os.makedirs(final_figures_dir, exist_ok=True)
for figure, file_stem in (
    (fig1, 'effect_size_distribution'),
    (fig2, 'metric_distribution'),
    (fig3, 'land_use_distribution'),
):
    figure.write_html(os.path.join(final_figures_dir, f'{file_stem}.html'))
    figure.write_image(os.path.join(final_figures_dir, f'{file_stem}.jpg'))

In [69]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the frame left after removing effect sizes above 3.
df_effects_lower_3.describe()

Unnamed: 0,depth,effect size,duration_standardized
count,1386.0,5164.0,905.0
mean,41.818182,0.813342,20.435503
std,30.616908,0.531947,15.643492
min,3.0,-1.490637,0.082192
25%,20.0,0.469878,10.0
50%,30.0,1.00068,20.0
75%,60.0,1.1122,25.0
max,200.0,2.823052,125.0


In [70]:
# Number of missing values in each column of the filtered frame
missing_per_column = df_effects_lower_3.isna().sum()
print(missing_per_column)

id                          0
land_use                    0
intervention                0
sub_cat_intervention        0
details                     0
control (c)                 8
treatment_authors         685
summary_effect_size      5039
management                 61
method                     85
species                    18
region_climate             14
soil                        0
depth_original            166
depth                    3778
group_depth              3637
depth2                   1078
duration                    0
carbon                      0
unit                      130
rateyr                     21
log_scale                   0
metric                      0
outcome                     0
sub_cat_outcome             0
details_outcome             0
lower_ci                  107
effect size                 0
upper_ci                  108
p_value                  3980
n_paired_data             559
es_se                    3401
duration_standardized    4259
dtype: int

In [81]:
complementary_color_palette = ['#e3fef7', '#003c43', '#a1d8e6', '#66b2b2', '#008080', '#004c4c']


def _apply_dark_theme(figure):
    """Apply the dark teal background with light text to `figure`; return it.

    Relies on custom_color_scale being the [light, dark] colour pair defined
    in the histogram cells.
    """
    light = custom_color_scale[0]
    figure.update_layout(
        plot_bgcolor='#003c43',
        paper_bgcolor='#003c43',
        title_font=dict(color=light),
        font=dict(color=light),
        xaxis=dict(title_font=dict(color=light), tickfont=dict(color=light)),
        yaxis=dict(title_font=dict(color=light), tickfont=dict(color=light)),
    )
    return figure


# Histograms for numeric columns
num_cols = ['depth', 'duration_standardized',  'effect size']
for col in num_cols:
    fig = px.histogram(df_effects_lower_3, x=col, title=f'Histogram of {col}', color_discrete_sequence=[custom_color_scale[0]])
    _apply_dark_theme(fig)
    fig.show()

# Correlation matrix of the numeric columns
corr_matrix = df_effects_lower_3[num_cols].corr()
# NOTE(review): complementary_color_palette alternates light and dark tones,
# so used as a continuous colorscale it is not monotonic in the correlation
# value - consider a two-stop light-to-dark scale here.
fig = go.Figure(data=go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns,
        y=corr_matrix.index,
        colorscale=complementary_color_palette))
_apply_dark_theme(fig)
fig.show()

# Boxplots by category (e.g., 'land_use')
fig = px.box(df_effects_lower_3, x='land_use', y='effect size', title='Effect Size by Land Use', color_discrete_sequence=[custom_color_scale[0]])
_apply_dark_theme(fig)
fig.show()

# Scatter plot of effect size vs duration. duration_standardized is a
# continuous variable, so a scatter is used; px.box would draw one box per
# distinct duration value instead of showing the relationship.
fig = px.scatter(df_effects_lower_3, x='duration_standardized', y='effect size', title='Effect Size vs Duration', color_discrete_sequence=[custom_color_scale[0]])
_apply_dark_theme(fig)
fig.show()

In [88]:
# One fixed colour per land-use category
custom_color_palette = {
    'forest land': '#2e8b57',         # Dark green
    'cropland': '#98fb98',            # Light green
    'land_use change': '#ffd700',     # Yellow
    'grassland': '#7cfc00',           # Green
    'various land uses': '#a9a9a9',   # Gray
    'desert': '#f4a460',              # Sandy brown
    'shrublands': '#808000',          # Olive
    'wetlands': '#4682b4',            # Blue
    'tundra': '#b0e0e6',              # Light blue
    'other land:various': '#d3d3d3',  # Light gray
    'various': '#696969',             # Dark gray
}

# Category -> colour mapping handed to plotly (a copy of the palette)
color_discrete_map = dict(custom_color_palette)

# One violin of effect size per land-use category, coloured by the mapping
fig = px.violin(
    df_effects_lower_3,
    y='effect size',
    color='land_use',
    title='Distribution of Different Land Use on SOC',
    labels={'land_use': 'Land Use', 'effect size': 'Effect Size'},
    color_discrete_map=color_discrete_map,
)

# Dark canvas, light text everywhere
light_text = custom_color_scale[0]
dark_layout = {
    'plot_bgcolor': '#003c43',
    'paper_bgcolor': '#003c43',
    'title_font': {'color': light_text},
    'font': {'color': light_text},
    'xaxis': {'title_font': {'color': light_text}, 'tickfont': {'color': light_text}},
    'yaxis': {'title_font': {'color': light_text}, 'tickfont': {'color': light_text}},
}
fig.update_layout(**dark_layout)

fig.show()

In [90]:

# Violin plot of standardized durations, with an inner box and every point drawn
light_text = custom_color_scale[0]
fig_violin = px.violin(
    df_effects_lower_3,
    y='duration_standardized',
    title='Distribution of  Durations in years',
    box=True,
    points='all',
    color_discrete_sequence=[light_text],
)

# Dark canvas, light text everywhere
violin_layout = {
    'plot_bgcolor': '#003c43',
    'paper_bgcolor': '#003c43',
    'title_font': {'color': light_text},
    'font': {'color': light_text},
    'xaxis': {'title_font': {'color': light_text}, 'tickfont': {'color': light_text}},
    'yaxis': {'title_font': {'color': light_text}, 'tickfont': {'color': light_text}},
}
fig_violin.update_layout(**violin_layout)

fig_violin.show()

# Save the figure (interactive HTML plus static JPG) with the exploratory figures
violin_stem = os.path.join(paths['exploratory_figures_path'], 'violin_plot_standardized_durations')
fig_violin.write_html(violin_stem + '.html')
fig_violin.write_image(violin_stem + '.jpg')


In [92]:
# Write the outlier-filtered effects table alongside the other processed data
full_path = os.path.join(paths['processed_data_path'], 'df_effects_SOC_filtered.csv')
df_effects_lower_3.to_csv(full_path, index=False)

# Confirm where the file went and how many rows it holds
n_saved = len(df_effects_lower_3)
print(
    f'DataFrame successfully saved to: {full_path}',
    f'Total entries in the processed dataset: {n_saved}',
    sep='\n',
)

DataFrame successfully saved to: /content/drive/My Drive/2024/DataScience/CarbonSequestration/01-CarbonSequestration/01-Data/02-ProcessedData/df_effects_SOC_filtered.csv
Total entries in the processed dataset: 5164
