# Capstone - Satellite Data Monitoring forests
####  James Hoang

In [102]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [104]:
C_data = pd.read_csv('binarized_carbon_data.csv')
C_data.shape

(1888, 266)

In [105]:
C_data.head()

Unnamed: 0,canopy_density_threshold_2000,forested_area_2000,carbon_stored_per_country_2000_mg,avg_carbon_stored_per_ha_2000,gross_carbon_emissions,gross_carbon_removed,net_carbon_emissions,carbon_emissions_2001,carbon_emissions_2002,carbon_emissions_2003,...,country_Vanuatu,country_Vatican City,country_Venezuela,country_Vietnam,"country_Virgin Islands, U.S.",country_Western Sahara,country_Yemen,country_Zambia,country_Zimbabwe,country_Åland
0,0,64385715,25912558,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,10,432115,21758845,50,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,15,302660,16568110,55,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,20,284357,15782996,56,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,25,254867,14538150,57,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [108]:
T_data = pd.read_csv('binarized_tree_data.csv')
T_data.shape

(1888, 264)

### Baseline Model - Linear Regression

We are building two  linear regression models 1 for carbon and the other for tree cover

In [147]:
# Defining features & target for carbon data
# Exclude 'avg_carbon_stored_per_ha_2000', 'forested_area_2000', 'canopy_density_threshold_2000'
X = C_data[[ 'carbon_stored_per_country_2000_mg'
          , 'gross_carbon_emissions', 'gross_carbon_removed', 'carbon_emissions_2001', 'carbon_emissions_2002'
          , 'carbon_emissions_2003', 'carbon_emissions_2004', 'carbon_emissions_2005', 'carbon_emissions_2006', 'carbon_emissions_2007'
          , 'carbon_emissions_2008', 'carbon_emissions_2009', 'carbon_emissions_2010', 'carbon_emissions_2011', 'carbon_emissions_2012'
          , 'carbon_emissions_2013', 'carbon_emissions_2014', 'carbon_emissions_2015', 'carbon_emissions_2016', 'carbon_emissions_2017'
          , 'carbon_emissions_2018', 'carbon_emissions_2019', 'carbon_emissions_2020', 'carbon_emissions_2021', 'carbon_emissions_2022'
          , 'carbon_emissions_2023', 'country_Brazil', 'country_Russia', 'country_Canada', 'country_Indonesia', 'country_United States'
          , 'country_China', 'country_Australia', 'country_Democratic Republic of the Congo']]

y = C_data['net_carbon_emissions']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Instantiate and fit a linear regression model
lmodel = LinearRegression()
lmodel.fit(X_train, y_train)

# Predictions for train and test
y_train_pred = lmodel.predict(X_train)
y_test_pred = lmodel.predict(X_test)

# Evaluation Metrics
mean2_train = mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

mean2_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f"Training Mean squared error {mean2_train}, Training R squared: {r2_train}")
print(f"Test Mean squared error: {mean2_test}, Test R squared: {r2_test}")


Training Mean squared error 0.05225609144072146, Training R squared: 1.0
Test Mean squared error: 0.06347479634427515, Test R squared: 1.0


**Analysis**: A mean squared of 1.0 is concerning and is likely due to overfitting and is apparent by the significantly low mean squared error. This should be looked into for Sprint2 to apply some PCA and hyperparameter tuning

In [109]:
T_data.head()

Unnamed: 0,threshold,area_ha,extent_2000_ha,extent_2010_ha,gain_2000-2020_ha,tc_loss_ha_2001,tc_loss_ha_2002,tc_loss_ha_2003,tc_loss_ha_2004,tc_loss_ha_2005,...,country_Vanuatu,country_Vatican City,country_Venezuela,country_Vietnam,"country_Virgin Islands, U.S.",country_Western Sahara,country_Yemen,country_Zambia,country_Zimbabwe,country_Åland
0,0,64385715,64385715,64385715,10741,103,214,267,225,268,...,0,0,0,0,0,0,0,0,0,0
1,10,64385715,432115,126247,10741,92,190,253,207,246,...,0,0,0,0,0,0,0,0,0,0
2,15,64385715,302660,106867,10741,91,186,247,205,240,...,0,0,0,0,0,0,0,0,0,0
3,20,64385715,284357,105733,10741,89,180,245,203,238,...,0,0,0,0,0,0,0,0,0,0
4,25,64385715,254867,72395,10741,89,180,245,202,237,...,0,0,0,0,0,0,0,0,0,0


In [152]:
# Defining features & target for carbon data
# Exclude 'threshold',
X = T_data[['area_ha', 'extent_2000_ha',
          'extent_2010_ha', 'tc_loss_ha_2001', 'tc_loss_ha_2002'
          , 'tc_loss_ha_2003', 'tc_loss_ha_2004', 'tc_loss_ha_2005', 'tc_loss_ha_2006', 'tc_loss_ha_2007'
          , 'tc_loss_ha_2008', 'tc_loss_ha_2009', 'tc_loss_ha_2010', 'tc_loss_ha_2011', 'tc_loss_ha_2012'
          , 'tc_loss_ha_2013', 'tc_loss_ha_2014', 'tc_loss_ha_2015', 'tc_loss_ha_2016', 'tc_loss_ha_2017'
          , 'tc_loss_ha_2018', 'tc_loss_ha_2019', 'tc_loss_ha_2020', 'tc_loss_ha_2021', 'tc_loss_ha_2022'
          , 'tc_loss_ha_2023', 'country_Brazil', 'country_Russia', 'country_Canada', 'country_Indonesia', 'country_United States'
          , 'country_China', 'country_Australia', 'country_Democratic Republic of the Congo']]

y = T_data['gain_2000-2020_ha']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Instantiate and fit a linear regression model
lmodel = LinearRegression()
lmodel.fit(X_train, y_train)

# Predictions for train and test
y_train_pred = lmodel.predict(X_train)
y_test_pred = lmodel.predict(X_test)

# Evaluation Metrics
mean2_train = mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

mean2_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f"Training Mean squared error {mean2_train}, Training R squared: {r2_train}")
print(f"Test Mean squared error: {mean2_test}, Test R squared: {r2_test}")

Training Mean squared error 38834315161.65923, Training R squared: 0.9951980990007283
Test Mean squared error: 37527616384.50387, Test R squared: 0.9959806816832032
