# Tree Cover Loss Prediction

In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
# Read the tree-cover-loss CSV into the working frame and preview its first rows.
df = pd.read_csv(filepath_or_buffer='Datasets/TreeCoverLoss2001-2020.csv')
df.head(n=5)

Unnamed: 0,CountryCode,Year,TreeCoverLoss_ha,GrossEmissions_Co2_all_gases_Mg
0,AGO,2001,4957.597965,3021078.0
1,ARG,2001,5979.947498,2253298.0
2,AUS,2001,0.075922,28.85012
3,BDI,2001,6.22293,3288.072
4,BES,2001,0.146932,61.25723


In [3]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,Year,TreeCoverLoss_ha,GrossEmissions_Co2_all_gases_Mg
count,1863.0,1863.0,1863.0
mean,2010.435856,35241.33,23421820.0
std,5.75064,162851.8,108103800.0
min,2001.0,0.07307871,0.0
25%,2005.0,25.48204,13304.61
50%,2010.0,1144.897,629707.3
75%,2015.0,13353.66,7586048.0
max,2020.0,2830977.0,1820993000.0


In [4]:
# Count the missing values in each column.
df.isna().sum()

CountryCode                        0
Year                               0
TreeCoverLoss_ha                   0
GrossEmissions_Co2_all_gases_Mg    0
dtype: int64

The data seems to be fine. There are no NaN values.

In [5]:
# (rows, columns) of the loaded frame.
df.shape

(1863, 4)

In [6]:
# Distinct country codes present in the data, in order of first appearance.
df['CountryCode'].unique()

array(['AGO', 'ARG', 'AUS', 'BDI', 'BES', 'BGD', 'BHS', 'BLZ', 'BOL',
       'BRA', 'BRN', 'BTN', 'CAF', 'CHN', 'CIV', 'CMR', 'COD', 'COG',
       'COL', 'COM', 'CRI', 'CUB', 'CYM', 'DMA', 'DOM', 'ECU', 'ETH',
       'FJI', 'GAB', 'GHA', 'GIN', 'GLP', 'GNB', 'GNQ', 'GTM', 'GUF',
       'GUY', 'HND', 'HTI', 'IDN', 'IND', 'JAM', 'KEN', 'KHM', 'KNA',
       'LAO', 'LBR', 'LCA', 'LKA', 'MAF', 'MDG', 'MDV', 'MEX', 'MMR',
       'MOZ', 'MSR', 'MTQ', 'MWI', 'MYS', 'NGA', 'NIC', 'NPL', 'PAN',
       'PER', 'PHL', 'PLW', 'PNG', 'PRI', 'PRY', 'RWA', 'SLB', 'SLE',
       'SLV', 'SSD', 'SUR', 'SXM', 'TCA', 'TGO', 'THA', 'TTO', 'TWN',
       'TZA', 'UGA', 'USA', 'VCT', 'VEN', 'VGB', 'VIR', 'VNM', 'VUT',
       'ZAF', 'ZMB', 'ZWE', 'BEN', 'SEN', 'SGP', 'UMI', 'ABW', 'ATG'],
      dtype=object)

#### Label-encoding the country codes

In [7]:
# Map each three-letter country code to an integer label and store the
# result in a new 'Country' column; the encoder is kept in `le` so the
# mapping can be inspected or inverted later.
le = preprocessing.LabelEncoder()
le.fit(df['CountryCode'])
df['Country'] = le.transform(df['CountryCode'])
df['Country']

0        1
1        2
2        4
3        5
4        7
        ..
1858    94
1859    95
1860    96
1861    97
1862    98
Name: Country, Length: 1863, dtype: int32

In [8]:
# Candidate predictors (encoded country, year, gross emissions) and the
# regression target (tree cover loss).
X = df.loc[:, ['Country', 'Year', 'GrossEmissions_Co2_all_gases_Mg']]
y = df.loc[:, 'TreeCoverLoss_ha']

## Using Random Forest Regressor to find Feature Importances

In [10]:
# Fit a random forest on all three candidate features purely to rank them.
# The importances come back in the column order of X
# (Country, Year, GrossEmissions_Co2_all_gases_Mg).
# random_state is fixed so the ranking is identical on every
# Restart Kernel -> Run All; an unseeded forest gives slightly different
# numbers each run.
rf = RandomForestRegressor(random_state=42).fit(X, y)
rf.feature_importances_

array([0.00436026, 0.00493036, 0.99070938])

The importances show that Country and Year each contribute less than 0.5% to predicting tree cover loss, while gross emissions account for about 99%.
So I excluded Country and Year when training the model.

In [11]:
# Keep only the emissions column as the predictor; Country and Year are
# left out because the feature-importance check ranked them as negligible.
X = df[['GrossEmissions_Co2_all_gases_Mg']]
y = df['TreeCoverLoss_ha']

In [12]:
# Preview the single-column feature frame.
X.head()

Unnamed: 0,GrossEmissions_Co2_all_gases_Mg
0,3021078.0
1,2253298.0
2,28.85012
3,3288.072
4,61.25723


In [13]:
# Preview the target values.
y.head()

0    4957.597965
1    5979.947498
2       0.075922
3       6.222930
4       0.146932
Name: TreeCoverLoss_ha, dtype: float64

In [21]:
# Hold out 20% of the rows for evaluation. The fixed random_state keeps the
# split -- and therefore the reported score and the saved model -- identical
# on every re-run; without it each execution shuffles the rows differently.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Training an ML model

In [22]:
# Fit ordinary least squares on the training split, then report the
# coefficient of determination (R^2) on the held-out split.
clf = LinearRegression().fit(X_train, y_train)
clf.score(X_test, y_test)

0.9904836059556512

With an R² of about 0.99 on the held-out test set, this is a classic linear regression problem.

#### Saving the trained regression model in a pickle file for later use

In [23]:
# Serialize the fitted model to 'clf1.pkl' so it can be reloaded later
# without retraining.
with open('clf1.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [24]:
# Sanity-check the model on a single emissions value.
# The model was fitted on a DataFrame with a named column, so the input is
# wrapped in a DataFrame carrying that same column name; passing a bare
# nested list makes recent scikit-learn versions warn that the input lacks
# the feature names seen during fit.
sample = pd.DataFrame({'GrossEmissions_Co2_all_gases_Mg': [288.5]})
clf.predict(sample)

array([275.84677346])