# imports

In [25]:
#!pip install catboost

In [26]:
# imports:
import pandas as pd
import numpy as np

# import regex module
import re

# graphs:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# display all the columns in dataframe
pd.set_option('display.max_columns', None)

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings

In [27]:
df = pd.read_csv('https://raw.githubusercontent.com/aps0611/experimental/main/dataset/data-V4.csv')

In [28]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,Geo Level,State,State ANSI,Ag District,Ag District Code,County,County ANSI,cotton_area,Corn_area,SORGHUM_ACRES,precipitation_cm,Latitude,Longitude,CO2,N2O
0,0,COUNTY,ALABAMA,1,BLACK BELT,40,AUTAUGA,1,9261,645,0,148.0,32.516526,-86.63194,1639.220033,9923.611451
1,1,COUNTY,ALABAMA,1,BLACK BELT,40,DALLAS,47,14133,13876,0,148.0,32.311797,-87.104664,4270.154224,44321.79154
2,2,COUNTY,ALABAMA,1,BLACK BELT,40,ELMORE,51,13795,2487,0,148.0,32.580123,-86.125195,2651.127824,22593.7503
3,3,COUNTY,ALABAMA,1,BLACK BELT,40,LOWNDES,85,4046,1630,0,148.0,32.108807,-86.640254,901.109504,2184.483787
4,4,COUNTY,ALABAMA,1,BLACK BELT,40,MACON,87,12376,0,0,148.0,32.366606,-85.666031,2072.333953,17632.69941


In [29]:
df.shape

(2705, 16)

In [30]:
### check the columns

df.columns

Index(['Unnamed: 0', 'Geo Level', 'State', 'State ANSI', 'Ag District',
       'Ag District Code', 'County', 'County ANSI', 'cotton_area', 'Corn_area',
       'SORGHUM_ACRES', 'precipitation_cm', 'Latitude', 'Longitude', 'CO2',
       'N2O'],
      dtype='object')

In [31]:
## create a new df1 with all the needed columns

df1 = df[['Latitude', 'Longitude', 'cotton_area', 'Corn_area',
       'SORGHUM_ACRES', 'precipitation_cm', 'CO2',
       'N2O']]

In [32]:
df1.head(5)

Unnamed: 0,Latitude,Longitude,cotton_area,Corn_area,SORGHUM_ACRES,precipitation_cm,CO2,N2O
0,32.516526,-86.63194,9261,645,0,148.0,1639.220033,9923.611451
1,32.311797,-87.104664,14133,13876,0,148.0,4270.154224,44321.79154
2,32.580123,-86.125195,13795,2487,0,148.0,2651.127824,22593.7503
3,32.108807,-86.640254,4046,1630,0,148.0,901.109504,2184.483787
4,32.366606,-85.666031,12376,0,0,148.0,2072.333953,17632.69941


In [33]:
df1.describe()

Unnamed: 0,Latitude,Longitude,cotton_area,Corn_area,SORGHUM_ACRES,precipitation_cm,CO2,N2O
count,2705.0,2705.0,2705.0,2705.0,2705.0,2705.0,2705.0,2705.0
mean,38.331819,-90.761495,4162.07098,31268.403327,1839.810351,98.020407,4924.867588,427912.0
std,4.765679,10.53215,18279.682707,50129.493374,8823.697408,31.179485,7082.753396,1105998.0
min,2.169424,-159.558768,0.0,0.0,0.0,24.1,0.0,0.0
25%,34.86289,-97.130594,0.0,486.0,0.0,73.4,113.454106,70.48132
50%,38.461697,-89.42019,0.0,6052.0,0.0,99.6,1557.078729,11156.84
75%,41.676581,-83.367686,0.0,41952.0,0.0,124.2,7343.262366,265304.9
max,48.831939,-68.299475,297817.0,319973.0,142457.0,161.8,50712.72179,11332400.0


In [34]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2705 entries, 0 to 2704
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Latitude          2705 non-null   float64
 1   Longitude         2705 non-null   float64
 2   cotton_area       2705 non-null   int64  
 3   Corn_area         2705 non-null   int64  
 4   SORGHUM_ACRES     2705 non-null   int64  
 5   precipitation_cm  2705 non-null   float64
 6   CO2               2705 non-null   float64
 7   N2O               2705 non-null   float64
dtypes: float64(5), int64(3)
memory usage: 169.2 KB


In [35]:
#df1 = df1.drop(df1[df1['CO2'] == 0].index)

In [36]:
df1.shape

(2705, 8)

In [37]:
# Select the input features (X) and target variables (y)
X = df1[['Latitude', 'Longitude','cotton_area', 'Corn_area',
       'SORGHUM_ACRES', 'precipitation_cm']]
y = df1[['CO2','N2O']]

In [38]:
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)

In [39]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2164, 6), (541, 6), (2164, 2), (541, 2))

# creating the Evaluation function to give all the metrics after the model training

In [42]:
def evaluate_model(true, predicted):
  mae = mean_absolute_error(true, predicted)
  mse = mean_squared_error(true, predicted)
  rmse = np.sqrt(mean_squared_error(true, predicted))
  r2_square = r2_score(true, predicted)
  return mae, mse, r2_square

In [43]:
models = {
    "linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-neighbors-regressor":KNeighborsRegressor(),
    "Decision-tree":DecisionTreeRegressor(),
    "Random Forest Regressor":RandomForestRegressor(),
    "XGBRegressor":XGBRegressor(),
    #"Adaboost Regressor": AdaBoostRegressor()
}

model_list = []
r2_list = []

for i in range(len(list(models))):
  model = list(models.values())[i]
  model.fit(X_train,y_train) # train the model

  '''
  Predictions
  '''

  y_train_pred = model.predict(X_train)
  y_test_pred = model.predict(X_test)

  '''
  Evaluate the train and test dataset
  '''

  model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
  model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)


  print(list(models.keys())[i])
  model_list.append(list(models.keys())[i])

  print('Model performance for training set')
  print(f'- RMSE: {model_train_rmse}')
  print(f'- MAE: {model_train_mae}')
  print(f'- R2Score: {model_train_r2}')

  print('--------------------------------')

  print('Model performance for test set')
  print(f'- RMSE: {model_test_rmse}')
  print(f'- MAE: {model_test_mae}')
  print(f'- R2Score: {model_test_r2}')

  r2_list.append(model_test_r2)

  print('*'*35)
  print('\n')




linear Regression
Model performance for training set
- RMSE: 102972313235.09322
- MAE: 151421.97870273254
- R2Score: 0.9181863101922073
--------------------------------
Model performance for test set
- RMSE: 114069525726.4011
- MAE: 154636.6690363745
- R2Score: 0.892506257268632
***********************************


Lasso
Model performance for training set
- RMSE: 102972313235.19858
- MAE: 151421.95811448162
- R2Score: 0.9181863088670048
--------------------------------
Model performance for test set
- RMSE: 114069520985.12543
- MAE: 154636.650636563
- R2Score: 0.892506165160887
***********************************


Ridge
Model performance for training set
- RMSE: 102972313235.29956
- MAE: 151421.9856183843
- R2Score: 0.9181863101920431
--------------------------------
Model performance for test set
- RMSE: 114069521743.76427
- MAE: 154636.66629606622
- R2Score: 0.8925062609586735
***********************************


K-neighbors-regressor
Model performance for training set
- RMSE: 272