# imports

In [None]:
# imports:
import pandas as pd
import numpy as np

# import regex module
import re

# graphs:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Render every column of a DataFrame instead of eliding the middle ones of wide frames
pd.options.display.max_columns = None

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor


In [None]:
# Load the raw dataset directly from the GitHub-hosted CSV (requires network access).
df = pd.read_csv(
    'https://raw.githubusercontent.com/aps0611/experimental/main/dataset/data-V4.csv'
)

In [None]:
# Preview the first five rows of the raw frame (head() defaults to n=5).
df.head()

Unnamed: 0.1,Unnamed: 0,Geo Level,State,State ANSI,Ag District,Ag District Code,County,County ANSI,cotton_area,Corn_area,SORGHUM_ACRES,precipitation_cm,Latitude,Longitude,CO2,N2O
0,0,COUNTY,ALABAMA,1,BLACK BELT,40,AUTAUGA,1,9261,645,0,148.0,32.516526,-86.63194,1639.220033,9923.611451
1,1,COUNTY,ALABAMA,1,BLACK BELT,40,DALLAS,47,14133,13876,0,148.0,32.311797,-87.104664,4270.154224,44321.79154
2,2,COUNTY,ALABAMA,1,BLACK BELT,40,ELMORE,51,13795,2487,0,148.0,32.580123,-86.125195,2651.127824,22593.7503
3,3,COUNTY,ALABAMA,1,BLACK BELT,40,LOWNDES,85,4046,1630,0,148.0,32.108807,-86.640254,901.109504,2184.483787
4,4,COUNTY,ALABAMA,1,BLACK BELT,40,MACON,87,12376,0,0,148.0,32.366606,-85.666031,2072.333953,17632.69941


In [None]:
# (rows, columns) of the raw frame
df.shape

(2705, 16)

In [None]:
### Check which column labels the raw frame contains

df.columns

Index(['Unnamed: 0', 'Geo Level', 'State', 'State ANSI', 'Ag District',
       'Ag District Code', 'County', 'County ANSI', 'cotton_area', 'Corn_area',
       'SORGHUM_ACRES', 'precipitation_cm', 'Latitude', 'Longitude', 'CO2',
       'N2O'],
      dtype='object')

In [None]:
## Build df1 from df, keeping only the columns used later:
## coordinates, the three crop columns, precipitation, and CO2 / N2O.

df1 = df.loc[:, [
    'Latitude', 'Longitude',
    'cotton_area', 'Corn_area', 'SORGHUM_ACRES',
    'precipitation_cm',
    'CO2', 'N2O',
]]

In [None]:
# Preview the first five rows of the reduced frame (head() defaults to n=5).
df1.head()

Unnamed: 0,Latitude,Longitude,cotton_area,Corn_area,SORGHUM_ACRES,precipitation_cm,CO2,N2O
0,32.516526,-86.63194,9261,645,0,148.0,1639.220033,9923.611451
1,32.311797,-87.104664,14133,13876,0,148.0,4270.154224,44321.79154
2,32.580123,-86.125195,13795,2487,0,148.0,2651.127824,22593.7503
3,32.108807,-86.640254,4046,1630,0,148.0,901.109504,2184.483787
4,32.366606,-85.666031,12376,0,0,148.0,2072.333953,17632.69941


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of df1
df1.describe()

Unnamed: 0,Latitude,Longitude,cotton_area,Corn_area,SORGHUM_ACRES,precipitation_cm,CO2,N2O
count,2705.0,2705.0,2705.0,2705.0,2705.0,2705.0,2705.0,2705.0
mean,38.331819,-90.761495,4162.07098,31268.403327,1839.810351,98.020407,4924.867588,427912.0
std,4.765679,10.53215,18279.682707,50129.493374,8823.697408,31.179485,7082.753396,1105998.0
min,2.169424,-159.558768,0.0,0.0,0.0,24.1,0.0,0.0
25%,34.86289,-97.130594,0.0,486.0,0.0,73.4,113.454106,70.48132
50%,38.461697,-89.42019,0.0,6052.0,0.0,99.6,1557.078729,11156.84
75%,41.676581,-83.367686,0.0,41952.0,0.0,124.2,7343.262366,265304.9
max,48.831939,-68.299475,297817.0,319973.0,142457.0,161.8,50712.72179,11332400.0


In [None]:
# Dtypes, non-null counts and memory usage of df1
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2705 entries, 0 to 2704
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Latitude          2705 non-null   float64
 1   Longitude         2705 non-null   float64
 2   cotton_area       2705 non-null   int64  
 3   Corn_area         2705 non-null   int64  
 4   SORGHUM_ACRES     2705 non-null   int64  
 5   precipitation_cm  2705 non-null   float64
 6   CO2               2705 non-null   float64
 7   N2O               2705 non-null   float64
dtypes: float64(5), int64(3)
memory usage: 169.2 KB


In [None]:
# Optional filter, currently disabled: it would remove every row whose CO2 value is 0.
# NOTE(review): confirm whether zero-CO2 rows are meant to stay in the modelling data.
#df1 = df1.drop(df1[df1['CO2'] == 0].index)

In [None]:
# (rows, columns) of df1; the zero-CO2 filter is commented out, so no rows were dropped
df1.shape

(2705, 8)

In [None]:
# Split df1 column-wise: six predictor columns go to X, the two target columns to y.
feature_columns = [
    'Latitude', 'Longitude',
    'cotton_area', 'Corn_area', 'SORGHUM_ACRES',
    'precipitation_cm',
]
target_columns = ['CO2', 'N2O']

X = df1[feature_columns]
y = df1[target_columns]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Inspect the training features; row labels are the original df1 index values, in shuffled order
X_train

Unnamed: 0,Latitude,Longitude,cotton_area,Corn_area,SORGHUM_ACRES,precipitation_cm
979,35.126946,-84.530352,0,729,0,137.6
1283,37.131210,-81.115411,0,0,0,112.5
1356,22.050467,-159.558768,0,876,0,161.8
748,35.924738,-81.171632,0,958,180,127.9
2692,43.029334,-108.577255,0,1067,0,32.8
...,...,...,...,...,...,...
1638,40.739987,-94.254747,0,44842,0,86.4
1095,34.519657,-100.206766,36586,0,1875,73.4
1130,28.267875,-98.101923,1696,4714,5272,73.4
1294,36.711747,-82.589305,0,281,0,112.5


In [None]:
# Perform necessary transformations
# Scaling numerical features (currently disabled)
# NOTE(review): 'precipitation_cm_x' is not a column of the loaded frame (the column is
# named 'precipitation_cm'); fix the name before re-enabling these lines.
#numeric_features = ['cotton_area', 'Corn_area', 'SORGHUM_ACRES', 'precipitation_cm_x']
#numeric_transformer = MinMaxScaler()

In [None]:
# One-hot encoding categorical features (currently disabled)
# NOTE(review): 'Ag District' exists in df but is not among the columns selected into X,
# so it would have to be added to X before this can be re-enabled.
#categorical_features = ['Ag District']
#categorical_transformer = OneHotEncoder(drop='first')


In [None]:
# Apply transformations using ColumnTransformer (currently disabled).
# Kept as real comments rather than a triple-quoted string: a bare string literal
# is an expression, so the notebook echoed it back as the cell's output.
#preprocessor = ColumnTransformer(
#    transformers=[
#        #('num', numeric_transformer, numeric_features),
#        #('cat', categorical_transformer, categorical_features)
#    ])

"preprocessor = ColumnTransformer(\n    transformers=[\n        #('num', numeric_transformer, numeric_features),\n        #('cat', categorical_transformer, categorical_features)\n    ])"

In [None]:
# Create the first model: a decision-tree regressor (not a linear regression).
# random_state is fixed because the tree permutes the features at every split, so
# ties between equally good splits could otherwise be broken differently on each
# run, changing the fitted tree and the reported score.
model = DecisionTreeRegressor(random_state=42)


In [None]:
# Fit and transform the data (currently disabled, so the models receive the raw features)
# NOTE(review): if this is re-enabled, the test set should go through
# preprocessor.transform(...), not fit_transform(...), so that it is transformed
# with the parameters learned from the training set.
#X_transformed_train = preprocessor.fit_transform(X_train)
#X_transformed_test =  preprocessor.fit_transform(X_test)

In [None]:
# Fit the tree on the training split. The features are used as-is: the
# preprocessing steps in the earlier cells are commented out.
model.fit(X_train, y_train)


In [None]:
# Predict both target columns (CO2 and N2O) for the held-out test features
y_pred = model.predict(X_test)


In [None]:
# Show the predictions as a labelled table: reuse y_test's column names and row
# index so each predicted row lines up with the matching row of y_test.
pd.DataFrame(y_pred, columns=y_test.columns, index=y_test.index)

Unnamed: 0,0,1
0,1308.960302,4.899467e+03
1,13594.164130,1.016195e+06
2,2354.001226,3.261145e+04
3,0.000000,0.000000e+00
4,289.451099,4.293746e+02
...,...,...
536,2.712517,7.703845e-02
537,0.000000,0.000000e+00
538,0.000000,0.000000e+00
539,0.000000,0.000000e+00


In [None]:
# Actual target values of the test rows, for comparison with the predictions
y_test

Unnamed: 0,CO2,N2O
1106,995.307327,4.491043e+03
439,14911.401990,1.029209e+06
801,4209.657035,4.053353e+04
296,0.000000,0.000000e+00
1718,273.140419,4.414306e+02
...,...,...
1190,2.712517,7.703845e-02
2369,0.000000,0.000000e+00
2543,0.000000,0.000000e+00
1904,0.000000,0.000000e+00


In [None]:
# R^2 (coefficient of determination) of the tree on the test split;
# scikit-learn averages it uniformly over the two target columns.
model.score(X_test, y_test)

0.9762417789170243

In [None]:
# Second model for comparison: ordinary least-squares linear regression
model2 = LinearRegression()

In [None]:
# Fit the linear model on the same training split used for the tree
model2.fit(X_train,y_train)

In [None]:
# Predict both target columns for the test features with the linear model
y_pred2 = model2.predict(X_test)

In [None]:
# R^2 of the linear model on the same test split, directly comparable with the tree's score
model2.score(X_test,y_test)

0.892506257268632