# **Data Loading and EDA**

## Countries: China, USA, Brazil, Indonesia
## Crops: Rice, Wheat, Corn, Soya Beans
## Year: 2000-2021


In [1]:
import pandas as pd

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths read below live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import warnings
# Suppress FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Crop_Production

In [4]:
# Load the crop-production CSV, discard the code/flag metadata columns, keep
# the years 2000-2021 and give the remaining columns descriptive names.
Crop_Production_df = (
    pd.read_csv("/content/drive/MyDrive/Capstone/Latest/Datasets/Crop_Production.csv")
    .drop(columns=['Domain Code', 'Domain', 'Area Code (M49)', 'Element Code',
                   'Item Code (CPC)', 'Year Code', 'Flag', 'Flag Description', 'Note'])
    .loc[lambda frame: frame['Year'].between(2000, 2021)]
    .rename(columns={'Element': 'Prod_type', 'Item': 'Crop_Name', 'Value': 'Crop_Production_Value',
                     'Unit': 'Crop_Production_Unit'})
)

Crop_Production_df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Capstone/Latest/Datasets/Crop_Production.csv'

In [None]:
# (rows, columns) remaining after the column drop and the 2000-2021 year filter.
Crop_Production_df.shape

In [None]:
# Distinct years present; should fall within 2000-2021 given the filter applied at load time.
unique_values = Crop_Production_df['Year'].unique()
print(unique_values)

In [None]:
# Number of missing values in each column of the production frame.
null_values_count = Crop_Production_df.isnull().sum()
print(null_values_count)

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Horizontal bars of the summed Crop_Production_Value per crop, coloured with the Dark2 palette.
crop_totals = Crop_Production_df.groupby('Crop_Name')['Crop_Production_Value'].sum()
ax = crop_totals.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
# Drop the top/right frame lines for a cleaner look.
ax.spines[['top', 'right']].set_visible(False)
ax.set_title("Crop Production Area by Crop")

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Vertical bars of the summed Crop_Production_Value per country.
country_totals = Crop_Production_df.groupby('Area')['Crop_Production_Value'].sum()
ax = country_totals.plot(kind='bar')
ax.spines[['top', 'right']].set_visible(False)
# Tilt the country names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=30, ha='right')
ax.set_title("Crop Production Area by Country")

In [None]:
# Total Crop_Production_Value for every (country, crop) pair.
country_crop_production = (
    Crop_Production_df
    .groupby(['Area', 'Crop_Name'])['Crop_Production_Value']
    .sum()
    .reset_index()
)

# Grouped bars: one cluster per country, one bar per crop, drawn in ascending order of value.
bars_sorted = country_crop_production.sort_values(by='Crop_Production_Value', ascending=True)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=bars_sorted, x='Area', y='Crop_Production_Value', hue='Crop_Name', ax=ax)
ax.set_title('Country-wise Crop Production Value')
ax.set_xlabel('Country')
ax.set_ylabel('Crop Production Value')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
# Legend is anchored just outside the right edge of the axes.
ax.legend(title='Crop', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

## Crop_Trade

In [None]:
# Load the crop-trade CSV, discard the code/flag metadata columns, keep the
# years 2000-2021 and rename the generic FAOSTAT-style columns.
Crop_Trade_df = (
    pd.read_csv("/content/drive/MyDrive/Capstone/Latest/Datasets/Crop_Trade.csv")
    .drop(columns=['Domain Code', 'Domain', 'Area Code (M49)', 'Element Code',
                   'Item Code (CPC)', 'Year Code', 'Flag', 'Flag Description', 'Note'])
    .loc[lambda frame: frame['Year'].between(2000, 2021)]
    .rename(columns={'Element': 'Trade_type', 'Item': 'Crop_Name',
                     'Value': 'Trade_Value', 'Unit': 'Trade_Unit'})
)

Crop_Trade_df.head()

In [None]:
# (rows, columns) remaining after the column drop and the 2000-2021 year filter.
Crop_Trade_df.shape

In [None]:
# Column names after the rename (Trade_type, Crop_Name, Trade_Value, Trade_Unit plus the untouched ones).
Crop_Trade_df.columns

In [None]:
# Distinct years present; should fall within 2000-2021 given the filter applied at load time.
unique_values = Crop_Trade_df['Year'].unique()
print(unique_values)

In [None]:
# Number of missing values in each column of the trade frame.
null_values_count = Crop_Trade_df.isnull().sum()
print(null_values_count)

In [None]:
# Trade_Value per year, one line per country (seaborn aggregates repeated
# years with its default estimator).
# `errorbar=None` replaces the `ci=None` argument that seaborn 0.12 deprecated;
# both switch off the confidence band.
sns.lineplot(data=Crop_Trade_df, x='Year', y='Trade_Value', hue='Area', errorbar=None)
plt.title('Overall Trade Value Over Years')
plt.xlabel('Year')
plt.ylabel('Trade Value')
plt.legend(title='Country')
plt.show()

In [None]:
# Total Trade_Value for every (country, trade type) pair.
# NOTE: the variable keeps its historical name even though it now holds trade
# totals, so that any later reference to it keeps working.
country_crop_production = Crop_Trade_df.groupby(['Area', 'Trade_type'])['Trade_Value'].sum().reset_index()

# Grouped bars: one cluster per country, one bar per trade type.
plt.figure(figsize=(12, 8))
sns.barplot(data=country_crop_production.sort_values(by='Trade_Value', ascending=True), x='Area', y='Trade_Value', hue='Trade_type')
plt.title('Country-wise Trade Value')
plt.xlabel('Country')
plt.ylabel('Trade Value')
# Labels are left horizontal (the previous rotation=360 was a full turn, i.e. no rotation).
plt.xticks(rotation=0, ha='center')
plt.tight_layout()
# The hue is Trade_type, so the legend is titled accordingly (it used to say 'Crop').
plt.legend(title='Trade Type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

# Keep only the export-quantity rows, then total Trade_Value per (country, crop).
is_export_quantity = Crop_Trade_df['Trade_type'] == 'Export Quantity'
export_df = Crop_Trade_df[is_export_quantity]
grouped_df = export_df.groupby(['Area', 'Crop_Name'])['Trade_Value'].sum().reset_index()

# Min-max scale the totals so that all bars share one comparable axis.
scaler = MinMaxScaler()
trade_totals_column = grouped_df['Trade_Value'].values.reshape(-1, 1)  # scaler expects a 2-D array
grouped_df['Trade_Value_Normalized'] = scaler.fit_transform(trade_totals_column)

# Grouped bars: one cluster per country, one bar per crop.
bars_sorted = grouped_df.sort_values(by='Trade_Value_Normalized', ascending=True)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=bars_sorted, x='Area', y='Trade_Value_Normalized', hue='Crop_Name', ax=ax)
ax.set_title('Normalized Trade Value for All Crop Names (Export Quantity)')
ax.set_xlabel('Area')
ax.set_ylabel('Normalized Trade Value')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')  # tilt labels for readability
fig.tight_layout()
plt.show()

## Emission_from_Crops

In [None]:
# Load the crop-emissions CSV, discard the code/flag/source metadata columns
# and rename the generic columns. No year filter is applied to this frame.
Emission_from_Crops_df = (
    pd.read_csv("/content/drive/MyDrive/Capstone/Latest/Datasets/Emission_from_Crops.csv")
    .drop(columns=['Domain Code', 'Domain', 'Area Code (M49)', 'Element Code', 'Source', 'Source Code',
                   'Item Code (CPC)', 'Year Code', 'Flag', 'Flag Description', 'Note'])
    .rename(columns={'Element': 'Emission_type', 'Item': 'Crop_Name',
                     'Value': 'Emission_Value', 'Unit': 'Emission_Unit'})
)

Emission_from_Crops_df.head()

In [None]:
# (rows, columns) of the emissions frame after the column drop.
Emission_from_Crops_df.shape

In [None]:
# Distinct years present (this frame is not year-filtered when it is loaded).
unique_values = Emission_from_Crops_df['Year'].unique()
print(unique_values)

In [None]:
# Number of missing values in each column of the emissions frame.
null_values_count = Emission_from_Crops_df.isnull().sum()
print(null_values_count)

## Land_Use

In [None]:
# Load the land-use CSV, discard the code/flag metadata columns (including
# Element) and rename the generic columns. No year filter is applied here.
Land_Use_df = (
    pd.read_csv("/content/drive/MyDrive/Capstone/Latest/Datasets/Land_Use.csv")
    .drop(columns=['Domain Code', 'Domain', 'Area Code (M49)', 'Element Code', 'Element',
                   'Item Code', 'Year Code', 'Flag', 'Flag Description', 'Note'])
    .rename(columns={'Item': 'Area_type', 'Value': 'Area_Value', 'Unit': 'Area_Unit'})
)

Land_Use_df.head()

In [None]:
# (rows, columns) of the land-use frame after the column drop.
Land_Use_df.shape

In [None]:
# Distinct years present (this frame is not year-filtered when it is loaded).
unique_values = Land_Use_df['Year'].unique()
print(unique_values)

In [None]:
# Number of missing values in each column of the land-use frame.
null_values_count = Land_Use_df.isnull().sum()
print(null_values_count)

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Sum only Area_Value per land-use category. Summing the whole frame (as this
# cell used to) also aggregated Year and the text columns, which adds a
# meaningless "sum of years" series to the chart or fails outright, depending
# on the pandas version. This also matches how the other per-category charts
# in this notebook select their value column before summing.
Land_Use_df.groupby('Area_type')['Area_Value'].sum().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.title('Distribution of Area Type')
plt.gca().spines[['top', 'right',]].set_visible(False)

## Pesticides_Use

In [None]:
# Load the pesticide-use CSV, discard the code/flag metadata columns (including
# Element) and rename the generic columns. No year filter is applied here.
Pesticides_Use_df = (
    pd.read_csv("/content/drive/MyDrive/Capstone/Latest/Datasets/Pesticides_Use.csv")
    .drop(columns=['Domain Code', 'Domain', 'Area Code (M49)', 'Element Code', 'Element',
                   'Item Code', 'Year Code', 'Flag', 'Flag Description', 'Note'])
    .rename(columns={'Item': 'Pesticide_Type', 'Value': 'Pesticide_Value', 'Unit': 'Pesticide_Unit'})
)

Pesticides_Use_df.head()

In [None]:
# (rows, columns) of the pesticide frame after the column drop.
Pesticides_Use_df.shape

In [None]:
# Distinct years present (this frame is not year-filtered when it is loaded).
unique_values = Pesticides_Use_df['Year'].unique()
print(unique_values)

In [None]:
# Number of missing values in each column of the pesticide frame.
null_values_count = Pesticides_Use_df.isnull().sum()
print(null_values_count)

In [None]:
# Importing necessary libraries
from matplotlib import pyplot as plt
import seaborn as sns

# Total Pesticide_Value per year, pivoted to one column per pesticide type.
df_grouped = Pesticides_Use_df.groupby(['Year', 'Pesticide_Type'])['Pesticide_Value'].sum().unstack()

# Stacked bars, one bar per year. The palette is set before the axes are
# created so that it takes effect, and the axes are passed to DataFrame.plot
# explicitly: without `ax=` pandas opens a figure of its own, which left the
# 10x6 figure created here empty and drew the chart at the default size.
sns.set_palette("Dark2")
fig, ax = plt.subplots(figsize=(10, 6))
df_grouped.plot(kind='bar', stacked=True, ax=ax)
plt.xlabel('Year')
plt.ylabel('Pesticides Value')
plt.title('Year vs Pesticides Value')
# The stacked segments are pesticide types (the legend used to be titled 'Nutrients').
plt.legend(title='Pesticide Type', bbox_to_anchor=(1, 1), loc='upper left')
plt.show()

In [None]:
# Importing necessary libraries
from matplotlib import pyplot as plt
import seaborn as sns

# Total Pesticide_Value per year, pivoted to one column per country.
df_grouped = Pesticides_Use_df.groupby(['Year', 'Area'])['Pesticide_Value'].sum().unstack()

# Stacked bars, one bar per year. The palette is set before the axes are
# created so that it takes effect, and the axes are passed to DataFrame.plot
# explicitly: without `ax=` pandas opens a figure of its own, which left the
# 10x6 figure created here empty and drew the chart at the default size.
sns.set_palette("Dark2")
fig, ax = plt.subplots(figsize=(10, 6))
df_grouped.plot(kind='bar', stacked=True, ax=ax)
plt.xlabel('Year')
plt.ylabel('Pesticides Value')
plt.title('Year vs Pesticides Value')
# The stacked segments are countries (the legend used to be titled 'Nutrients').
plt.legend(title='Country', bbox_to_anchor=(1, 1), loc='upper left')
plt.show()

In [None]:
# Importing necessary libraries
from matplotlib import pyplot as plt
import seaborn as sns

# Total Pesticide_Value per country.
df_grouped = Pesticides_Use_df.groupby('Area')['Pesticide_Value'].sum()

# Simple bar chart; the palette is set before the axes exist so it applies to them.
sns.set_palette("Dark2")
fig, ax = plt.subplots(figsize=(10, 6))
df_grouped.plot(kind='bar', ax=ax)
ax.set_xlabel('Area')
ax.set_ylabel('Pesticides Value')
ax.set_title('Country vs Pesticides Value')
ax.legend(title='Area', bbox_to_anchor=(1, 1), loc='upper left')
plt.setp(ax.get_xticklabels(), rotation=30, ha='right')  # tilt labels for readability
plt.show()

## Value_of_Agricultural_production

In [None]:
# Load the value-of-production CSV, discard the code/flag metadata columns,
# keep the years 2000-2021 and rename the generic columns. The Element column
# is kept as-is. The path previously started with a doubled slash
# ("//content/..."); it now uses the same single-slash root as every other
# dataset path in this notebook.
Value_of_Agricultural_production_df = (
    pd.read_csv("/content/drive/MyDrive/Capstone/Latest/Datasets/Value_of_Agricultural_Production.csv")
    .drop(columns=['Domain Code', 'Domain', 'Area Code (M49)', 'Element Code',
                   'Item Code (CPC)', 'Year Code', 'Flag', 'Flag Description'])
    .loc[lambda frame: frame['Year'].between(2000, 2021)]
    .rename(columns={'Item': 'Crop_Name', 'Value': 'Agri_Prod_Value', 'Unit': 'Agri_Prod_Unit'})
)
Value_of_Agricultural_production_df.head()

In [None]:
# (rows, columns) remaining after the column drop and the 2000-2021 year filter.
Value_of_Agricultural_production_df.shape

In [None]:
# Distinct years present; should fall within 2000-2021 given the filter applied at load time.
unique_values = Value_of_Agricultural_production_df['Year'].unique()
print(unique_values)

In [None]:
# Number of missing values in each column of the value-of-production frame.
null_values_count = Value_of_Agricultural_production_df.isnull().sum()
print(null_values_count)

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Horizontal bars of the summed Agri_Prod_Value per crop, coloured with the Dark2 palette.
value_by_crop = Value_of_Agricultural_production_df.groupby('Crop_Name')['Agri_Prod_Value'].sum()
ax = value_by_crop.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
ax.set_title("Agricultural Production Value by Crop")
ax.spines[['top', 'right']].set_visible(False)

## Merging the Data

In [None]:
# Inner-join the six cleaned frames one after another, starting from crop
# production. Crop-level frames join on country + year + crop; the country-level
# frames (land use, pesticides) join on country + year only.
# NOTE(review): if a frame holds several rows per join key (for example one row
# per Prod_type / Trade_type / Area_type / Pesticide_Type), each join multiplies
# rows - confirm the resulting row count is intended.
crop_level_keys = ['Area', 'Year', 'Crop_Name']
country_level_keys = ['Area', 'Year']
merge_plan = [
    (Crop_Trade_df, crop_level_keys),
    (Emission_from_Crops_df, crop_level_keys),
    (Value_of_Agricultural_production_df, crop_level_keys),
    (Land_Use_df, country_level_keys),
    (Pesticides_Use_df, country_level_keys),
]

merged_df = Crop_Production_df
for right_frame, join_keys in merge_plan:
    merged_df = pd.merge(merged_df, right_frame, on=join_keys, how='inner')

In [None]:
# Peek at the first two rows of the joined frame.
merged_df.head(2)

In [None]:
# Column names available after joining all six frames.
merged_df.columns

In [None]:
# Bar per year of Crop_Production_Value in the merged frame (seaborn's default aggregation).
fig, ax = plt.subplots(figsize=(13, 6))
sns.barplot(data=merged_df, x='Year', y='Crop_Production_Value', ax=ax)
ax.set_title('Crop Production Value by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Crop Production Value')
plt.show()


In [None]:
# Overlay two bar series on the same axes: production value (blue) first,
# then trade value (orange) drawn on top of it, per country.
fig, ax = plt.subplots(figsize=(12, 8))
overlay_series = [
    ('Crop_Production_Value', 'blue', 'Crop Production Value'),
    ('Trade_Value', 'orange', 'Trade Value'),
]
for value_column, bar_color, legend_label in overlay_series:
    sns.barplot(data=merged_df, x='Area', y=value_column, color=bar_color, label=legend_label, ax=ax)
ax.set_title('Crop Production Value vs. Trade Value by Area')
ax.set_xlabel('Area')
ax.set_ylabel('Value')
ax.legend()
plt.show()

In [None]:
# Violin plot of the Trade_Value distribution for each crop in the merged frame.
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(data=merged_df, x='Crop_Name', y='Trade_Value', ax=ax)
ax.set_title('Trade Value Distribution by Crop Type')
ax.set_xlabel('Crop Type')
ax.set_ylabel('Trade Value')
plt.show()


In [None]:
# Rebuild merged_df from the six source frames. This repeats the exact same
# five inner joins as the merge cell earlier in this section; the cells in
# between only read merged_df, so the result is unchanged.
merged_df = (
    Crop_Production_df
    .merge(Crop_Trade_df, on=['Area', 'Year', 'Crop_Name'], how='inner')
    .merge(Emission_from_Crops_df, on=['Area', 'Year', 'Crop_Name'], how='inner')
    .merge(Value_of_Agricultural_production_df, on=['Area', 'Year', 'Crop_Name'], how='inner')
    .merge(Land_Use_df, on=['Area', 'Year'], how='inner')
    .merge(Pesticides_Use_df, on=['Area', 'Year'], how='inner')
)

In [None]:
# Peek at the first two rows of the rebuilt joined frame.
merged_df.head(2)

In [None]:
# (rows, columns) of the joined frame.
merged_df.shape

In [None]:
# Column names available in the joined frame.
merged_df.columns

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Columns treated as numeric for the histograms here and the correlation matrix
# that follows. 'Element' and 'Area_type' were removed from this list: Area_type
# holds the land-use category names and Element is carried over unrenamed from
# the value-of-production file (presumably a text label - verify). hist()
# silently skips non-numeric columns, but DataFrame.corr() raises on them in
# pandas >= 2.0.
numerical_cols = ['Year', 'Crop_Production_Value', 'Trade_Value', 'Emission_Value',
                  'Agri_Prod_Value', 'Area_Value', 'Pesticide_Value']
merged_df[numerical_cols].hist(bins=20, figsize=(15, 10))
plt.show()

In [None]:
# Correlation matrix of the columns listed in numerical_cols.
# numeric_only=True makes corr() skip any non-numeric column in that list
# instead of raising, which is what pandas >= 2.0 does by default.
correlation_matrix = merged_df[numerical_cols].corr(numeric_only=True)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Line of Crop_Production_Value against Year for the merged frame.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=merged_df, x='Year', y='Crop_Production_Value', ax=ax)
ax.set_xlabel('Year')
ax.set_ylabel('Crop Production Value')
ax.set_title('Crop Production Value over Time')
plt.show()

## **Crop Production Prediction - Linear Regression**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Working frame: the five predictor columns followed by the target.
feature_names = ['Area', 'Year', 'Crop_Name', 'Agri_Prod_Unit', 'Area_Unit']
df = merged_df[feature_names + ['Crop_Production_Value']]

# One-hot encode the categorical predictors; Year stays numeric.
X = pd.get_dummies(df[feature_names])
y = df['Crop_Production_Value']  # target variable

# Random 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit ordinary least squares and predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# All three errors are reported relative to the mean of the held-out targets.
y_test_mean = np.mean(y_test)

# Mean Absolute Error, as a percentage of the mean target
mae = mean_absolute_error(y_test, y_pred)
mae_percentage = mae / y_test_mean * 100
print("Mean Absolute Error (MAE): {:.2f}%".format(mae_percentage))

# Mean Squared Error, as a percentage of the squared mean target
mse = mean_squared_error(y_test, y_pred)
mse_percentage = mse / y_test_mean ** 2 * 100
print("Mean Squared Error (MSE): {:.2f}%".format(mse_percentage))

# Root Mean Squared Error, as a percentage of the mean target
rmse = np.sqrt(mse)
rmse_percentage = rmse / y_test_mean * 100
print("Root Mean Squared Error (RMSE): {:.2f}%".format(rmse_percentage))

In [None]:
# Predict 2025 production for every crop present in the merged data.
#
# All categorical inputs are taken from merged_df so that each one-hot column
# generated below exists in the training matrix X. Previously `Area` was passed
# as the number 10000 although the model was trained on one-hot encoded country
# names, and the crop/unit labels were typed by hand; reindex() silently dropped
# or zero-filled whatever did not match, so the predictions could ignore the
# country (and possibly the crop) entirely.
crops = sorted(merged_df['Crop_Name'].unique())
example_area = merged_df['Area'].mode().iloc[0]          # most frequent country in the data
agri_unit = merged_df['Agri_Prod_Unit'].mode().iloc[0]   # most frequent unit labels
area_unit = merged_df['Area_Unit'].mode().iloc[0]
predictions = {}

for crop in crops:
    pred = [[example_area, 2025, crop, agri_unit, area_unit]]
    pred_df = pd.DataFrame(pred, columns=['Area', 'Year', 'Crop_Name', 'Agri_Prod_Unit', 'Area_Unit'])  # Create DataFrame with correct column names
    pred_df = pd.get_dummies(pred_df)  # One-hot encode example data

    # Fail loudly if a label was never seen in training rather than letting
    # reindex() discard it without notice.
    unseen_columns = pred_df.columns.difference(X.columns)
    if len(unseen_columns) > 0:
        raise ValueError(f"Prediction input has categories unknown to the model: {list(unseen_columns)}")

    # Align with the training columns; categories not set for this row become 0.
    pred_df = pred_df.reindex(columns=X.columns, fill_value=0)
    prediction = model.predict(pred_df)
    predictions[crop] = prediction[0]

print("Predicted Crop Production:")
print(f"(Area: {example_area}, Year: 2025)")
for crop, production in predictions.items():
    print(f"{crop}: {production:.2f}")

## **Trade Analysis - Random Forest**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Working frame: the three predictor columns followed by the target.
trade_features = ['Crop_Production_Value', 'Trade_type', 'Area']
rf_TA_df = merged_df[trade_features + ['Trade_Value']]

# One-hot encode the categorical predictors; Crop_Production_Value stays numeric.
X = pd.get_dummies(rf_TA_df[trade_features])
y = rf_TA_df['Trade_Value']

# Random 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree forest and predict the held-out rows.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# MAE as a percentage of the mean held-out target.
# (mean_absolute_error and np are expected to be in scope from an earlier cell.)
mae = mean_absolute_error(y_test, y_pred)
mae_percentage = (mae / np.mean(y_test)) * 100
print("Mean Absolute Error (MAE): {:.2f}%".format(mae_percentage))

In [None]:
# Example input for a single prediction.
# Trade_type must be spelled exactly as in the training data: 'Export Quantity'
# is the label this notebook filters on when plotting exports. The previous
# value 'Export' produced a dummy column that does not exist in X, so reindex()
# discarded it and the prediction was made with no trade type set at all.
# NOTE(review): confirm 'China' matches the spelling used in the Area column.
new_data = [[50000, 'Export Quantity', 'China']]
new_data_df = pd.DataFrame(new_data, columns=['Crop_Production_Value', 'Trade_type', 'Area'])
new_data_encoded = pd.get_dummies(new_data_df)
# Align with the training columns; categories not set for this row become 0.
new_data_encoded = new_data_encoded.reindex(columns=X.columns, fill_value=0)
prediction = model.predict(new_data_encoded)
print("Predicted Trade Volume:", prediction)

## **Trade Analysis - Linear Regression**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Working frame: three predictors plus the Trade_Value target.
df = merged_df[['Crop_Production_Value', 'Trade_type', 'Area', 'Trade_Value']]

# Everything except the target is a predictor; categorical ones are one-hot encoded.
y = df['Trade_Value']
X = pd.get_dummies(df.drop(columns=['Trade_Value']))

# Random 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit ordinary least squares and predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# All three errors are reported relative to the mean of the held-out targets.
target_mean = np.mean(y_test)

# Mean Absolute Error, as a percentage of the mean target
mae = mean_absolute_error(y_test, y_pred)
mae_percentage = mae / target_mean * 100
print("Mean Absolute Error (MAE): {:.2f}%".format(mae_percentage))

# Mean Squared Error, as a percentage of the squared mean target
mse = mean_squared_error(y_test, y_pred)
mse_percentage = mse / target_mean ** 2 * 100
print("Mean Squared Error (MSE): {:.2f}%".format(mse_percentage))

# Root Mean Squared Error, as a percentage of the mean target
rmse = np.sqrt(mse)
rmse_percentage = rmse / target_mean * 100
print("Root Mean Squared Error (RMSE): {:.2f}%".format(rmse_percentage))

# Chaturya's Code

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
!pip install xgboost

In [None]:
# Model inputs: six categorical descriptors plus Year, Pesticide_Value and Area_Value.
feature_cols = ['Area', 'Prod_type', 'Crop_Name', 'Year', 'Emission_type', 'Area_type', 'Pesticide_Type', 'Pesticide_Value', 'Area_Value']
# Column the models below are trained to predict.
target_col = 'Crop_Production_Value'

In [None]:
# Chronological hold-out: rows up to and including 2018 are used for training,
# rows from later years for testing.
is_training_year = merged_df['Year'] <= 2018
is_testing_year = merged_df['Year'] > 2018
train_data = merged_df[is_training_year]
test_data = merged_df[is_testing_year]

In [None]:
# Separate predictors and target for each side of the chronological split.
X_train, y_train = train_data[feature_cols], train_data[target_col]
X_test, y_test = test_data[feature_cols], test_data[target_col]

In [None]:
# Numeric features are standardised with StandardScaler.
numerical_cols = ['Year', 'Area_Value', 'Pesticide_Value']
numerical_transformer = Pipeline([('scaler', StandardScaler())])

In [None]:
# Categorical features are one-hot encoded; handle_unknown='ignore' lets the
# encoder accept categories at predict time that it did not see during fit.
categorical_cols = ['Area', 'Prod_type', 'Crop_Name', 'Emission_type', 'Area_type', 'Pesticide_Type']
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [None]:
# Route each column group to its transformer: scaling for the numeric columns,
# one-hot encoding for the categorical ones.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Random Forest

In [None]:
# Base Random Forest regressor with library-default hyper-parameters; the seed
# is fixed so repeated runs give the same forest.
rf = RandomForestRegressor(random_state=42)

In [None]:
# Chain preprocessing and the forest so both are fitted together on the training data.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', rf),
])

In [None]:
# Hyper-parameter grid for the search. Keys carry the 'regressor__' prefix so
# they are routed to the pipeline step named 'regressor'.
# 2 x 4 x 3 x 3 = 72 parameter combinations in total.
param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 10, 20],
    'regressor__min_samples_leaf': [1, 2, 4]
}

In [None]:
# Exhaustive search over param_grid with 3-fold cross-validation, scored by
# negative mean squared error (higher is better); n_jobs=-1 requests all
# available cores and verbose=2 prints per-fit progress.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

In [None]:
# Run the grid search on the training split. This fits the pipeline once per
# parameter combination and CV fold, so it can take a long time.
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination. best_score_ is negated because the
# search was scored with neg_mean_squared_error, so the printed number is the
# mean cross-validated MSE.
print("Best parameters found: ", grid_search.best_params_)
print("Best score found: ", -grid_search.best_score_)

In [None]:
# Build a fresh, unfitted pipeline configured with the hyper-parameters the
# grid search actually selected. best_params_ uses the same
# 'regressor__<name>' keys as param_grid, so it can be passed straight to
# set_params(). The values used to be typed in by hand here, which could
# silently drift from the search result whenever the search was re-run.
rf_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
]).set_params(**grid_search.best_params_)

In [None]:
# Fit the preprocessing + Random Forest pipeline on the training split.
rf_model.fit(X_train, y_train)

In [None]:
# Predict the held-out years with the fitted Random Forest pipeline.
y_pred = rf_model.predict(X_test)

# RMSE on the test split, then expressed as a percentage of the mean actual
# value (normalised RMSE). mean_actual is kept as a separate variable so the
# same denominator can be reused for other models.
mean_actual = np.mean(y_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rf_nrmse_percent = rmse / mean_actual * 100

print(f"Normalized Root Mean Squared Error (Percentage): {rf_nrmse_percent:.2f}%")

In [None]:
# Show the raw test-set predictions held in y_pred.
print(y_pred)

In [None]:
# Mean Absolute Error of the Random Forest predictions, in the target's own units.
rf_mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {rf_mae:.2f}")

# Function to calculate Mean Absolute Percentage Error (MAPE)
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values, aligned with ``y_true``.

    Returns:
        ``mean(|y_true - y_pred| / |y_true|) * 100`` computed over the entries
        whose actual value is non-zero, or ``nan`` when there are none.
    """
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    # A percentage error is undefined where the actual value is 0; dividing
    # there yields inf/nan and poisons the whole mean, so those entries are skipped.
    nonzero = y_true != 0
    if not nonzero.any():
        return float('nan')
    return np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100

# MAPE of the Random Forest predictions on the test split, printed as a percentage.
rf_mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"Mean Absolute Percentage Error (MAPE): {rf_mape:.2f}%")

# Xgboost

In [None]:
import xgboost as xgb

In [None]:
# Gradient-boosted regressor (squared-error objective, 100 trees, depth 5,
# learning rate 0.1, fixed seed) behind the same preprocessing step.
xgb_regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb_regressor),
])

In [None]:
# Fit the preprocessing + XGBoost pipeline on the training split.
xgb_pipeline.fit(X_train, y_train)

In [None]:
# Predict the test split with the fitted XGBoost pipeline.
y_pred_xgb = xgb_pipeline.predict(X_test)

In [None]:
# Test-set RMSE for XGBoost.
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
print(f"XGBoost RMSE: {rmse_xgb}")

# Normalised RMSE: RMSE as a percentage of mean_actual, the mean of y_test
# computed in the Random Forest evaluation cell.
xgb_nrmse_percent = rmse_xgb / mean_actual * 100

print(f"Normalized Root Mean Squared Error (Percentage): {xgb_nrmse_percent:.2f}%")

In [None]:
# Absolute (MAE) and relative (MAPE) error of the XGBoost predictions on the test split.
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mape_xgb = mean_absolute_percentage_error(y_test, y_pred_xgb)

print(f"XGBoost MAE: {mae_xgb}")
print(f"XGBoost MAPE: {mape_xgb:.2f}%")