# Importing the Necessary Libraries and Model

In [2]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Sklearn processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Sklearn regression algorithms
from sklearn.ensemble import RandomForestRegressor

# Sklearn regression model evaluation function
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Loading and Final Tuning of Data

In [20]:
# Read the CSV dataset into a DataFrame.
# NOTE(review): the path literal is a blank placeholder — point it at the real file before running.
df = pd.read_csv(filepath_or_buffer=' ')

In [19]:
# Drop the unwanted columns.
# The frame is reassigned instead of mutated with inplace=True, and
# errors='ignore' skips labels that are already gone, so re-running this cell
# no longer raises KeyError after the columns were removed by an earlier run.
df = df.drop(
    columns=['Unnamed: 0', 'vat_number', 'nace_code', 'highLevel_NaceCode',
             'province', 'year'],
    errors='ignore',
)

In [8]:
# Positional split of the frame: every column except the last becomes the
# feature matrix X, the last column becomes the target vector y (both taken via .values).
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

In [9]:
# Hold out 20% of the rows as a test set; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Standardize the data

In [10]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same fitted scaler to both splits so the test data never influences the fit.
scal_transform = StandardScaler().fit(X_train)
X_train = scal_transform.transform(X_train)
X_test = scal_transform.transform(X_test)

# Running the model with the best parameters

In [18]:
# Create the model with best parameters.
# NOTE(review): max_features=1 is an integer, so each split considers exactly
# one feature (1.0 would mean all features) — confirm this is what tuning selected.
rf = RandomForestRegressor(
    n_estimators=800,
    min_samples_split=2,
    min_samples_leaf=1,
    max_depth=100,
    max_features=1,
    bootstrap=True,
    random_state=0,  # seed bootstrapping/feature sampling so the reported metrics are reproducible
)
# Fit the model on the (scaled) training split
rf.fit(X_train, y_train)

x_rf_train_predict = rf.predict(X_train)  # Predicted targets for the training set
x_rf_test_predict = rf.predict(X_test)  # Predicted targets for the test set

In [13]:
# Mean squared error of the forest's predictions on each split
rf_train_mse = mean_squared_error(y_train, x_rf_train_predict)
rf_test_mse = mean_squared_error(y_test, x_rf_test_predict)

# Coefficient of determination (R2) on each split
rf_train_r2 = r2_score(y_train, x_rf_train_predict)
rf_test_r2 = r2_score(y_test, x_rf_test_predict)

# Root mean squared error, derived from the MSE values above
rf_train_rmse, rf_test_rmse = np.sqrt(rf_train_mse), np.sqrt(rf_test_mse)

# Collect all six metrics into a single-row summary frame
rf_result = pd.DataFrame(
    {
        'Train_mse': [rf_train_mse],
        'Test_mse': [rf_test_mse],
        'Train_r2': [rf_train_r2],
        'Test_r2': [rf_test_r2],
        'Train_rmse': [rf_train_rmse],
        'Test_rmse': [rf_test_rmse],
    }
)