## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

## Importing the dataset over here

In [2]:
# Load the vegetable market price data; the relative path is resolved against
# the notebook's working directory.
data = pd.read_csv("Vegetable_market.csv")

In [3]:
# Preview the first rows to sanity-check that the file was parsed as expected.
data.head()

Unnamed: 0,Vegetable,Season,Month,Temp,Deasaster Happen in last 3month,Vegetable condition,Price per kg
0,potato,winter,jan,15,no,fresh,20
1,tomato,winter,jan,15,no,fresh,50
2,peas,winter,jan,15,no,fresh,70
3,pumkin,winter,jan,15,no,fresh,25
4,cucumber,winter,jan,15,no,fresh,20


## Taking care of duplicate observations over here

In [4]:
# Count rows that are exact duplicates of an earlier row.
data.duplicated().sum()

10

In [5]:
# Keep the first occurrence of every row and discard exact repeats.
# Rebinding the name (instead of inplace=True) keeps the step chainable;
# the original row labels are preserved, so the index is no longer contiguous.
data = data.drop_duplicates()

In [6]:
# Verify that no duplicated rows remain after the clean-up.
data.duplicated().sum()

0

## Taking care of missing values over here

In [7]:
# Count missing values per column.
data.isnull().sum()

Vegetable                          0
Season                             0
Month                              0
Temp                               0
Deasaster Happen in last 3month    0
Vegetable condition                0
Price per kg                       0
dtype: int64

## Filtering all the numerical features over here

In [10]:
# Collect the names of the numeric columns.
# is_numeric_dtype is used instead of comparing the dtype with "O": text
# columns are not guaranteed to be stored as object (pandas can infer a
# dedicated string dtype), and in that case `dtype != "O"` would wrongly
# classify every text column as numerical.
numerical_features=[feature for feature in data.columns if pd.api.types.is_numeric_dtype(data[feature])]
for feature in numerical_features:
  print(feature)

Temp
Price per kg


In [11]:
# Display only the numerical columns.
data[numerical_features]

Unnamed: 0,Temp,Price per kg
0,15,20
1,15,50
2,15,70
3,15,25
4,15,20
...,...,...
116,15,33
117,15,88
118,32,24
119,33,33


## Filtering all the categorical features over here

In [12]:
# Collect the names of the non-numeric (categorical) columns that need encoding.
# The check is "not numeric" rather than `dtype == "O"`: when pandas stores
# text in a dedicated string dtype instead of object, the equality test
# matches nothing, no column gets encoded, and model fitting later fails on
# raw strings.
cat_feature=[feature for feature in data.columns if not pd.api.types.is_numeric_dtype(data[feature])]
for feature in cat_feature:
  print(feature)

Vegetable
Season
Month
Deasaster Happen in last 3month
Vegetable condition


In [13]:
# Display only the categorical columns.
data[cat_feature]

Unnamed: 0,Vegetable,Season,Month,Deasaster Happen in last 3month,Vegetable condition
0,potato,winter,jan,no,fresh
1,tomato,winter,jan,no,fresh
2,peas,winter,jan,no,fresh
3,pumkin,winter,jan,no,fresh
4,cucumber,winter,jan,no,fresh
...,...,...,...,...,...
116,brinjal,winter,jan,yes,fresh
117,ginger,winter,jan,no,fresh
118,potato,summer,apr,no,fresh
119,peas,summer,apr,no,fresh


## Encoding the categorical features into numerical features over here

In [15]:
# Label-encode every categorical column: each category is replaced by an
# integer code assigned in order of first appearance in the column.
# The per-column mappings are kept in `feature_mappings` so the same codes can
# be applied to new rows at prediction time or inverted to read results back.
# NOTE(review): the codes impose an arbitrary order on nominal categories
# (vegetable, month, ...). Tree models cope with this, but it is a likely
# contributor to the weak linear-model scores further down; consider one-hot
# encoding for those models.
feature_mappings={}
for feature in cat_feature:
  feature_mapping={category:index for index,category in enumerate(data[feature].unique())}
  feature_mappings[feature]=feature_mapping
  data[feature]=data[feature].map(feature_mapping)

In [16]:
# Inspect the frame after encoding; every column should now hold numbers.
data

Unnamed: 0,Vegetable,Season,Month,Temp,Deasaster Happen in last 3month,Vegetable condition,Price per kg
0,0,0,0,15,0,0,20
1,1,0,0,15,0,0,50
2,2,0,0,15,0,0,70
3,3,0,0,15,0,0,25
4,4,0,0,15,0,0,20
...,...,...,...,...,...,...,...
116,14,0,0,15,1,0,33
117,15,0,0,15,0,0,88
118,0,1,1,32,0,0,24
119,2,1,1,33,0,0,33


## Creating the features and labels over here

In [17]:
# Every column except the last one is a predictor; the last column
# ("Price per kg") is the regression target. Both are converted to NumPy arrays.
X = data.iloc[:, :-1].to_numpy()
y = data.iloc[:, -1].to_numpy()

## Splitting the dataset into training and testing sets so that the model can be evaluated on data it has not seen

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training and comparing several regression models on the training set over here

In [22]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score


# Candidate regression algorithms to compare.
# The three ensemble models are randomised, so each gets a fixed random_state;
# without it the scores (and potentially the winner) change on every run.
regressors = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Random Forest": RandomForestRegressor(random_state=0),
    "Gradient Boosting": GradientBoostingRegressor(random_state=0),
    "AdaBoost": AdaBoostRegressor(random_state=0),
}

# Maps algorithm name -> {"Mean Squared Error": ..., "R^2 Score": ...},
# both measured on the held-out test set.
performance_metrics = {}

# Fit every candidate on the training split and score it on the test split.
for name, regressor in regressors.items():
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    performance_metrics[name] = {
        "Mean Squared Error": mse,
        "R^2 Score": r2
    }

# Pick the algorithm with the highest test-set R^2.
# NOTE(review): the winner is selected using the same test set that is used to
# report its score, so that score is optimistically biased. Consider selecting
# with cross-validation on the training split and keeping the test set for the
# final estimate only.
best_algorithm = max(performance_metrics, key=lambda k: performance_metrics[k]["R^2 Score"])

# Print the performance metrics
for name, metrics in performance_metrics.items():
    print(f"{name}: MSE = {metrics['Mean Squared Error']}, R^2 = {metrics['R^2 Score']}")

print(f"\nBest Algorithm: {best_algorithm} with R^2 Score = {performance_metrics[best_algorithm]['R^2 Score']}")

# The winning estimator was already fitted on the training split inside the
# loop above, so it is reused as-is; refitting would only repeat the work.
best_regressor = regressors[best_algorithm]
best_regressor


Linear Regression: MSE = 2248.5881096724984, R^2 = 0.025181228115395027
Ridge Regression: MSE = 2226.1797533942286, R^2 = 0.034895806900709814
Lasso Regression: MSE = 2221.2833738930944, R^2 = 0.03701850897826897
Random Forest: MSE = 648.1215235760087, R^2 = 0.7190232284309446
Gradient Boosting: MSE = 447.9571728754238, R^2 = 0.8057994439973618
AdaBoost: MSE = 780.3169377501629, R^2 = 0.661713234333006

Best Algorithm: Gradient Boosting with R^2 Score = 0.8057994439973618


In [23]:
# Fit the final model (gradient boosting, the best performer in the comparison).
# A fixed random_state makes the fit, and therefore the evaluation below,
# reproducible; unseeded, the test R^2 drifts between runs.
regressor=GradientBoostingRegressor(random_state=0)
regressor.fit(X_train,y_train)

## Evaluating the performance of the model on the testing dataset over here

In [24]:
# Predict prices for the held-out rows and print them next to the true prices
# (left column: predicted, right column: actual), shown with two decimals.
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
comparison = np.column_stack((y_pred, y_test))
print(comparison)

[[ 30.    24.  ]
 [106.57 150.  ]
 [ 18.56  21.  ]
 [ 38.98  70.  ]
 [ 29.41  25.  ]
 [ 59.28  55.  ]
 [ 52.02  50.  ]
 [ 27.05  35.  ]
 [ 34.81  70.  ]
 [221.61 170.  ]
 [ 46.66  45.  ]
 [ 36.55  50.  ]
 [ 19.46  15.  ]
 [ 59.28  75.  ]
 [ 46.66  50.  ]
 [ 39.29  70.  ]
 [ 74.36  80.  ]
 [106.57 132.  ]
 [ 14.04  24.  ]
 [197.23 190.  ]
 [ 16.54  21.  ]
 [ 27.33  35.  ]
 [102.08  70.  ]]


In [25]:
from sklearn.metrics import r2_score

# Coefficient of determination of the final model on the held-out test set.
r2_score(y_true=y_test, y_pred=y_pred)

0.8069841251966869