# 🥕 Vegetable Price Prediction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import  r2_score
from sklearn.ensemble import RandomForestRegressor


In [None]:
# Read the vegetable market CSV into a DataFrame.
# NOTE(review): absolute '/content/...' path — presumably the Colab working directory; adjust when running elsewhere.
df=pd.read_csv('/content/Vegetable_market.csv')
# Show the loaded frame as the cell output
df

Unnamed: 0,Vegetable,Season,Month,Temp,Deasaster Happen in last 3month,Vegetable condition,Price per kg
0,potato,winter,jan,15,no,fresh,20
1,tomato,winter,jan,15,no,fresh,50
2,peas,winter,jan,15,no,fresh,70
3,pumkin,winter,jan,15,no,fresh,25
4,cucumber,winter,jan,15,no,fresh,20
...,...,...,...,...,...,...,...
116,brinjal,winter,jan,15,yes,fresh,33
117,ginger,winter,jan,15,no,fresh,88
118,potato,summer,apr,32,no,fresh,24
119,peas,summer,apr,33,no,fresh,33


In [None]:
df.info() # column names, non-null counts, dtypes and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 7 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Vegetable                        121 non-null    object
 1   Season                           121 non-null    object
 2   Month                            121 non-null    object
 3   Temp                             121 non-null    int64 
 4   Deasaster Happen in last 3month  121 non-null    object
 5   Vegetable condition              121 non-null    object
 6   Price per kg                     121 non-null    int64 
dtypes: int64(2), object(5)
memory usage: 6.7+ KB


The `describe()` method in pandas generates descriptive statistics (count, mean, standard deviation, minimum, quartiles and maximum) for the numeric columns of a DataFrame.

In [None]:
# Summary statistics for the numeric columns
df.describe()


Unnamed: 0,Temp,Price per kg
count,121.0,121.0
mean,24.892562,55.330579
std,9.319157,48.769934
min,15.0,9.0
25%,15.0,22.0
50%,27.0,35.0
75%,32.0,70.0
max,43.0,250.0


In [None]:
# Count the missing values in every column
df.isnull().sum()

Unnamed: 0,0
Vegetable,0
Season,0
Month,0
Temp,0
Deasaster Happen in last 3month,0
Vegetable condition,0
Price per kg,0


## **Encoding**
Encoding is used in machine learning to transform categorical data, like 'Vegetable' or 'Season', into a numerical format.

**One-hot encoding**
 One-hot encoding creates binary (0 or 1) columns for each category, representing the presence or absence of that category for each data point.

In [None]:
# One-hot encode the categorical variables
categorical_columns = ['Vegetable', 'Season','Month','Deasaster Happen in last 3month','Vegetable condition']

# 'scarp' is a misspelling of 'scrap'. Merge the two spellings on a copy first,
# otherwise the same condition is encoded as two separate dummy columns.
df_clean = df.copy()
df_clean['Vegetable condition'] = df_clean['Vegetable condition'].replace({'scarp': 'scrap'})

df_encoded = pd.get_dummies(df_clean, columns=categorical_columns)
df_encoded

Unnamed: 0,Temp,Price per kg,Vegetable_Bitter gourd,Vegetable_Raddish,Vegetable_brinjal,Vegetable_cabage,Vegetable_califlower,Vegetable_chilly,Vegetable_cucumber,Vegetable_garlic,...,Month_march,Month_may,Month_oct,Month_sept,Deasaster Happen in last 3month_no,Deasaster Happen in last 3month_yes,Vegetable condition_avarage,Vegetable condition_fresh,Vegetable condition_scarp,Vegetable condition_scrap
0,15,20,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
1,15,50,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
2,15,70,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
3,15,25,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
4,15,20,False,False,False,False,False,False,True,False,...,False,False,False,False,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,15,33,False,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
117,15,88,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
118,32,24,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
119,33,33,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False


In [None]:
# Separate the encoded frame into the target variable and the feature matrix
y = df_encoded['Price per kg']
X = df_encoded.drop(columns=['Price per kg'])

In [None]:
# Feature matrix: every encoded column except 'Price per kg'
X

Unnamed: 0,Temp,Vegetable_Bitter gourd,Vegetable_Raddish,Vegetable_brinjal,Vegetable_cabage,Vegetable_califlower,Vegetable_chilly,Vegetable_cucumber,Vegetable_garlic,Vegetable_ginger,...,Month_march,Month_may,Month_oct,Month_sept,Deasaster Happen in last 3month_no,Deasaster Happen in last 3month_yes,Vegetable condition_avarage,Vegetable condition_fresh,Vegetable condition_scarp,Vegetable condition_scrap
0,15,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
1,15,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
2,15,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
3,15,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
4,15,False,False,False,False,False,False,True,False,False,...,False,False,False,False,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,15,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
117,15,False,False,False,False,False,False,False,False,True,...,False,False,False,False,True,False,False,True,False,False
118,32,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
119,33,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False


In [None]:
# Target vector: the 'Price per kg' column
y

Unnamed: 0,Price per kg
0,20
1,50
2,70
3,25
4,20
...,...
116,33
117,88
118,24
119,33


In [None]:
# train_test_split lives in scikit-learn's model_selection module
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the shape of the full feature matrix and of each split, one per line
print(
    f"Shape of X: {X.shape}",
    f"Shape of X_train: {X_train.shape}",
    f"Shape of X_test: {X_test.shape}",
    f"Shape of y_train: {y_train.shape}",
    f"Shape of y_test: {y_test.shape}",
    sep="\n",
)


Shape of X: (121, 40)
Shape of X_train: (96, 40)
Shape of X_test: (25, 40)
Shape of y_train: (96,)
Shape of y_test: (25,)




```
# This is formatted as code
```

# **Linear Regression model**

In [None]:
# Linear regression estimator from scikit-learn
from sklearn.linear_model import LinearRegression

# Build the estimator and train it on the training split in one step
# (fit returns the fitted estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Display the fitted estimator as the cell output
model

In [None]:
# Predict prices for the held-out test rows with the fitted linear model
y_pred=model.predict(X_test)
# Show the predicted values
y_pred

array([ 15.        ,  29.84942584, -16.63696518,   9.19222759,
        69.74843486,  51.35414072,  39.67448162, 126.        ,
        17.65057416,  37.06527996,  46.74759044,  30.62620478,
        98.31402504, 190.        ,  46.74759044,  46.74759044,
       126.        ,  51.45947495,  59.84536546,  15.        ,
        82.20871937,  52.81939278,  78.12137854, 175.96782279,
        39.15057416])

In [None]:
# R-squared of the linear-regression predictions on the test set
r2=r2_score(y_test,y_pred)
print("R-squared:",r2)

R-squared: 0.8111948781567268


## **Mean Squared Error**

In [None]:
# Compare the number of rows in the features and the target
print(X.shape)
print(y.shape)

# if X has extra rows, remove them:
# (keeps at most len(y) leading rows; leaves X unchanged when the lengths already match)
X=X[:len(y)]

# or, if y has extra rows:
# (same idea in the other direction, using the possibly shortened X)
y=y[:len(X)]

#Now split again:
# Same test_size and random_state as the first split, so with unchanged X and y
# this reproduces the same train/test partition.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

(121, 40)
(121,)


In [None]:
# Mean squared error of the linear-regression predictions on the test set
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 582.6151470721677


# **Support Vector Machines (SVM)**

In [None]:
#first import the svm module from sklearn
from sklearn import svm

In [None]:
# Build a support vector regressor with scikit-learn's default settings
model = svm.SVR()

# Train it on the training split; the fitted estimator is the cell output
model.fit(X_train, y_train)

In [None]:
# Predict prices for the test rows with the fitted SVR model
y_pre=model.predict(X_test)
# Show the predicted values
y_pre

array([37.99054829, 35.53479504, 34.90252847, 34.88386961, 38.26349849,
       37.69969792, 34.90150138, 34.91232847, 34.87021596, 36.62267491,
       38.28867806, 38.29796687, 34.97468584, 39.71758347, 38.28867806,
       38.28867806, 34.91232847, 34.90281017, 34.91740806, 37.99054829,
       39.72808008, 40.91059183, 35.12128033, 37.78256285, 34.90717799])

In [None]:
# R-squared of the SVR predictions on the test set
r21 = r2_score(y_test, y_pre)

# Print the SVR score (r21). `r2` still holds the linear-regression score at
# this point, so printing it would repeat the earlier result.
print("R-squared:", r21)

R-squared: 0.8111948781567268


In [None]:
# Mean squared error of the SVR predictions on the test set
mse = mean_squared_error(y_true=y_test, y_pred=y_pre)
print("Mean Squared Error:", mse)

Mean Squared Error: 3430.1351511834755


# **Random Forest Regressor**

In [None]:
#first import random forest from sklearn
from sklearn.ensemble import RandomForestRegressor


In [None]:
# Initialize and train the Random Forest Regressor
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Predict on the test set with the fitted random forest.
# Note: this overwrites the linear-regression predictions previously stored in y_pred.
y_pred = model.predict(X_test)
y_pred

array([ 30.61530952,  29.80533333,  23.68540476,  21.01842857,
        34.09291667,  34.3575    ,  31.6787619 , 118.94811905,
        18.257     ,  29.48142857,  24.17603571,  40.23966667,
        54.0267381 , 154.82142857,  24.17603571,  24.17603571,
       118.94811905,  23.80209524,  63.57138095,  30.61530952,
        86.49845238,  51.86066667,  21.98454762, 180.15035714,
        40.52133333])

In [None]:
# R-squared of the random-forest predictions on the test set
# (reuses the name `r2`, replacing the linear-regression score)
r2 = r2_score(y_test, y_pred)
r2


0.9119699118457764

## **Mean Squared Error**


In [None]:
# Mean squared error of the random-forest predictions on the test set
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 271.6433868744444


**Preprocessing**

In [None]:
def onehot_encode(df, column):
    """Return a new frame in which `column` is replaced by its dummy columns.

    The dummy columns are named '<column>_<value>' and are appended after the
    remaining original columns. The input frame is left untouched.
    """
    indicator_columns = pd.get_dummies(df[column], prefix=column)
    remaining_columns = df.drop(columns=column)
    return pd.concat([remaining_columns, indicator_columns], axis=1)

In [None]:
def preprocess_inputs(df):
    """Clean and encode the raw vegetable-market frame.

    Steps, in order:
      1. Merge the misspelled 'scarp' condition into 'scrap'.
      2. Map the disaster flag 'no'/'yes' to 0/1.
      3. Map month names to month numbers; a blank month (' ') becomes
         missing and is then filled with the most frequent month. Month
         values that are not listed in the mapping are left unchanged.
      4. One-hot encode 'Vegetable', 'Season' and 'Vegetable condition'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'Vegetable', 'Season', 'Month',
        'Deasaster Happen in last 3month' and 'Vegetable condition'.
        It is not modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        The cleaned and encoded copy.
    """
    df = df.copy()

    # Clean Vegetable condition column
    df['Vegetable condition'] = df['Vegetable condition'].replace({'scarp': 'scrap'})

    # Binary encoding
    df['Deasaster Happen in last 3month'] = df['Deasaster Happen in last 3month'].replace({'no': 0, 'yes': 1})

    # Ordinal encoding (np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0)
    df['Month'] = df['Month'].replace({
        'jan': 1,
        'apr': 4,
        'july': 7,
        'sept': 9,
        'oct': 10,
        'dec': 12,
        'may': 5,
        'aug': 8,
        'june': 6,
        ' ': np.nan,
        'march': 3
    })

    # Fill missing month values with column mode
    df['Month'] = df['Month'].fillna(df['Month'].mode()[0])

    # One-hot encoding: feed each result back into `df` so every column in the
    # list ends up encoded, not just the last one.
    for column in ['Vegetable', 'Season', 'Vegetable condition']:
        df = onehot_encode(df, column)

    # Hand the processed frame back to the caller
    return df



In [None]:
# Separate the predictors from the target column.
# NOTE(review): this works on the raw `df`, so its text columns are still
# unencoded here — confirm whether the output of preprocess_inputs was meant
# to be used instead.
X = df.drop(columns='Price per kg')
y = df['Price per kg']

# 70/30 shuffled split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    shuffle=True,
    random_state=1,
)


# **Conclusion**

In [None]:
# Linear Regression - Mean Squared Error: 582.6151470721677
# Support Vector Regressor (SVR) - Mean Squared Error: 3430.1351511834755
# Random Forest Regressor - Mean Squared Error: 271.6433868744444
# The Random Forest Regressor has the lowest error of the three models.