<h2>Required Libraries :-</h2>

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
import os

In [22]:
df = pd.read_csv('Data.csv')
df

Unnamed: 0,State,District,Market,Commodity,Variety,Grade,Arrival_Date,Min_x0020_Price,Max_x0020_Price,Modal_x0020_Price
0,Bihar,Araria,Forbesganj,Potato,Jyoti,FAQ,01/09/2024,2200,2700,2500.0
1,Bihar,Kishanganj,Bahadurganj,Potato,Jyoti,FAQ,01/09/2024,2500,2700,2600.0
2,Bihar,Madhubani,Jainagar,Onion,Medium,FAQ,01/09/2024,4200,4400,4300.0
3,Bihar,Rohtas,Natwar,Potato,Jyoti,FAQ,01/09/2024,2400,2800,2600.0
4,Chandigarh,Chandigarh,Chandigarh(Grain/Fruit),Ginger(Green),Other,FAQ,01/09/2024,3000,5000,4000.0
...,...,...,...,...,...,...,...,...,...,...
7934,West Bengal,Purba Bardhaman,Kalna,Green Chilli,Other,FAQ,01/09/2024,7800,8200,8000.0
7935,West Bengal,Purba Bardhaman,Kalna,Sweet Pumpkin,Other,FAQ,01/09/2024,2300,2500,2400.0
7936,West Bengal,Purba Bardhaman,Memari,Brinjal,Brinjal,FAQ,01/09/2024,4700,5200,5000.0
7937,West Bengal,Purba Bardhaman,Memari,Sweet Pumpkin,Sweet Pumpkin,FAQ,01/09/2024,2000,2200,2000.0


In [23]:
df['State'].value_counts().keys()

Index(['Tamil Nadu', 'Uttar Pradesh', 'Maharashtra', 'West Bengal',
       'Madhya Pradesh', 'Haryana', 'Punjab', 'Kerala', 'Telangana', 'Tripura',
       'Gujarat', 'Odisha', 'Himachal Pradesh', 'Rajasthan', 'Bihar',
       'Chandigarh', 'Uttrakhand', 'Karnataka', 'Chattisgarh',
       'Andhra Pradesh'],
      dtype='object', name='State')

In [24]:
list(df[df['State'] == 'Andhra Pradesh'].value_counts('Commodity').keys())

['Tomato']

In [25]:
df['Grade'] = df['Grade'].replace('FAQ', 'Other')
df.head(40)

Unnamed: 0,State,District,Market,Commodity,Variety,Grade,Arrival_Date,Min_x0020_Price,Max_x0020_Price,Modal_x0020_Price
0,Bihar,Araria,Forbesganj,Potato,Jyoti,Other,01/09/2024,2200,2700,2500.0
1,Bihar,Kishanganj,Bahadurganj,Potato,Jyoti,Other,01/09/2024,2500,2700,2600.0
2,Bihar,Madhubani,Jainagar,Onion,Medium,Other,01/09/2024,4200,4400,4300.0
3,Bihar,Rohtas,Natwar,Potato,Jyoti,Other,01/09/2024,2400,2800,2600.0
4,Chandigarh,Chandigarh,Chandigarh(Grain/Fruit),Ginger(Green),Other,Other,01/09/2024,3000,5000,4000.0
5,Gujarat,Amreli,Damnagar,Coriander(Leaves),Coriander,Other,01/09/2024,4200,5050,4750.0
6,Gujarat,Chhota Udaipur,Bodeliu,Cotton,Shanker 6 (B) 30mm FIne,Other,01/09/2024,6800,7000,6900.0
7,Gujarat,Rajkot,Gondal(Veg.market Gondal),Apple,Apple,Medium,01/09/2024,6000,13000,9500.0
8,Gujarat,Rajkot,Gondal(Veg.market Gondal),Cucumbar(Kheera),Cucumbar,Other,01/09/2024,2000,3500,2750.0
9,Gujarat,Rajkot,Gondal(Veg.market Gondal),Green Chilli,Green Chilly,Other,01/09/2024,2000,5000,3500.0


In [26]:
# Feature columns and target variables
features = ['State', 'District', 'Market', 'Commodity', 'Variety', 'Grade']
target_min = 'Min_x0020_Price'
target_max = 'Max_x0020_Price'

In [27]:
# Splitting data into features and target
X = df[features]
y_min = df[target_min]
y_max = df[target_max]

In [28]:
# Categorical feature encoding
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)


In [29]:
# Train-test split
X_train, X_test, y_min_train, y_min_test, y_max_train, y_max_test = train_test_split(X_encoded, y_min, y_max, test_size=0.2, random_state=42)

In [30]:
# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=50, random_state=12)
}

# Train and evaluate models
for name, model in models.items():
    print(f"Training and evaluating {name}...")
    
    # Train the model
    model.fit(X_train, y_min_train)
    
    print("Training Score : ",model.score(X_train,y_min_train)*100)
    print("Testing Score : ",model.score(X_test,y_min_test)*100)

    print("Training Score : ",model.score(X_train,y_max_train)*100)
    print("Testing Score : ",model.score(X_test,y_max_test)*100)
    print()


Training and evaluating Linear Regression...
Training Score :  85.25133298665638
Testing Score :  74.02227881232675
Training Score :  82.28319718870603
Testing Score :  73.725990874503

Training and evaluating Decision Tree...
Training Score :  100.0
Testing Score :  70.50467511061204
Training Score :  94.00070377751321
Testing Score :  72.36514577219022

Training and evaluating Random Forest...
Training Score :  96.99054350639514
Testing Score :  74.65229853252066
Training Score :  91.75735957433315
Testing Score :  75.35243313252504



In [32]:
# Choose the best model based on evaluation metrics
best_model_name = 'Random Forest'  # Replace with the model name that performed best
best_model = models[best_model_name]

# Example: Making a prediction with new data
new_data = pd.DataFrame({
    'State': ['West Bengal'], 
    'District': ['Purba Bardhaman'], 
    'Market': ['Memari'], 
    'Commodity': ['Sweet Pumpkin'], 
    'Variety': ['Sweet Pumpkin'], 
    'Grade': ['Other'], 
})

encoded_data = encoder.transform(new_data)

# Predict min and max prices
predicted_min_price = best_model.predict(encoded_data)
predicted_max_price = best_model.predict(encoded_data)

print(f'Predicted Min Price: {predicted_min_price[0]}')
print(f'Predicted Max Price: {predicted_max_price[0]}')


Predicted Min Price: 2120.0
Predicted Max Price: 2120.0


In [33]:
os.makedirs('models',exist_ok=True)
joblib.dump(model,'./models/RandomForest.lb')
joblib.dump(encoder,'./models/Encoder.lb')

['./models/Encoder.lb']

In [34]:
df.to_csv('Data_change.csv')