In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
import statsmodels.api as sm
from sklearn.model_selection import GridSearchCV

In [None]:
df=pd.read_csv("avocado.csv")

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2,2015-12-13,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,3,2015-12-06,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany
4,4,2015-11-29,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany


In [None]:
df = df.drop('Date', axis=1)
df

Unnamed: 0.1,Unnamed: 0,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,1,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,3,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,conventional,2015,Albany
4,4,1.28,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18244,7,1.63,17074.83,2046.96,1529.20,0.00,13498.67,13066.82,431.85,0.0,organic,2018,WestTexNewMexico
18245,8,1.71,13888.04,1191.70,3431.50,0.00,9264.84,8940.04,324.80,0.0,organic,2018,WestTexNewMexico
18246,9,1.87,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.0,organic,2018,WestTexNewMexico
18247,10,1.93,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.0,organic,2018,WestTexNewMexico


In [None]:
dic=dict()
dic={
    "region":df["region"].unique(),
    "type":df["type"].unique()
}


In [None]:
dic

{'region': array(['Albany', 'Atlanta', 'BaltimoreWashington', 'Boise', 'Boston',
        'BuffaloRochester', 'California', 'Charlotte', 'Chicago',
        'CincinnatiDayton', 'Columbus', 'DallasFtWorth', 'Denver',
        'Detroit', 'GrandRapids', 'GreatLakes', 'HarrisburgScranton',
        'HartfordSpringfield', 'Houston', 'Indianapolis', 'Jacksonville',
        'LasVegas', 'LosAngeles', 'Louisville', 'MiamiFtLauderdale',
        'Midsouth', 'Nashville', 'NewOrleansMobile', 'NewYork',
        'Northeast', 'NorthernNewEngland', 'Orlando', 'Philadelphia',
        'PhoenixTucson', 'Pittsburgh', 'Plains', 'Portland',
        'RaleighGreensboro', 'RichmondNorfolk', 'Roanoke', 'Sacramento',
        'SanDiego', 'SanFrancisco', 'Seattle', 'SouthCarolina',
        'SouthCentral', 'Southeast', 'Spokane', 'StLouis', 'Syracuse',
        'Tampa', 'TotalUS', 'West', 'WestTexNewMexico'], dtype=object),
 'type': array(['conventional', 'organic'], dtype=object)}

In [None]:
region_dict = {
    'north_east': ['Albany', 'BaltimoreWashington', 'Boston', 'BuffaloRochester', 'HarrisburgScranton', 'HartfordSpringfield', 'NewYork', 'Northeast', 'NorthernNewEngland', 'Philadelphia', 'Pittsburgh', 'Syracuse'],
    'west': ['Boise', 'California', 'Denver', 'LasVegas', 'LosAngeles', 'PhoenixTucson', 'Portland', 'Sacramento', 'SanDiego', 'SanFrancisco', 'Seattle', 'Spokane', 'West', 'WestTexNewMexico'],
    'south': ['Atlanta', 'Charlotte', 'DallasFtWorth', 'Houston', 'Jacksonville', 'Louisville', 'MiamiFtLauderdale', 'Midsouth', 'Nashville', 'NewOrleansMobile', 'Orlando', 'RaleighGreensboro', 'RichmondNorfolk', 'Roanoke', 'SouthCarolina', 'SouthCentral', 'Southeast', 'Tampa']
}

df['is_north_east'] = df['region'].isin(region_dict['north_east'])
df['is_west'] = df['region'].isin(region_dict['west'])
df['is_south'] = df['region'].isin(region_dict['south'])

def assign_super_region(region):
    for super_region, regions_list in region_dict.items():
        if region in regions_list:
            return super_region
    return 'Other'

df['SuperRegion'] = df['region'].apply(assign_super_region)

In [None]:
data_for_df = []
for super_region, regions_list in region_dict.items():
    for region in regions_list:
        data_for_df.append({'SuperRegion': super_region, 'Region': region})
df2 = pd.DataFrame(data_for_df)

In [None]:
df2.shape

(44, 2)

In [None]:
df2

Unnamed: 0,SuperRegion,Region
0,north_east,Albany
1,north_east,BaltimoreWashington
2,north_east,Boston
3,north_east,BuffaloRochester
4,north_east,HarrisburgScranton
5,north_east,HartfordSpringfield
6,north_east,NewYork
7,north_east,Northeast
8,north_east,NorthernNewEngland
9,north_east,Philadelphia


In [None]:
df

Unnamed: 0.1,Unnamed: 0,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region,is_north_east,is_west,is_south,SuperRegion
0,0,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany,True,False,False,north_east
1,1,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany,True,False,False,north_east
2,2,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,conventional,2015,Albany,True,False,False,north_east
3,3,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,conventional,2015,Albany,True,False,False,north_east
4,4,1.28,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany,True,False,False,north_east
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18244,7,1.63,17074.83,2046.96,1529.20,0.00,13498.67,13066.82,431.85,0.0,organic,2018,WestTexNewMexico,False,True,False,west
18245,8,1.71,13888.04,1191.70,3431.50,0.00,9264.84,8940.04,324.80,0.0,organic,2018,WestTexNewMexico,False,True,False,west
18246,9,1.87,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.0,organic,2018,WestTexNewMexico,False,True,False,west
18247,10,1.93,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.0,organic,2018,WestTexNewMexico,False,True,False,west


In [None]:
df_pro=df.drop("region",axis=1)

In [None]:
df_pro

Unnamed: 0.1,Unnamed: 0,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,is_north_east,is_west,is_south,SuperRegion
0,0,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,True,False,False,north_east
1,1,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,True,False,False,north_east
2,2,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,conventional,2015,True,False,False,north_east
3,3,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,conventional,2015,True,False,False,north_east
4,4,1.28,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,True,False,False,north_east
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18244,7,1.63,17074.83,2046.96,1529.20,0.00,13498.67,13066.82,431.85,0.0,organic,2018,False,True,False,west
18245,8,1.71,13888.04,1191.70,3431.50,0.00,9264.84,8940.04,324.80,0.0,organic,2018,False,True,False,west
18246,9,1.87,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.0,organic,2018,False,True,False,west
18247,10,1.93,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.0,organic,2018,False,True,False,west


In [None]:

df_pro=pd.get_dummies(df_pro,columns=["type","SuperRegion"])

In [None]:
df_pro

Unnamed: 0.1,Unnamed: 0,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,year,is_north_east,is_west,is_south,type_conventional,type_organic,SuperRegion_Other,SuperRegion_north_east,SuperRegion_south,SuperRegion_west
0,0,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,2015,True,False,False,True,False,False,True,False,False
1,1,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,2015,True,False,False,True,False,False,True,False,False
2,2,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,2015,True,False,False,True,False,False,True,False,False
3,3,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,2015,True,False,False,True,False,False,True,False,False
4,4,1.28,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,2015,True,False,False,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18244,7,1.63,17074.83,2046.96,1529.20,0.00,13498.67,13066.82,431.85,0.0,2018,False,True,False,False,True,False,False,False,True
18245,8,1.71,13888.04,1191.70,3431.50,0.00,9264.84,8940.04,324.80,0.0,2018,False,True,False,False,True,False,False,False,True
18246,9,1.87,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.0,2018,False,True,False,False,True,False,False,False,True
18247,10,1.93,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.0,2018,False,True,False,False,True,False,False,False,True


In [None]:
df_pro=df_pro.rename(columns={"4046":"A","4225":"B","4770":"C"})

In [None]:
df_pro

Unnamed: 0.1,Unnamed: 0,AveragePrice,Total Volume,A,B,C,Total Bags,Small Bags,Large Bags,XLarge Bags,year,is_north_east,is_west,is_south,type_conventional,type_organic,SuperRegion_Other,SuperRegion_north_east,SuperRegion_south,SuperRegion_west
0,0,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,2015,True,False,False,True,False,False,True,False,False
1,1,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,2015,True,False,False,True,False,False,True,False,False
2,2,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,2015,True,False,False,True,False,False,True,False,False
3,3,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,2015,True,False,False,True,False,False,True,False,False
4,4,1.28,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,2015,True,False,False,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18244,7,1.63,17074.83,2046.96,1529.20,0.00,13498.67,13066.82,431.85,0.0,2018,False,True,False,False,True,False,False,False,True
18245,8,1.71,13888.04,1191.70,3431.50,0.00,9264.84,8940.04,324.80,0.0,2018,False,True,False,False,True,False,False,False,True
18246,9,1.87,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.0,2018,False,True,False,False,True,False,False,False,True
18247,10,1.93,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.0,2018,False,True,False,False,True,False,False,False,True


In [None]:
X = df_pro.drop(['AveragePrice','C','XLarge Bags',"year"], axis=1)
y = df_pro['AveragePrice']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X

Unnamed: 0.1,Unnamed: 0,Total Volume,A,B,Total Bags,Small Bags,Large Bags,is_north_east,is_west,is_south,type_conventional,type_organic,SuperRegion_Other,SuperRegion_north_east,SuperRegion_south,SuperRegion_west
0,0,64236.62,1036.74,54454.85,8696.87,8603.62,93.25,True,False,False,True,False,False,True,False,False
1,1,54876.98,674.28,44638.81,9505.56,9408.07,97.49,True,False,False,True,False,False,True,False,False
2,2,118220.22,794.70,109149.67,8145.35,8042.21,103.14,True,False,False,True,False,False,True,False,False
3,3,78992.15,1132.00,71976.41,5811.16,5677.40,133.76,True,False,False,True,False,False,True,False,False
4,4,51039.60,941.48,43838.39,6183.95,5986.26,197.69,True,False,False,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18244,7,17074.83,2046.96,1529.20,13498.67,13066.82,431.85,False,True,False,False,True,False,False,False,True
18245,8,13888.04,1191.70,3431.50,9264.84,8940.04,324.80,False,True,False,False,True,False,False,False,True
18246,9,13766.76,1191.92,2452.79,9394.11,9351.80,42.31,False,True,False,False,True,False,False,False,True
18247,10,16205.22,1527.63,2981.04,10969.54,10919.54,50.00,False,True,False,False,True,False,False,False,True


In [None]:
x_train_numeric = x_train.astype(int)
model = sm.OLS(y_train, x_train_numeric)

In [None]:
X_train=StandardScaler().fit_transform(x_train_numeric)
X_test=StandardScaler().fit_transform(x_test)
y_train=y_train

In [None]:
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,AveragePrice,R-squared:,0.454
Model:,OLS,Adj. R-squared:,0.453
Method:,Least Squares,F-statistic:,1100.0
Date:,"Thu, 19 Feb 2026",Prob (F-statistic):,0.0
Time:,10:59:07,Log-Likelihood:,-3040.8
No. Observations:,14599,AIC:,6106.0
Df Residuals:,14587,BIC:,6197.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Unnamed: 0,-0.0033,0.000,-20.660,0.000,-0.004,-0.003
Total Volume,-2.022e-07,5.36e-08,-3.774,0.000,-3.07e-07,-9.72e-08
A,1.491e-07,5.48e-08,2.721,0.007,4.17e-08,2.56e-07
B,2.507e-07,5.84e-08,4.291,0.000,1.36e-07,3.65e-07
Total Bags,1.832e-06,2.79e-07,6.566,0.000,1.29e-06,2.38e-06
Small Bags,-1.616e-06,2.65e-07,-6.099,0.000,-2.14e-06,-1.1e-06
Large Bags,-1.718e-06,2.56e-07,-6.716,0.000,-2.22e-06,-1.22e-06
is_north_east,0.4046,0.003,150.811,0.000,0.399,0.410
is_west,0.3217,0.002,131.573,0.000,0.317,0.327

0,1,2,3
Omnibus:,1275.139,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2415.48
Skew:,0.602,Prob(JB):,0.0
Kurtosis:,4.588,Cond. No.,1.84e+23


In [None]:
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100]
}

grid1=GridSearchCV(
    estimator=Lasso(),
    param_grid=param_grid
)

In [None]:
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100]
}

grid2=GridSearchCV(
    estimator=Ridge(),
    param_grid=param_grid
)

In [None]:
grid1.fit(X_train, y_train)

In [None]:
grid1.best_estimator_

In [None]:

test_score = grid1.score(X_test, y_test)
print("Test Score:", test_score)

Test Score: 0.4360332068212981


In [None]:
train_score = grid1.score(X_train, y_train)
print("Train Score:", train_score)

Train Score: 0.4515209494132568


In [None]:
grid2.fit(X_train,y_train)

In [None]:
grid2.best_estimator_

In [None]:
ridge_train_score = grid2.score(X_train, y_train)
print("Train Score:", ridge_train_score)

Train Score: 0.45350161996141236


In [None]:
ridge_test_score = grid2.score(X_test, y_test)
print("Test Score:", ridge_test_score)

Test Score: 0.4379026846211135


In [None]:
from sklearn.neural_network import MLPRegressor


In [None]:
ann = MLPRegressor(solver='adam', alpha=0.0001, batch_size=20, learning_rate='constant', max_iter=10000, random_state=111)

In [None]:
param_grid = {
    'hidden_layer_sizes': [(30,), (50,), (100,), (75, 30), (100, 50)],
    'activation': ['relu', 'tanh', 'logistic'],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'n_iter_no_change': [10, 20, 30]
}

In [None]:
grid_search = GridSearchCV(ann, param_grid, scoring='neg_root_mean_squared_error', cv=5)

In [None]:
grid_search.fit(X_train, y_train)



In [None]:
ann.fit(X_train, y_train)

In [None]:
ann.score(X_test,y_test)

0.5492059672435268

In [None]:
train_score = ann.score(X_train,y_train)
train_score

0.6198611087582602