In [2]:
import pandas as pd
import numpy as np
from IPython.display import display

In [4]:
import os

# Location of the noisy avocado-price CSV. The default is the original author's
# local path; set the AVOCADO_CSV environment variable to point at the file on
# another machine without editing the notebook.
file_path = os.environ.get(
    "AVOCADO_CSV",
    r'C:\Users\serik\OneDrive\Рабочий стол\machine learning\avocado\avocado_prices_with_noise.csv',
)
df = pd.read_csv(file_path)
display(df)

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,2015-12-27,1.33 dollars,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,2015-12-20,$1.35 average,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2015-12-13,$0.93 average,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,2015-12-06,$1.08 average,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,conventional,2015,Albany
4,2015-11-29,1.28 dollars,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18244,2018-02-04,1.63 dollars,17074.83,2046.96,1529.20,0.00,13498.67,13066.82,431.85,0.0,organic,2018,WestTexNewMexico
18245,2018-01-28,1.71 dollars,13888.04,1191.70,3431.50,0.00,9264.84,8940.04,324.80,0.0,organic,2018,WestTexNewMexico
18246,2018-01-21,1.87 dollars,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.0,organic,2018,WestTexNewMexico
18247,2018-01-14,$1.93 average,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.0,organic,2018,WestTexNewMexico


In [22]:
# Normalise column labels: lower-case them and replace spaces with underscores
# (e.g. "Total Volume" -> "total_volume").
df.columns = [column.lower().replace(" ", "_") for column in df.columns]

In [12]:
# Per-column count of missing values (displayed as the cell output).
df.isnull().sum()

date            0
averageprice    0
total_volume    0
4046            0
4225            0
4770            0
total_bags      0
small_bags      0
large_bags      0
xlarge_bags     0
type            0
year            0
region          0
dtype: int64

In [13]:
import re

In [14]:
# One capture group: a run of digits, optionally followed by a single ',' or '.'
# and further digits -- e.g. the "1.33" in "1.33 dollars" or the "1.35" in
# "$1.35 average".
pattern = r"(\d+[,.]?\d*)"

In [15]:
# Extract the numeric part of the noisy price strings and convert it to float.
# - `.astype(str)` keeps the cell re-runnable: once the column is already float,
#   calling `.str` on it directly would raise.
# - `expand=False` returns a Series (one capture group) rather than a DataFrame.
# - `pattern` accepts ',' as well as '.' as the separator, but `pd.to_numeric`
#   cannot parse ','; it is normalised to '.' so such values are converted
#   instead of being silently coerced to NaN (assumes ',' is a decimal mark).
df["averageprice"] = pd.to_numeric(
    df["averageprice"]
    .astype(str)
    .str.extract(pattern, expand=False)
    .str.replace(",", ".", regex=False),
    errors="coerce",
)
# Number of prices that could not be parsed (displayed as the cell output).
df["averageprice"].isna().sum()

0

In [25]:
# Row counts per date and per year, printed one after the other.
print(df['date'].value_counts(), df['year'].value_counts(), sep="\n")

2015-12-27    108
2017-12-24    108
2017-12-10    108
2017-12-03    108
2017-11-26    108
             ... 
2016-11-06    108
2018-01-07    108
2017-06-18    107
2017-06-25    107
2015-12-06    107
Name: date, Length: 169, dtype: int64
2017    5722
2016    5616
2015    5615
2018    1296
Name: year, dtype: int64


In [23]:
# Inspect column dtypes; the dtype-based feature selection further down depends on them.
df.dtypes

date             object
averageprice    float64
total_volume    float64
4046            float64
4225            float64
4770            float64
total_bags      float64
small_bags      float64
large_bags      float64
xlarge_bags     float64
type             object
year              int64
region           object
dtype: object

In [32]:
# Store `year` with object dtype so that dtype-based selection groups it with the
# categorical (object) columns instead of the int64/float64 numeric ones.
df['year'] = df['year'].astype(object)

In [24]:
# Target is the parsed average price; every remaining column is a feature.
y = df["averageprice"]
X = df.drop(columns=["averageprice"])

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
# Shuffled 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
    shuffle=True,
)

In [39]:
# Column names grouped by dtype: int64/float64 columns are treated as numeric,
# object columns as categorical.
numeric_features, categorical_features = (
    X_train.select_dtypes(include=dtypes).columns
    for dtypes in (['int64', 'float64'], ['object'])
)

In [41]:
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [42]:
# Columns routed to the one-hot encoder and to the ordinal encoder, respectively.
ohe_features, oe_features = ['date', 'year'], ['type', 'region']

In [43]:
# Single-step pipelines, one per feature group.
numeric_transformer = Pipeline(steps=[('scaler', RobustScaler())])

# One-hot step ignores categories that were not present when it was fitted.
categorical_transformer_ohe = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# NOTE(review): unlike the one-hot step, no `handle_unknown` is configured here --
# confirm that unseen `type`/`region` values cannot occur at transform time.
categorical_transformer_oe = Pipeline(
    steps=[('ordinal_encoder', OrdinalEncoder())]
)

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat_ohe', categorical_transformer_ohe, ohe_features),
        ('cat_oe', categorical_transformer_oe, oe_features),
    ]
)

In [70]:
# Fit the preprocessing on the training split only, then apply the already-fitted
# transformers to the validation split.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

In [75]:
# Dense copies of the preprocessed matrices (trailing underscore = dense version);
# these are what the Keras model is fitted and validated on.
X_train_preprocessed_, X_val_preprocessed_ = (
    X_train_preprocessed.toarray(),
    X_val_preprocessed.toarray(),
)

In [55]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.losses import MeanSquaredError
from sklearn.metrics import mean_squared_error

In [78]:
def create_model(input_shape, dropout_rate=0.2, weight_decay=0.0001):
    """Build the (uncompiled) fully connected regression network.

    Architecture: input -> Dense(128) -> Dense(64) -> Dense(32) -> Dense(1).
    Each hidden layer uses ReLU, an L2 kernel regulariser and is followed by
    dropout; the output layer is a single linear unit.

    Parameters
    ----------
    input_shape : int or tuple
        Shape of one sample without the batch dimension. A bare integer
        (e.g. ``X.shape[1]``) is interpreted as ``(input_shape,)``.
    dropout_rate : float, default 0.2
        Rate passed to every ``Dropout`` layer.
    weight_decay : float, default 0.0001
        L2 factor applied to the kernels of the hidden ``Dense`` layers.

    Returns
    -------
    Sequential
        The assembled model; the caller is responsible for compiling it.
    """
    # Accept a plain feature count as well as a shape tuple, so callers can
    # pass ``X.shape[1]`` directly.
    if isinstance(input_shape, (int, np.integer)):
        input_shape = (int(input_shape),)

    model = Sequential()
    model.add(layers.InputLayer(input_shape=input_shape))
    # Hidden stack: progressively narrower Dense layers, each followed by dropout.
    for units in (128, 64, 32):
        model.add(layers.Dense(units, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(weight_decay)))
        model.add(layers.Dropout(dropout_rate))
    model.add(layers.Dense(1, activation='linear'))
    return model

In [79]:
def scheduler(epoch, lr, decay_start=30, decay_factor=0.1):
    """Learning-rate schedule: keep the rate, then scale it down.

    Parameters
    ----------
    epoch : int
        Epoch index supplied by the caller; compared against ``decay_start``.
    lr : float
        Learning rate handed in by the caller.
    decay_start : int, default 30
        First epoch index for which the rate is scaled.
    decay_factor : float, default 0.1
        Multiplier applied to ``lr`` once ``epoch >= decay_start``.

    Returns
    -------
    float
        ``lr`` unchanged while ``epoch < decay_start``, otherwise
        ``lr * decay_factor``.

    Notes
    -----
    The factor is applied to whatever ``lr`` is passed in. If the caller feeds
    the previously returned value back in on every epoch, the reduction
    compounds from ``decay_start`` onwards rather than being a one-off step.
    """
    if epoch < decay_start:
        return lr
    return lr * decay_factor

In [80]:
# Number of columns in the dense training matrix = width of the network input.
input_shape = X_train_preprocessed_.shape[1]
model = create_model(input_shape)
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss=MeanSquaredError(),
    metrics=['mse'],
)

# Train on the dense matrices; the learning rate is driven by `scheduler`.
callbacks = [LearningRateScheduler(scheduler, verbose=1)]
history = model.fit(
    X_train_preprocessed_,
    y_train,
    validation_data=(X_val_preprocessed_, y_val),
    epochs=50,
    batch_size=32,
    callbacks=callbacks,
)


Epoch 1: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 1/50

Epoch 2: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 2/50

Epoch 3: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 3/50

Epoch 4: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 4/50

Epoch 5: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 5/50

Epoch 6: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 6/50

Epoch 7: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 7/50

Epoch 8: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 8/50

Epoch 9: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 9/50

Epoch 10: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 10/50

Epoch 11: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 1

In [81]:
# Predict on the dense validation matrix -- the same representation the network
# was trained and validated on (`X_val_preprocessed_`), not the sparse
# `X_val_preprocessed` returned by the preprocessor.
y_val_pred = model.predict(X_val_preprocessed_)

# Root-mean-squared error on the validation split.
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("RMSE for the validation set:", rmse)

RMSE for the validation set: 0.242923307317651


**XGBoost Regressor**

In [83]:
import xgboost as xgb

In [84]:
# XGBoost regressor created with no explicit hyperparameters; it is the base
# estimator handed to the grid search.
# NOTE(review): this rebinds `model`, which previously referred to the Keras network.
model = xgb.XGBRegressor()

In [87]:
# Hyperparameter grid for the search: 3 x 3 x 3 = 27 combinations.
params = dict(
    n_estimators=[100, 500, 1000],
    learning_rate=[0.1, 0.01, 0.001],
    max_depth=[3, 5, 7],
)

In [88]:
# `GridSearchCV` was never imported in this notebook, so the cell failed with a
# NameError on a fresh kernel; import it next to its only use.
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold cross-validated search over `params`, scored by negative
# RMSE (higher is better), using all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_preprocessed, y_train)

In [89]:
# Winning estimator and its hyperparameter combination from the grid search.
best_model, best_params = grid_search.best_estimator_, grid_search.best_params_

In [90]:
# Validation RMSE of the tuned XGBoost model.
y_val_pred = best_model.predict(X_val_preprocessed)
# Take the square root explicitly instead of passing `squared=False`: that
# keyword is deprecated/removed in newer scikit-learn releases, and this form
# matches how the neural network's RMSE is computed earlier in the notebook.
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("RMSE:", rmse)

RMSE: 0.1336073766317054
