In [8]:
# Load the CSV and drop the columns that are not needed
# Prompt:
# read the file "avocado.csv",
# strip the columns: "Unnamed: 0, Total Volume, and Total Bags"
# and put it inside a dataframe named "df_avocado"

import pandas as pd

# Column labels to discard right after loading.
columns_to_drop = ['Unnamed: 0', 'Total Volume', 'Total Bags']

# Read and prune in a single chained expression. errors='ignore' means a
# listed label that is absent from the file is skipped instead of raising.
df_avocado = (
    pd.read_csv('avocado.csv')
    .drop(columns=columns_to_drop, errors='ignore')
)


In [9]:
# Rename columns
# Prompt:
# inside the dataframe df_avocado, rename the columns,
# respectively, "4046, 4225, 4770" to "Quality1, Quality2, and Quality3"
# please use the inplace parameter if possible to reuse the df

# Pair each original column label with its new name, position by position.
column_mapping = dict(zip(
    ('4046', '4225', '4770'),
    ('Quality1', 'Quality2', 'Quality3'),
))

# inplace=True mutates the existing frame instead of returning a new one.
df_avocado.rename(columns=column_mapping, inplace=True)

In [10]:
# Convert dates
# Prompt:
# Inside df_avocado, the column "Date" must be converted to a Python-usable "datetime" format

# Run the "Date" column through pd.to_datetime and store the parsed result
# back under the same column label.
df_avocado['Date'] = df_avocado['Date'].pipe(pd.to_datetime)

In [11]:

# Prompt:
# run through the rows of df_avocado,
# if a row is a duplicate, drop it.
# If a value is missing inside the row, put the row ID inside a var named "inconsistent_rows_array"
# If no row was inconsistent, display "Aucune valeur manquante!".
# Else, display the array of inconsistent rows preceded by "Index des lignes avec des valeurs manquantes :"

# Remove duplicated rows, mutating df_avocado directly. The index is not
# reset, so surviving rows keep their original labels.
df_avocado.drop_duplicates(inplace=True)

# Boolean mask: True for every row that holds at least one null value.
rows_with_missing = df_avocado.isna().any(axis=1)

# Index labels of the flagged rows, as a plain Python list.
inconsistent_rows_array = df_avocado.loc[rows_with_missing].index.tolist()

# A non-empty list is truthy, i.e. at least one row has a missing value.
if inconsistent_rows_array:
    print("Index des lignes avec des valeurs manquantes :")
    print(inconsistent_rows_array)
else:
    print("Aucune valeur manquante!")


Aucune valeur manquante!


In [12]:
# Separate the numeric and categorical columns
# Prompt:
# Create a new dataframe, from df_avocado, named df_avocado_num,
# containing the following columns: Quality1, Quality2, Quality3, Small Bags, Large Bags, XLarge Bags, year.
# Then, create a new dataframe, also from df_avocado, named df_avocado_cat, containing: type, region

# Column groups; both lists are reused by later cells.
categorical_columns = ['type', 'region']
numeric_columns = [
    'Quality1', 'Quality2', 'Quality3',
    'Small Bags', 'Large Bags', 'XLarge Bags',
    'year',
]

# .copy() gives each subset its own data, so later changes to either frame
# do not write through to df_avocado.
df_avocado_num = df_avocado.loc[:, numeric_columns].copy()
df_avocado_cat = df_avocado.loc[:, categorical_columns].copy()

In [13]:
# Normalize the data
# Prompt:
# Normalize the values inside df_avocado_num using StandardScaler; put the results inside df_avocado_num_scaled,
# make imports if necessary

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Learn the scaling statistics from df_avocado_num and apply them in one call.
scaled_values = scaler.fit_transform(df_avocado_num)

# Wrap the scaled values back into a DataFrame that reuses the original
# column labels and row index.
df_avocado_num_scaled = pd.DataFrame(
    scaled_values,
    index=df_avocado_num.index,
    columns=df_avocado_num.columns,
)

In [14]:
# Encode the categorical data
# Prompt:
# Use a OneHotEncoder to transform the categorical data into numerical data; use "drop first" and "handle unknown ignore" as parameters
# Display the rows 9123 to 9130 to verify the changes

from sklearn.preprocessing import OneHotEncoder
import numpy as np

# drop='first' omits one indicator column per feature; sparse_output=False
# asks for a dense result rather than a sparse matrix.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)

# Learn the categories, then encode the same frame.
encoder.fit(df_avocado_cat)
encoded_data = encoder.transform(df_avocado_cat)

# Rebuild a labelled DataFrame: generated indicator names as columns, the
# source frame's index as rows.
feature_names = encoder.get_feature_names_out(categorical_columns)
df_avocado_cat_scaled = pd.DataFrame(
    data=encoded_data,
    index=df_avocado_cat.index,
    columns=feature_names,
)

# iloc is positional and end-exclusive, so this shows positions 9123..9130.
rows_to_show = slice(9123, 9131)
print(df_avocado_cat_scaled.iloc[rows_to_show])

      type_organic  region_Atlanta  region_BaltimoreWashington  region_Boise  \
9123           0.0             0.0                         0.0           0.0   
9124           0.0             0.0                         0.0           0.0   
9125           0.0             0.0                         0.0           0.0   
9126           1.0             0.0                         0.0           0.0   
9127           1.0             0.0                         0.0           0.0   
9128           1.0             0.0                         0.0           0.0   
9129           1.0             0.0                         0.0           0.0   
9130           1.0             0.0                         0.0           0.0   

      region_Boston  region_BuffaloRochester  region_California  \
9123            0.0                      0.0                0.0   
9124            0.0                      0.0                0.0   
9125            0.0                      0.0                0.0   
9126       

In [15]:
# Merge the scaled dataframes
# Prompt:
# Using ColumnTransformer, encoder and scaler
# apply the required changes to the scaled dataframes, df_avocado_num_scaled and df_avocado_cat_scaled,
# to create a new dataframe called df_avocado_transformed.
# Make imports if necessary.

# NOTE(review): neither of the two imports below is used by this cell's own
# code; once executed they stay available to later cells in the session.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Build the final frame by placing the scaled numeric columns and the one-hot
# encoded columns side by side (axis=1). No ColumnTransformer is applied here;
# the two inputs were already transformed separately.
df_avocado_transformed = pd.concat([df_avocado_num_scaled, df_avocado_cat_scaled], axis=1)

In [16]:
# Build a pipeline
# Prompt:
# Create a new pipeline, including:
# preprocessing of data using ColumnTransformer,
# a prediction model based off XGBRegressor

# Pipeline is imported here explicitly so that this cell brings every name it
# uses into scope itself instead of relying on an import made by another cell.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor

# Per-column-group transformers.
numeric_transformer = StandardScaler()
# NOTE(review): per the scikit-learn docs, handle_unknown='ignore' encodes a
# category unseen during fit as all zeros; combined with drop='first' that is
# the same encoding as the dropped category - confirm this is acceptable.
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

# Route each column group (lists defined in an earlier cell) to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

# Preprocessing followed by the regressor, so fit/predict take the raw,
# untransformed feature columns.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor())
])

In [17]:
# Split the data
# Prompt:
# We want to predict the average price of avocados, using all available data.
# Use the df_avocado_transformed var exclusively, DO NOT use df_avocado
# Split the data with: 80% of data to train, 20% remaining data to test and evaluate the model.
# Target is AveragePrice.
# Features are all the other data. Explicitly define them.
# Use the train_test_split method from sklearn. random state of 1337.
# Make imports if necessary

from sklearn.model_selection import train_test_split

# NOTE(review): contrary to the prompt above, the split below reads from
# df_avocado, not df_avocado_transformed. The pipeline defined in this notebook
# has its own scaling/encoding step, and the numeric/categorical column lists
# that df_avocado_transformed is built from do not include 'AveragePrice' -
# presumably that is why the raw frame is used; confirm.
target = 'AveragePrice'
features = [*numeric_columns, *categorical_columns]

# Target series and feature matrix.
y = df_avocado[target]
X = df_avocado.loc[:, features]

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1337,
)

In [18]:
# Train the model
# Prompt:
# Train the pipeline model using the targets and feature X and y, including X_train, X_test, y_train, y_test
# Display the R² score
# Train the model

from sklearn.metrics import r2_score

# Fit on the training split, then predict that same split.
pipeline.fit(X_train, y_train)
y_train_pred = pipeline.predict(X_train)

# This R² is measured on the rows the model was fitted on, not on held-out data.
train_r2 = r2_score(y_true=y_train, y_pred=y_train_pred)
print(f"R² score on training data: {train_r2:.4f}")

R² score on training data: 0.9142


In [19]:
# Predictions and evaluation
# Prompt:
# Predict the AveragePrice from the whole test dataset
# Evaluate the model using 2 key metrics: RMSE and R²
# Make imports if necessary

from sklearn.metrics import mean_squared_error
import numpy as np

# Predict the held-out test split.
y_test_pred = pipeline.predict(X_test)

# R² on the test split (r2_score is already in scope from the training cell).
test_r2 = r2_score(y_true=y_test, y_pred=y_test_pred)

# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_test_pred))

print(f"RMSE: {rmse:.4f}")
print(f"R² score on test data: {test_r2:.4f}")

RMSE: 0.1610
R² score on test data: 0.8381


In [20]:
# Export the pipeline
# Prompt:
# Save the whole pipeline inside a .pkl file for future usage outside of this environment, no future training required.
# Make imports if necessary

import pickle

# Serialize the fitted pipeline (preprocessing + regressor) to disk in binary
# mode; the context manager closes the handle once the dump is done.
# Caution: a pickle file should only ever be loaded from a trusted source.
with open('avocado_price_predictor.pkl', mode='wb') as pkl_file:
    pickle.dump(pipeline, pkl_file)

print("Pipeline saved successfully as 'avocado_price_predictor.pkl'")

Pipeline saved successfully as 'avocado_price_predictor.pkl'
