# Banana Quality prediction

## Libraries

In [19]:
# Data manipulation
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

## Data Processing

In [20]:
df=pd.read_csv(r"D:\ML\Git\Projects\ML-Models\Banana_Quality_Prediction\banana_quality_dataset.csv")
df

Unnamed: 0,sample_id,variety,region,quality_score,quality_category,ripeness_index,ripeness_category,sugar_content_brix,firmness_kgf,length_cm,weight_g,harvest_date,tree_age_years,altitude_m,rainfall_mm,soil_nitrogen_ppm
0,1,Manzano,Colombia,1.88,Processing,2.11,Turning,16.83,3.53,21.44,146.92,2023-10-16,13.7,58.2,2440.5,183.6
1,2,Plantain,Guatemala,2.42,Processing,4.25,Ripe,16.73,4.09,26.11,160.48,2023-10-14,5.1,280.2,2374.6,109.8
2,3,Burro,Ecuador,3.57,Premium,6.24,Overripe,21.34,1.63,25.20,225.27,2023-09-08,17.7,1246.9,1191.5,147.7
3,4,Manzano,Ecuador,2.21,Processing,5.39,Ripe,16.75,3.31,13.08,137.80,2023-10-07,13.0,1150.2,2845.1,92.8
4,5,Red Dacca,Ecuador,2.35,Processing,5.84,Ripe,16.90,3.07,12.98,227.84,2023-10-02,4.8,526.0,2136.9,129.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,Burro,Ecuador,3.50,Good,4.94,Ripe,21.06,2.49,27.95,196.35,2023-10-04,12.4,409.7,2029.9,173.0
996,997,Cavendish,Philippines,2.38,Processing,6.74,Overripe,16.10,2.00,12.39,165.55,2023-09-23,14.0,314.3,1330.7,180.5
997,998,Plantain,Ecuador,1.68,Processing,1.41,Green,17.17,2.28,16.56,210.08,2023-10-19,18.4,683.0,2955.4,189.9
998,999,Fehi,Guatemala,2.02,Processing,1.34,Green,17.03,2.88,26.31,162.50,2023-09-22,8.2,1362.7,1215.8,85.5


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   sample_id           1000 non-null   int64  
 1   variety             1000 non-null   object 
 2   region              1000 non-null   object 
 3   quality_score       1000 non-null   float64
 4   quality_category    1000 non-null   object 
 5   ripeness_index      1000 non-null   float64
 6   ripeness_category   1000 non-null   object 
 7   sugar_content_brix  1000 non-null   float64
 8   firmness_kgf        1000 non-null   float64
 9   length_cm           1000 non-null   float64
 10  weight_g            1000 non-null   float64
 11  harvest_date        1000 non-null   object 
 12  tree_age_years      1000 non-null   float64
 13  altitude_m          1000 non-null   float64
 14  rainfall_mm         1000 non-null   float64
 15  soil_nitrogen_ppm   1000 non-null   float64
dtypes: floa

### Dop columns

In [22]:
y=df['quality_score']


### Encode

In [23]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
col_encode=['variety','region']
for column in col_encode:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])

In [24]:
x=df.drop(['sample_id','quality_category','ripeness_index','harvest_date','ripeness_category'],axis=1)

## Data Split

In [25]:
x_train,x_test,y_train,y_test= train_test_split(x,y)
print(x_train.shape)
print(x_test.shape)

(750, 11)
(250, 11)


## Model

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'Bagging': BaggingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'Decision Tree': DecisionTreeRegressor(random_state=42)
}


for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Model: {name}: MAE: {mae:.2f}, MSE: {mse:.2f}, R2: {r2:.2f}")

Model: Linear Regression: MAE: 0.00, MSE: 0.00, R2: 1.00
Model: Random Forest: MAE: 0.00, MSE: 0.00, R2: 1.00
Model: Gradient Boosting: MAE: 0.00, MSE: 0.00, R2: 1.00
Model: Bagging: MAE: 0.00, MSE: 0.00, R2: 1.00
Model: XGBoost: MAE: 0.01, MSE: 0.00, R2: 1.00
Model: Decision Tree: MAE: 0.00, MSE: 0.00, R2: 1.00


In [28]:
from sklearn.model_selection import cross_val_score

# Perform k-fold cross-validation to validate model performance
for name, model in models.items():
    scores = cross_val_score(model, x, y, cv=5)
    print("Cross-validation scores:", scores)

Cross-validation scores: [1. 1. 1. 1. 1.]
Cross-validation scores: [0.99985022 0.99897402 0.99992406 0.99985723 0.9994856 ]
Cross-validation scores: [0.9998738  0.99922883 0.99989303 0.9998891  0.99961183]
Cross-validation scores: [0.99959257 0.99908668 0.99977275 0.99985728 0.99930576]
Cross-validation scores: [0.99964068 0.99903148 0.99972692 0.9997483  0.99939829]
Cross-validation scores: [0.99944092 0.99923668 0.99964992 0.99973844 0.99813955]
