In [None]:
# Install AutoGluon into the running kernel's environment; the version is not pinned here.
%pip install autogluon

In [None]:
#%pip install --upgrade pip
#!pip install --upgrade pillow autogluon==0.6.0 duckdb https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
#%pip install --upgrade autogluon
#%pip install duckdb
#%pip install numba==0.53.1
#%pip install ydata-profiling==3.1.0
#%pip install --upgrade visions==0.7.0

In [None]:
# Pin numba to 0.56.0 (the import cell later sets numba.config.DISABLE_JIT = True).
# NOTE(review): presumably pinned for compatibility with the other packages — confirm it is still required.
%pip install numba==0.56.0

In [None]:
# Install matplotlib with --user (per-user site-packages), version not pinned.
# NOTE(review): most other installs in this notebook omit --user — confirm both locations are visible to the kernel.
%pip install matplotlib --user


In [None]:
# DuckDB is used in the Data Preparation section to run SQL over the label DataFrames (imported as `dd`).
%pip install duckdb

In [None]:
# Install PyTorch with --user, version not pinned; it is imported below as `th` and seeded in set_seed.
# NOTE(review): a later cell installs torch again without --user — confirm which copy the kernel picks up.
%pip install torch --user

In [None]:
# scikit-learn provides train_test_split, used for the dataset splits below; version not pinned.
%pip install scikit-learn

In [None]:
# ydata-profiling (pinned to 4.7.0) provides ProfileReport, used in the Data Analysis section.
%pip install ydata-profiling==4.7.0


In [None]:
# Install ipywidgets, version not pinned.
# NOTE(review): nothing in this notebook imports ipywidgets directly; presumably needed for widget/progress rendering — confirm.
%pip install ipywidgets

In [None]:
%pip install tensorflow-gpu
# or
%pip install torch torchvision torchaudio


In [None]:
import os
import numpy as np
import warnings
import pandas as pd
from IPython.display import display, Image
import matplotlib.pyplot as plt
import json
import torch as th
import random
import duckdb as dd
import uuid
import numba
numba.config.DISABLE_JIT = True

from sklearn.model_selection import train_test_split


# Auto Exploratory Data Analysis
from ydata_profiling import ProfileReport


# AutoML
import autogluon.core as ag
from autogluon.multimodal import MultiModalPredictor
from autogluon.multimodal.data.infer_types import infer_column_types
from autogluon.multimodal.utils.misc import shopee_dataset

# Evaluation
from sklearn import metrics

# Configuration
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Never elide DataFrame columns when a frame is displayed.
pd.set_option('display.max_columns', None)
# Render matplotlib figures inline in the cell output.
%matplotlib inline

# Seed
def set_seed(seed):
    """Seed PyTorch, NumPy's global RNG and Python's ``random`` module with ``seed``."""
    # Same order as before: torch first, then NumPy, then the stdlib generator.
    for seeder in (th.manual_seed, np.random.seed, random.seed):
        seeder(seed)


set_seed(123)

# Data Collection

In [None]:
# Dataset layout on disk: <sdir>/<quality folder>/<fruit folder>/<image files>.
# The three quality folders are derived from `sdir` so the root is defined once.
sdir= r'C:\Users\acer\Downloads\Processed Images_Fruits'
bad_path=os.path.join(sdir, 'Bad Quality_Fruits')
good_path=os.path.join(sdir, 'Good Quality_Fruits')
mixed_path=os.path.join(sdir, 'Mixed Qualit_Fruits')

file_paths=[]     # one entry per image file found
labels=[]         # label for the image at the same position in file_paths
ht=0              # summed height (shape[0]) of the images read for size statistics
wt=0              # summed width (shape[1]) of the images read for size statistics
samples=0         # number of images actually read for size statistics
sample_count=20   # at most this many images per fruit folder are read

for quality in [bad_path, good_path, mixed_path]:
    for fruit in os.listdir(quality):
        fruit_path=os.path.join(quality, fruit)
        # Skip stray files next to the fruit folders; os.listdir would fail on them.
        if not os.path.isdir(fruit_path):
            continue
        # Label is the lower-cased folder name; mixed-quality fruit gets a '_mixed' suffix.
        label=fruit.lower()
        if quality == mixed_path:
            label += '_mixed'
        for i, img_name in enumerate(os.listdir(fruit_path)):
            img_path=os.path.join(fruit_path, img_name)
            if i < sample_count:
                # Only the first `sample_count` images per folder are decoded.
                img=plt.imread(img_path)
                ht += img.shape[0]
                wt += img.shape[1]
                samples +=1
            file_paths.append(img_path)
            labels.append(label)

Fseries = pd.Series(file_paths, name='image')
Lseries = pd.Series(labels, name='label')
df=pd.concat([Fseries, Lseries], axis=1)

# Split df into train_df, test_df and valid_df, stratified by label.
trsplit=.9
vsplit=.05
# Fraction of the held-out rows passed as train_size of the second split:
# test_df receives this share of dummy_df and valid_df receives the rest.
dsplit =vsplit/(1-trsplit)
strat=df['label']
train_df, dummy_df=train_test_split(df, train_size=trsplit, shuffle=True, stratify=strat)
strat=dummy_df['label']
test_df, valid_df=train_test_split(dummy_df, train_size=dsplit, shuffle=True, stratify=strat)
print('train_df lenght: ', len(train_df), '  test_df length: ', len(test_df), '  valid_df length: ', len(valid_df))

classes=list(train_df['label'].unique())
class_count = len(classes)

# Per-class image counts are taken from the full df (all splits together).
groups=df.groupby('label')
class_sizes=groups.size()
print('{0:^30s} {1:^13s}'.format('CLASS', 'IMAGE COUNT'))
for label in classes:
    print('{0:^30s} {1:^13s}'.format(label, str(class_sizes[label])))

# Average over the images actually read. `samples` is no longer reused as a
# per-class counter, which previously made these averages divide by the size of
# the last class printed instead of the number of images read.
wave=wt/samples
have=ht/samples
aspect_ratio= have/wave
print ('Average Image Height: ' ,have, '  Average Image Width: ', wave, '  Aspect ratio: ', aspect_ratio)

# Data Preparation

In [None]:
def trim (df, max_size, min_size, column, random_state=None):
    """Cap large classes and drop small ones, returning a new DataFrame.

    For every distinct value of ``df[column]``:
      * more than ``max_size`` rows  -> a random sample of exactly ``max_size`` rows is kept;
      * between ``min_size`` and ``max_size`` rows (inclusive) -> all rows are kept;
      * fewer than ``min_size`` rows -> the class is dropped.

    Args:
        df: input frame; it is not modified.
        max_size: maximum number of rows kept per class.
        min_size: minimum number of rows a class needs in order to be kept.
        column: name of the class/label column.
        random_state: optional seed forwarded to ``DataFrame.sample``; ``None``
            draws from NumPy's global random state.

    Returns:
        A frame with a fresh 0..n-1 index. If no class qualifies, an empty frame
        with the same columns is returned instead of raising.

    Prints the original class count, a warning when classes were dropped, and the
    resulting per-class row counts.
    """
    df=df.copy()
    original_class_count= len(list(df[column].unique()))
    print ('Original Number of classes in dataframe: ', original_class_count)
    sample_list=[]
    # sort=False keeps classes in order of first appearance, as df[column].unique() does.
    for _, group in df.groupby(column, sort=False):
        sample_count=len(group)
        if sample_count> max_size :
            # Every row in `group` has the same label, so a plain random sample is
            # all that is needed; stratifying a single class has no effect.
            sample_list.append(group.sample(n=max_size, random_state=random_state))
        elif sample_count>= min_size:
            sample_list.append(group)
    if sample_list:
        df=pd.concat(sample_list, axis=0).reset_index(drop=True)
    else:
        # pd.concat raises on an empty list; return an empty frame with the same columns.
        df=df.iloc[0:0].reset_index(drop=True)
    final_class_count= len(list(df[column].unique()))
    if final_class_count != original_class_count:
        print ('*** WARNING***  dataframe has a reduced number of classes' )
    balance=list(df[column].value_counts())
    print (balance)
    return df

In [None]:
# Display the training split created in the Data Collection cell.
train_df

In [None]:
# Pool the train and test splits again before re-balancing.
# NOTE(review): valid_df is not included here — confirm that is intended.
dataset_concated = pd.concat([train_df, test_df])

# Data cleaning
# Number of rows per label (DuckDB reads the DataFrame by its variable name).
label_count = dd.query("""
select count(dc.label) as count from dataset_concated dc
group by dc.label
""").to_df()
display(label_count)

# Mean class size across all labels.
label_count_avg = dd.query("""
select avg(lc.count) as avg from label_count lc
""").to_df()
display(label_count_avg)

# Cap every class at the (truncated) mean class size. The scalar is read from the
# 'avg' column explicitly: int() on a one-element Series is deprecated in pandas.
max_samples = int(label_count_avg['avg'].iloc[0])
min_samples = 0
column = 'label'

dataset_trimmed = trim(dataset_concated, max_samples, min_samples, column)

# Rename the 'lemon_mixed' label to 'lime_mixed'.
# NOTE(review): presumably aligns the mixed-quality folder name with the other quality folders — confirm.
dataset_trimmed['label'] = dataset_trimmed['label'].replace('lemon_mixed', 'lime_mixed')

# dataset_cleaned is another name for the same frame, not a copy.
dataset_cleaned = dataset_trimmed
dataset_cleaned.label.unique()

In [None]:
# Random 80 % of the cleaned rows become the training set; every row whose index
# was not drawn forms the test set (original row order is kept).
train_data = dataset_cleaned.sample(frac=0.8)
held_out_rows = ~dataset_cleaned.index.isin(train_data.index)
test_data = dataset_cleaned.loc[held_out_rows]

# Data Analysis

In [None]:
import os
import pandas as pd

# Dataset root and its three quality folders.
sdir = r'C:\Users\acer\Downloads\Processed Images_Fruits'
bad_path = r'C:\Users\acer\Downloads\Processed Images_Fruits\Bad Quality_Fruits'
good_path = r'C:\Users\acer\Downloads\Processed Images_Fruits\Good Quality_Fruits'
mixed_path = r'C:\Users\acer\Downloads\Processed Images_Fruits\Mixed Qualit_Fruits'

# Each quality folder paired with the suffix appended to its fruit labels.
quality_dirs = (
    (bad_path, 'bad'),
    (good_path, 'good'),
    (mixed_path, 'mixed'),
)

file_paths = []  # full path of every image found
labels = []      # "<fruit folder, lower-cased>_<quality suffix>" for the matching path

for quality, quality_label in quality_dirs:
    for fruit in os.listdir(quality):
        fruit_path = os.path.join(quality, fruit)
        # Anything that is not a fruit sub-directory is ignored.
        if not os.path.isdir(fruit_path):
            continue
        fruit_label = f"{fruit.lower()}_{quality_label}"
        image_names = os.listdir(fruit_path)
        file_paths.extend(os.path.join(fruit_path, name) for name in image_names)
        labels.extend([fruit_label] * len(image_names))

# NOTE(review): this rebinds `dataset_cleaned`, which the Data Preparation cell
# set to the trimmed frame (column 'image'); here the column is 'file_path' and
# the labels carry a quality suffix. Re-running earlier cells after this one
# would pick up this frame instead.
dataset_cleaned = pd.DataFrame({'file_path': file_paths, 'label': labels})

# Quick look at the first rows.
print(dataset_cleaned.head())

# Exploratory data analysis
profile = ProfileReport(dataset_cleaned, explorative=True)
profile.to_notebook_iframe()


# Modeling

In [None]:
from sklearn.model_selection import train_test_split

# Train on a 40 % subset of train_data (60 % is set aside and discarded);
# the split itself is fixed by random_state=42.
train_data_subset, _ = train_test_split(train_data, random_state=42, test_size=0.6)

# Overrides passed to fit(): the image checkpoint name and the epoch cap.
fit_hyperparameters = {
    'model.timm_image.checkpoint_name': 'swin_base_patch4_window7_224',
    'optimization.max_epochs': 2,
}

predictor = MultiModalPredictor(label="label")
predictor.fit(
    train_data=train_data_subset,
    hyperparameters=fit_hyperparameters,
    time_limit=30 * 60,  # seconds, i.e. 30 minutes
)


In [None]:
# List the saved AutoGluon run folders that sit next to the predictor just trained.
# The folder is derived from predictor.path and listed with os.listdir, so the cell
# no longer depends on a Kaggle-only path or a Unix `ls` while the rest of the
# notebook runs against Windows paths. (To start fresh, delete `models_root`.)
models_root = os.path.dirname(os.path.abspath(predictor.path))
print(models_root)
sorted(os.listdir(models_root))

# Evaluation

In [None]:
    # Reload a previously trained predictor from a local run folder; this rebinds
    # `predictor`, replacing the object fitted in the Modeling section.
    # NOTE(review): both lines are indented although no block encloses them —
    # presumably tolerated by the notebook front end; confirm, or dedent the cell.
    from autogluon.multimodal import MultiModalPredictor
    predictor = MultiModalPredictor.load(r"c:\Users\acer\Downloads\AutogluonModels\ag-20241215_160144")

In [None]:
import pandas as pd

# Path of one image from the separate "Quality Dataset" test folder.
image_path = r"C:\Users\acer\Downloads\Quality Dataset\test\fresh\images-13-_jpeg.rf.548d3de3cfbe2bd64344b7bface3db6c.jpg"

# One-row frame whose only column, 'image', holds that path.
test_data = pd.DataFrame([image_path], columns=['image'])


In [None]:
# Evaluate the loaded predictor on a single labelled image; with one row the
# reported accuracy can only be 0 or 1.
rotten_image = r'C:\Users\acer\Downloads\Quality Dataset\test\rotten\download_jpeg.rf.a74308f010a0c793a3b1311052b8d7f2.jpg'
ground_truth = 'banana_bad_bad'  # Ground-truth label
test_data = pd.DataFrame({'image': [rotten_image], 'label': [ground_truth]})

scores = predictor.evaluate(test_data, metrics=["accuracy"])
top1_accuracy = scores["accuracy"]
print('Top-1 test acc: %.3f' % top1_accuracy)

In [None]:
import os
from IPython.display import Image, display

def predict_label(image_path):
    """Display an image and return the notebook-level predictor's label for it.

    Args:
        image_path: path to an image file on disk.

    Returns:
        ``"Predicted label: <label>"`` on success, or a string starting with
        ``"Error:"`` when ``image_path`` does not exist or is not a regular file.
        Problems are reported through the returned string, not by raising.

    Relies on the notebook-level ``predictor`` object.
    """
    # Validate the path before touching the image or the model.
    if not os.path.exists(image_path):
        return f"Error: The file '{image_path}' does not exist."
    # A directory also "exists"; reject it here rather than failing inside Image().
    if not os.path.isfile(image_path):
        return f"Error: '{image_path}' is not a file."

    # Show the image in the cell output.
    display(Image(filename=image_path))

    # Single-row input for the predictor. The local name deliberately differs
    # from the notebook-level `test_data` frame to avoid shadowing it.
    single_image_batch = {'image': [image_path]}

    prediction = predictor.predict(single_image_batch)
    return f"Predicted label: {prediction[0]}"

# Ask for an image path. Surrounding whitespace is removed, and so are enclosing
# double quotes, which appear when a path is pasted via Windows "Copy as path"
# and would otherwise make the existence check fail.
image_path = input("Enter the path to your image: ").strip().strip('"')

# Predict and display the label
result = predict_label(image_path)
print(result)

In [None]:
# Print the predictor's class labels, then its predicted probabilities for the
# image currently held in `image_path`.
print(predictor.class_labels)
single_image = {'image': [image_path]}
proba = predictor.predict_proba(single_image)
print(proba)