<a href="https://colab.research.google.com/github/MahdiFaourr/MahdiFaourr/blob/main/apple_quality.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install opendatasets

In [None]:
# Import necessary libraries and functions
import opendatasets as od
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,precision_score,recall_score
import joblib

In [None]:
# Random seed=42
SEED=42

In [None]:
# Download the dataset from Kaggle
od.download("https://www.kaggle.com/datasets/nelgiriyewithana/apple-quality")

In [None]:
# Read the data in a pandas dataframe
data=pd.read_csv("/content/apple-quality/apple_quality.csv")
data.head()

In [4]:
# Shape of the data
data.shape

(4001, 9)

In [5]:
# Data info
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4001 entries, 0 to 4000
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   A_id         4000 non-null   float64
 1   Size         4000 non-null   float64
 2   Weight       4000 non-null   float64
 3   Sweetness    4000 non-null   float64
 4   Crunchiness  4000 non-null   float64
 5   Juiciness    4000 non-null   float64
 6   Ripeness     4000 non-null   float64
 7   Acidity      4001 non-null   object 
 8   Quality      4000 non-null   object 
dtypes: float64(7), object(2)
memory usage: 281.4+ KB


In [None]:
# Search for nulls
data.isnull().sum()

In [7]:
# Drop null values
data=data.dropna()

In [8]:
# Drop A_id feature
data=data.drop('A_id',axis=1)

In [None]:
# Recheck data shape again
data.shape

In [10]:
# Get the count of the unique values of the column 'Quality'
data['Quality'].value_counts()

good    2004
bad     1996
Name: Quality, dtype: int64

In [None]:
# Define a subdata
data_good_quality = data[data['Quality'] == "good"]

# Create a figure and an array of subplots with 4 rows and 2 columns
fig, axs = plt.subplots(4, 2, figsize=(15, 15))

# Flatten the axs array to simplify indexing
axs = axs.flatten()

# Loop through each column (excluding 'Quality')
for i, column in enumerate(data_good_quality.columns):
    # Plot data on the i-th subplot
    axs[i].hist(data_good_quality[column])

    ## Set title for the subplot
    axs[i].set_title(column)

    # Set x-axis label for the subplot
    axs[i].set_xlabel('Index')

    # Set y-axis label for the subplot
    axs[i].set_ylabel('Frequency')
    # Adjust layout to prevent overlap
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
data_bad_quality = data[data['Quality'] == "bad"]
fig, axs = plt.subplots(4, 2, figsize=(15, 15))
axs = axs.flatten()
for i, column in enumerate(data_bad_quality.columns):
    axs[i].hist(data_bad_quality[column])
    axs[i].set_title(column)
    axs[i].set_xlabel('Index')
    axs[i].set_ylabel('Frequency')
plt.tight_layout()
plt.show()

In [12]:
# Encode the classes in Quality column
encoder=LabelEncoder()
data['Quality']=encoder.fit_transform(data['Quality'])

In [None]:
# Calculate the correlation matrix
correlation_matrix = data.corr()

# Plot the correlation matrix using Seaborn
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()


In [13]:
# Split the data into training and testing parts
x=data.drop('Quality',axis=1)
y=data['Quality'].values
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [23]:
# Define a set of models
lr=LogisticRegression()
sv=SVC(kernel="rbf")
tree=DecisionTreeClassifier()
adab=AdaBoostClassifier()
forest=RandomForestClassifier()
models=[lr,sv,tree,forest,adab]
# Iterate over the models and find the model with best performance
for model in models:
  score=cross_validate(model,x_train,y_train,cv=3,scoring=['accuracy','precision','recall'])
  print(f"{model}:")
  print('accuracy%:',score.get('test_accuracy').mean()*100)
  print('precision%:',score.get('test_precision').mean()*100)
  print('recall%:',score.get('test_recall').mean()*100)
  print(''.join('-'for x in range(100)))

LogisticRegression():
accuracy%: 74.49940889719619
precision%: 73.63919444865785
recall%: 76.57320872274144
----------------------------------------------------------------------------------------------------
SVC():
accuracy%: 87.96902117244083
precision%: 87.84116268527681
recall%: 88.22429906542055
----------------------------------------------------------------------------------------------------
DecisionTreeClassifier():
accuracy%: 78.93745681022523
precision%: 79.43295116503454
recall%: 78.31775700934578
----------------------------------------------------------------------------------------------------
RandomForestClassifier():
accuracy%: 87.40643314442661
precision%: 86.6960625322113
recall%: 88.47352024922118
----------------------------------------------------------------------------------------------------
AdaBoostClassifier():
accuracy%: 75.53054187451976
precision%: 75.41757165302272
recall%: 75.95015576323988
----------------------------------------------------------------

In [24]:
# Define Parameter Grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1, 10],
}

# Instantiate Classifier
svc = SVC(kernel='rbf')

# Instantiate GridSearchCV
grid_search = GridSearchCV(svc, param_grid, cv=5)

# Fit GridSearchCV
grid_search.fit(x_train, y_train)

# Access Best Parameters and Best Score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Best Parameters: {'C': 10, 'gamma': 0.1}
Best Score: 0.9075000000000001


In [25]:
# Train the model svc with the optimal parameters
svc = SVC(kernel='rbf',C=10,gamma=0.1)
svc.fit(x_train,y_train)

In [26]:
# Check the performance of the fine tuned model
y_hat=svc.predict(x_test)
print("Accuracy%:",accuracy_score(y_test,y_hat))
print("Precision%:",precision_score(y_test,y_hat))
print("Recall%:",recall_score(y_test,y_hat))

Accuracy%: 0.9175
Precision%: 0.9172932330827067
Recall%: 0.9172932330827067


In [29]:
# Save the trained model to a file
joblib.dump(svc, 'apple_quality_svc_model.pkl')

['apple_quality_svc_model.pkl']

In [30]:
def quality_predictor(x,model):
  y_pred=model.predict([x])
  if y_pred>0.5:
    return "Good Apple"
  else:
    return "Bad Apple"

In [38]:
# Example usage 1
x=x_test.values[0]
print(y_test[0])
model=svc
print(quality_predictor(x,model))

1
Good Apple




In [40]:
# Example usage 2
x=x_test.values[90]
print(y_test[90])
model=svc
print(quality_predictor(x,model))

0
Bad Apple


