### Step 1: Data Exploration and Preprocessing
### Load the dataset

In [18]:
import pandas as pd

# Load the dataset
data = pd.read_csv('apple_quality.csv')
# Check for non-numeric values in each column
for col in data.columns:
    if data[col].dtype == 'object':
        print(f"Unique values in column {col}:", data[col].unique())
# It seems like 'A_id' column contains non-numeric values, let's drop it
data.drop(columns=['A_id'], inplace=True)
# Check for missing values
print("Missing values:")
print(data.isnull().sum())
# Encode categorical variable 'Quality'
data['Quality'] = data['Quality'].map({'good': 1, 'bad': 0})
# Drop rows with missing values
data.dropna(inplace=True)


Unique values in column Acidity: ['-0.491590483' '-0.722809367' '2.621636473' ... '-2.229719806'
 '1.599796456' 'Created_by_Nidula_Elgiriyewithana']
Unique values in column Quality: ['good' 'bad' nan]
Missing values:
Size           1
Weight         1
Sweetness      1
Crunchiness    1
Juiciness      1
Ripeness       1
Acidity        0
Quality        1
dtype: int64


### Step 2: Split the Dataset

In [19]:
# Split the dataset into features (X) and target variable (y)
X = data.drop(columns=['Quality'])
y = data['Quality']


### Step 3: Model Selection
We'll try out multiple classification algorithms and tune their hyperparameters to find the best-performing model

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Initialize Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier(random_state=42)

# Train the models
rf_classifier.fit(X_train, y_train)
gb_classifier.fit(X_train, y_train)


In [27]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning for Random Forest Classifier
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=rf_param_grid, cv=3)
rf_grid_search.fit(X_train, y_train)

# Best parameters and best score for Random Forest Classifier
print("Best Parameters for Random Forest Classifier:", rf_grid_search.best_params_)
print("Best Score for Random Forest Classifier:", rf_grid_search.best_score_)

# Hyperparameter tuning for Gradient Boosting Classifier
gb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [3, 5, 7]
}

gb_grid_search = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid=gb_param_grid, cv=3)
gb_grid_search.fit(X_train, y_train)

# Best parameters and best score for Gradient Boosting Classifier
print("\nBest Parameters for Gradient Boosting Classifier:", gb_grid_search.best_params_)
print("Best Score for Gradient Boosting Classifier:", gb_grid_search.best_score_)


Best Parameters for Random Forest Classifier: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best Score for Random Forest Classifier: 0.8756281016778881

Best Parameters for Gradient Boosting Classifier: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
Best Score for Gradient Boosting Classifier: 0.8834402124570593


In [28]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Evaluate Random Forest Classifier
rf_best_model = RandomForestClassifier(max_depth=None, min_samples_split=2, n_estimators=100, random_state=42)
rf_best_model.fit(X_train, y_train)
rf_predictions = rf_best_model.predict(X_test)

print("Random Forest Classifier Metrics:")
print("Accuracy:", accuracy_score(y_test, rf_predictions))
print("Precision:", precision_score(y_test, rf_predictions))
print("Recall:", recall_score(y_test, rf_predictions))
print("F1 Score:", f1_score(y_test, rf_predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_predictions))

# Evaluate Gradient Boosting Classifier
gb_best_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_best_model.fit(X_train, y_train)
gb_predictions = gb_best_model.predict(X_test)

print("\nGradient Boosting Classifier Metrics:")
print("Accuracy:", accuracy_score(y_test, gb_predictions))
print("Precision:", precision_score(y_test, gb_predictions))
print("Recall:", recall_score(y_test, gb_predictions))
print("F1 Score:", f1_score(y_test, gb_predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_test, gb_predictions))


Random Forest Classifier Metrics:
Accuracy: 0.90125
Precision: 0.8902439024390244
Recall: 0.9147869674185464
F1 Score: 0.9023485784919655
Confusion Matrix:
[[356  45]
 [ 34 365]]

Gradient Boosting Classifier Metrics:
Accuracy: 0.87
Precision: 0.8696741854636592
Recall: 0.8696741854636592
F1 Score: 0.8696741854636592
Confusion Matrix:
[[349  52]
 [ 52 347]]
