In [30]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Loading the dataset
url = 'seeds_dataset1.txt'
names = ['area', 'perimeter', 'compactness', 'length_of_kernel', 'width_of_kernel', 'asymmetry_coefficient', 'length_of_kernel_groove', 'class']
seeds_df = pd.read_csv(url, names=names, header=None, delimiter='\t')

# Data preprocessing
print("Total missing Values : ", seeds_df.isna().sum())
X = seeds_df.iloc[:, :-1].values
y = seeds_df.iloc[:, -1].values
sc = StandardScaler()
X = sc.fit_transform(X)

# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Building and training the classification models
models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('Decision Tree', DecisionTreeClassifier()))
models.append(('Random Forest', RandomForestClassifier()))

results = []
names = []

for name, model in models:
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg = f"{name}: Mean accuracy - {cv_results.mean()}, Std - {cv_results.std()}"
    print(msg)

# Hyperparameter tuning for Decision Tree and Random Forest
param_grid_dt = {'criterion': ['gini', 'entropy'], 'max_depth': [2, 4, 6, 8, 10]}
param_grid_rf = {'n_estimators': [10, 50, 100], 'max_depth': [2, 4, 6, 8, 10]}

dt = DecisionTreeClassifier()
rf = RandomForestClassifier()

grid_dt = GridSearchCV(dt, param_grid_dt, cv=10, scoring='accuracy')
grid_rf = GridSearchCV(rf, param_grid_rf, cv=10, scoring='accuracy')

grid_dt.fit(X_train, y_train)
grid_rf.fit(X_train, y_train)

print("Best parameters for Decision Tree: ", grid_dt.best_params_)
print("Best parameters for Random Forest: ", grid_rf.best_params_)

# Predicting and evaluating the models on the test set
dt = DecisionTreeClassifier(**grid_dt.best_params_)
rf = RandomForestClassifier(**grid_rf.best_params_)

dt.fit(X_train, y_train)
rf.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)
y_pred_rf = rf.predict(X_test)

print("Accuracy score for Decision Tree: ", accuracy_score(y_test, y_pred_dt))
print("Confusion matrix for Decision Tree: \n", confusion_matrix(y_test, y_pred_dt))

print("Accuracy score for Random Forest: ", accuracy_score(y_test, y_pred_rf))
print("Confusion matrix for Random Forest: \n", confusion_matrix(y_test, y_pred_rf))


Total missing Values :  area                       0
perimeter                  0
compactness                0
length_of_kernel           0
width_of_kernel            0
asymmetry_coefficient      0
length_of_kernel_groove    0
class                      0
dtype: int64
Logistic Regression: Mean accuracy - 0.9047794117647058, Std - 0.06561135341716276
Decision Tree: Mean accuracy - 0.88125, Std - 0.11223837803291799
Random Forest: Mean accuracy - 0.9051470588235293, Std - 0.10275455773321249
Best parameters for Decision Tree:  {'criterion': 'entropy', 'max_depth': 4}
Best parameters for Random Forest:  {'max_depth': 4, 'n_estimators': 10}
Accuracy score for Decision Tree:  0.9523809523809523
Confusion matrix for Decision Tree: 
 [[10  0  1]
 [ 0 14  0]
 [ 1  0 16]]
Accuracy score for Random Forest:  0.8333333333333334
Confusion matrix for Random Forest: 
 [[ 9  0  2]
 [ 1 13  0]
 [ 4  0 13]]
