# DRY BEAN DATA SET

Attribute Information:
- 1.Area (A): The area of a bean zone and the number of pixels within its boundaries.
- 2.Perimeter (P): Bean circumference is defined as the length of its border.
- 3.Major axis length (L): The distance between the ends of the longest line that can be drawn from a bean.
- 4.Minor axis length (l): The longest line that can be drawn from the bean while standing perpendicular to the main axis.
- 5.Aspect ratio (K): Defines the relationship between L and l.
- 6.Eccentricity (Ec): Eccentricity of the ellipse having the same moments as the region.
- 7.Convex area (C): Number of pixels in the smallest convex polygon that can contain the area of a bean seed.
- 8.Equivalent diameter (Ed): The diameter of a circle having the same area as a bean seed area.
- 9.Extent (Ex): The ratio of the pixels in the bounding box to the bean area.
- 10.Solidity (S): Also known as convexity. The ratio of the pixels in the convex shell to those found in beans.
- 11.Roundness (R): Calculated with the following formula: (4piA)/(P^2)
- 12.Compactness (CO): Measures the roundness of an object: Ed/L
- 13.ShapeFactor1 (SF1)
- 14.ShapeFactor2 (SF2)
- 15.ShapeFactor3 (SF3)
- 16.ShapeFactor4 (SF4)
- 17.Class (Seker, Barbunya, Bombay, Cali, Dermason, Horoz and Sira)

In [None]:
#!pip install scikit-elm

In [None]:
#importing all necessary libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from collections import Counter
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from yellowbrick.cluster import SilhouetteVisualizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from skelm import ELMClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
import time


In [None]:
# Load the raw Dry Bean dataset from a CSV file in the working directory
df = pd.read_csv('Dry_Bean_Data_Set.csv')

In [None]:
# Number of samples (rows) and attributes (columns) in the original dataset
df.shape

In [None]:
# Preview the first rows of the raw data
df.head()

In [None]:
# List the column names
df.columns

In [None]:
# Class distribution: number of samples for each variety of beans
df['Class'].value_counts()

In [None]:
# Share of each bean variety in the raw dataset, drawn as a pie chart
bean_names, bean_counts = np.unique(df['Class'], return_counts=True)
plt.figure(figsize=(8, 8))
plt.pie(bean_counts, labels=bean_names, autopct='%1.1f%%')
plt.title('Bean proportions')
plt.show()

In [None]:
# Print a summary of the raw frame's columns
df.info()

In [None]:
# Split the raw frame into the feature columns (X) and the class label (Y)
X = df.drop("Class", axis=1)
Y = df['Class']

# Visualizing the correlation between the features.
# The correlation is computed on X rather than df: df still contains the
# non-numeric 'Class' label column, which DataFrame.corr() cannot handle on
# recent pandas versions.
corr_matrix = X.corr()
sns.clustermap(corr_matrix, annot = True, fmt = ".2f")
plt.title("Correlation between features")
plt.show()

In [None]:
# Box plot of every attribute except the label, one per panel, to spot outliers
Num_columns = df.columns.drop('Class')

fig, ax = plt.subplots(nrows=4, ncols=4, figsize=(15, 20))

for subplot, variable in zip(ax.flat, Num_columns):
    sns.boxplot(y=df[variable], ax=subplot)
plt.tight_layout()

## Data Pre-processing
### Data Cleaning:
### 1. Check for missing/null values

In [None]:
# Count the missing (null) values in each column
df.isnull().sum()

**The result above shows that the dataset does not have any missing values.**

In [None]:
df.describe(percentiles=[0.15,0.25,0.50,0.75,0.90]).T

In [None]:
df.corr()

In [None]:
# Histograms (30 bins each) of the columns of df, to visualise how the
# values of every attribute are distributed.

df.hist(bins=30, figsize=(15,15))
plt.show()

### 2. Removing outliers with the IQR method (values more than 1.5 × IQR below Q1 or above Q3 are flagged; rows flagged in more than one feature are treated as outliers/noise and removed)

In [None]:
# Columns that are screened for outliers
features = [
    'Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength',
    'AspectRation', 'Eccentricity', 'ConvexArea', 'EquivDiameter',
    'Extent', 'Solidity', 'roundness', 'Compactness',
    'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4',
]

# Tally, per row index, how many features flag that row as an outlier
Outlier_indices = Counter()

for column in features:
    lower_quartile, upper_quartile = np.percentile(df[column], [25, 75])
    # Fence distance: 1.5 times the inter-quartile range
    fence = (upper_quartile - lower_quartile) * 1.5
    too_low = df[column] < lower_quartile - fence
    too_high = df[column] > upper_quartile + fence
    Outlier_indices.update(df[too_low | too_high].index)

# Only rows flagged by more than one feature are dropped
Multiple_Outliers = [row for row, hits in Outlier_indices.items() if hits > 1]

df_update = df.drop(Multiple_Outliers, axis=0).reset_index(drop=True)
df_update

In [None]:
# Compare the row counts before and after the outlier rows were dropped
print('No: of samples before removing outliers: ', df.shape[0])
print('No: of samples after removing outliers:  ', df_update.shape[0])

In [None]:
# Summary statistics of the data after outlier removal
df_update.describe()

In [None]:
# Box plot of each attribute after the outlier rows were removed
Num_columns = df_update.drop(columns=['Class']).columns
fig, ax = plt.subplots(4, 4, figsize=(15, 20))

for variable, subplot in zip(Num_columns, ax.flatten()):
    # Plot df_update (not the raw df) so the figure reflects the cleaned data
    sns.boxplot(y= df_update[variable], ax=subplot)
plt.tight_layout()

### 3. Drop duplicated data

In [None]:
# Count the rows that duplicate an earlier row (all columns compared)
df_update.duplicated(subset=None, keep='first').sum()

In [None]:
# Drop duplicated rows, keeping the last occurrence of each.
# NOTE(review): the count above uses keep='first' while this uses keep='last';
# the number of removed rows is the same, only the surviving copy differs.
# NOTE(review): unlike df_update, the index is not reset here, so df_clean's
# index presumably has gaps where rows were dropped.
df_clean=df_update.drop_duplicates( keep='last')
df_clean

In [None]:
# export cleaned dataset to csv
# NOTE(review): index=False is not passed, so the (non-reset) row index is
# presumably written as an extra unnamed column — confirm this is intended.
df_clean.to_csv('Dry Bean cleaned.csv')

## Exploratory Analysis

In [None]:
# Pairwise scatter plots of the variables in the cleaned data, coloured by
# the Class label

sns.set_theme(style="darkgrid")
sns.pairplot(df_clean, hue="Class")

## Dimensionality reduction and Feature Selection

In [None]:
# Features (X) and target labels (y) taken from the cleaned data
X = df_clean.drop("Class",axis = 1)
y = df_clean["Class"]

In [None]:
# Scale the features with StandardScaler before PCA; X is replaced by the
# scaled output of fit_transform
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA with the default number of components on the scaled features;
# x holds the projected data
pca = PCA()
x = pca.fit_transform(X)

# Percentage of variance explained by each component, rounded to one decimal
per_var = (100 * pca.explained_variance_ratio_).round(1)
print(per_var)

In [None]:
# Scree plot: percentage of variance explained by each principal component.
# The component count is derived from per_var instead of being hard-coded,
# so the labels always match the fitted PCA.
n_components = len(per_var)
component_ids = range(1, n_components + 1)
princ_comp = ["PC" + str(i) for i in component_ids]

plt.figure(figsize = (20,10))
plt.bar(x = component_ids, height = per_var, tick_label = princ_comp)
plt.xlabel("Principal Component", fontsize = 15)
plt.ylabel("Percentage of Variation", fontsize = 15)
plt.title("Scree Plot", fontsize = 15)
plt.grid();

- PC1, PC2, PC3 and PC4 together account for 95.2% of the variance

In [None]:
import plotly.express as px

# Project the scaled features onto the first four principal components.
# No matplotlib figure is opened here: the chart below is rendered by plotly,
# so a plt.figure() call would only produce an empty figure.
pca_all_data = PCA(n_components=4,random_state=0)
pca_all_data.fit(X)
pca_trans = pca_all_data.transform(X)

# Axis labels: component number plus its share of explained variance
labels = {
    str(i): f"PC {i+1} ({var:.2f}%)"
    for i, var in enumerate(pca_all_data.explained_variance_ratio_ * 100)}

# Scatter-plot matrix of the four components, coloured by class label
fig = px.scatter_matrix( pca_trans, labels=labels, dimensions=range(4), color = y)
fig.update_traces(diagonal_visible=True)
fig.show()

In [None]:
# Table of PCA scores, one column per component; z keeps the first two
# components as a plain array
pca_dataset = pd.DataFrame(data=x, columns=princ_comp)
z = pca_dataset.iloc[:, :2].to_numpy()

In [None]:
# Scatter of the first two principal components, one colour per bean class
plt.figure(figsize = (12,8))
X_set, y_set = z, y

# Axis limits: the data range padded by one unit on each side, computed
# directly from the data instead of through a dense meshgrid
plt.xlim(X_set[:, 0].min() - 1, X_set[:, 0].max() + 1)
plt.ylim(X_set[:, 1].min() - 1, X_set[:, 1].max() + 1)

# One distinct colour per class: the dataset has seven classes, so seven
# colours are listed (the modulo only guards against an unexpected extra class)
class_colors = ('red', 'green', 'blue', 'cyan', 'purple', 'olive', 'orange')
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = class_colors[i % len(class_colors)], label = j)
plt.title("Two-component PCA", fontsize = 22)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.show()

In [None]:
# Rebuild the feature frame and labels from the cleaned data (X was
# overwritten by the scaled array in the PCA section)
X = df_clean.drop("Class",axis = 1)
y = df_clean["Class"]
# Disabled alternative kept for reference: standardise X and hold out 20% of
# the rows as a test set
#normalizer = preprocessing.StandardScaler()
#norm_X = normalizer.fit_transform(X)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Imported here because the notebook never imports the `preprocessing` module
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

# Integer-encode the class labels of the cleaned data. The cleaned labels `y`
# are used (not `Y`, which was taken from the raw frame before outliers and
# duplicates were removed) so the encodings line up with X.
lab_enc = LabelEncoder()
label_Y = lab_enc.fit_transform(y)
# Binarised (one column per class) encoding of the same labels
lb = LabelBinarizer()
label_train = lb.fit_transform(y)


### Variable Selection using bidirectional elimination wrapper method

In [None]:
import joblib
import sys
# Register joblib under the legacy 'sklearn.externals.joblib' module path
# before importing mlxtend (presumably needed by mlxtend versions that still
# import joblib from there — TODO confirm)
sys.modules['sklearn.externals.joblib'] = joblib
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LogisticRegression

In [None]:
# Plain NumPy arrays of the labels and features for the feature selector
Y = y.values
X1 = X.values

In [None]:
%%time
# Sequential feature selector configured with forward=True and floating=True,
# wrapped around a one-vs-rest logistic regression. k_features=(6,12) bounds
# the subset size; candidates are scored by accuracy with cv=5.
sffs = SFS(LogisticRegression(multi_class='ovr'),
         k_features=(6,12),
         forward=True,
         floating=True,
         scoring='accuracy',
         cv=5)
sffs.fit(X1, Y)
# Identifiers of the selected features (shown as the cell output)
sffs.k_feature_names_

In [None]:
feat = list(df.columns)
indices = [int(i)  for i in sffs.k_feature_names_]
selected_feat = [feat[i] for i in indices]

In [None]:
selected_feat

In [None]:
# Restrict the cleaned data to the hard-coded list of selected feature columns
X = df_clean.loc[:, [
    'Perimeter',
    'MajorAxisLength',
    'MinorAxisLength',
    'AspectRation',
    'EquivDiameter',
    'Extent',
    'ShapeFactor1',
]]
X

### Performing PCA on the Selected features

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

In [None]:
pca = PCA()  
x = pca.fit_transform(X)
per_var = np.round(pca.explained_variance_ratio_*100, 1)    
print(per_var)

In [None]:
# Scree plot for the selected features: percentage of variance explained by
# each principal component. The component count is derived from per_var
# instead of being hard-coded, so the labels always match the fitted PCA.
n_components = len(per_var)
component_ids = range(1, n_components + 1)
princ_comp = ["PC" + str(i) for i in component_ids]

plt.figure(figsize = (20,10))
plt.bar(x = component_ids, height = per_var, tick_label = princ_comp)
plt.xlabel("Principal Component", fontsize = 15)
plt.ylabel("Percentage of Variation", fontsize = 15)
plt.title("Scree Plot", fontsize = 15)
plt.grid();

- PC1, PC2 and PC3 together account for 99.4% of the variance

In [None]:
import plotly.express as px

# Project the scaled selected features onto the first three principal
# components. No matplotlib figure is opened here: the chart below is
# rendered by plotly, so a plt.figure() call would only produce an empty figure.
pca_all_data = PCA(n_components=3,random_state=0)
pca_all_data.fit(X)
pca_trans = pca_all_data.transform(X)

# Axis labels: component number plus its share of explained variance
labels = {
    str(i): f"PC {i+1} ({var:.2f}%)"
    for i, var in enumerate(pca_all_data.explained_variance_ratio_ * 100)}

# Scatter-plot matrix of the three components, coloured by class label
fig = px.scatter_matrix( pca_trans, labels=labels, dimensions=range(3), color = y)
fig.update_traces(diagonal_visible=True)
fig.show()

In [None]:
# Table of PCA scores for the selected features, one column per component;
# z keeps the first two components as a plain array
pca_dataset = pd.DataFrame(data=x, columns=princ_comp)
z = pca_dataset.iloc[:, :2].to_numpy()

In [None]:
# Scatter of the first two principal components of the selected features,
# one colour per bean class
plt.figure(figsize = (12,8))
X_set, y_set = z, y

# Axis limits: the data range padded by one unit on each side, computed
# directly from the data instead of through a dense meshgrid
plt.xlim(X_set[:, 0].min() - 1, X_set[:, 0].max() + 1)
plt.ylim(X_set[:, 1].min() - 1, X_set[:, 1].max() + 1)

# One distinct colour per class: the dataset has seven classes, so seven
# colours are listed (the modulo only guards against an unexpected extra class)
class_colors = ('red', 'green', 'blue', 'cyan', 'purple', 'olive', 'orange')
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = class_colors[i % len(class_colors)], label = j)
plt.title("Two-component PCA", fontsize = 22)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.show()

### Performing Models

In [None]:
# Entire data: all feature columns of the cleaned frame, with 25% of the rows
# held out as the test set (fixed random_state)
X = df_clean.drop("Class",axis = 1)
y = df_clean["Class"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [None]:
# Feature scaling: the scaler is fitted on the training split only and then
# applied to the test split
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Selected features data: the hard-coded subset of columns, split with the
# same test_size and random_state as the entire-feature split.
# NOTE(review): y_train / y_test are overwritten here; they presumably match
# the earlier split because y, test_size and random_state are unchanged —
# confirm.
X1 = df_clean[['Perimeter',
 'MajorAxisLength',
 'MinorAxisLength',
 'AspectRation',
 'EquivDiameter',
 'Extent',
 'ShapeFactor1']]
y = df_clean["Class"]
X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size = 0.25, random_state = 0)


In [None]:
# Scale the selected features: fit on the training split, apply to the test
# split. `sc` now refers to this new scaler.
sc = StandardScaler()
X1_train = sc.fit_transform(X1_train)
X1_test = sc.transform(X1_test)

### SVC - Entire Features

### Getting the best hyperparameters using grid search cross-validation

In [None]:
%%time
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
  
# Parameter grid for the SVC search: every combination of C, gamma and
# kernel below is evaluated
param_grid = {'C': [0.01, 0.1, 1, 10, 100], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear','poly','rbf','sigmoid']} 

# verbose=3 makes the search print progress while fitting
grid = GridSearchCV(SVC(), param_grid, verbose = 3)
  
# fitting the model for grid search on the scaled entire-feature training set
grid.fit(X_train, y_train)

In [None]:
# print best parameter after tuning
print('Best Parameters: ', grid.best_params_)
print('Best Score: ',grid.best_score_)

In [None]:
final_svm = SVC(kernel = grid.best_params_['kernel'], gamma = grid.best_params_['gamma'], C = grid.best_params_['C'])
final_svm.fit(X_train, y_train)

In [None]:
y_pred = final_svm.predict(X_test)
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("Classification Report: ")
print(classification_report(y_test, y_pred))

In [None]:
#confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
#Confusion matrix heat map
import seaborn as sns
plt.figure(figsize = (8,6))
sns.heatmap(cm, annot=True, cmap='Spectral')
plt.xlabel('Predicted', fontsize=15)
plt.ylabel('Actual', fontsize=15)
plt.show()

### SVC - Selected features

In [None]:
# Same tuned C / gamma / kernel as before, trained on the selected-feature
# training set
final_svm = SVC(**grid.best_params_)
final_svm.fit(X1_train, y_train)

In [None]:
y_pred = final_svm.predict(X1_test)
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("Classification Report: ")
print(classification_report(y_test, y_pred))

In [None]:
#confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
#Confusion matrix heat map
import seaborn as sns
plt.figure(figsize = (8,6))
sns.heatmap(cm, annot=True, cmap='Spectral')
plt.xlabel('Predicted', fontsize=15)
plt.ylabel('Actual', fontsize=15)
plt.show()

### KNN - Entire Data

In [None]:
# Grid search (cv=5) over the number of neighbours, the weighting scheme and
# the distance power parameter p
knn_param_grid = {
    "n_neighbors": range(1, 100),
    "weights": ['uniform', 'distance'],
    "p": [1, 2],
}
gs = GridSearchCV(estimator=KNeighborsClassifier(),
                  param_grid=knn_param_grid,
                  cv=5)
gs.fit(X_train, y_train)

In [None]:
# print best parameter after tuning
print('Best Parameters: ', gs.best_params_)
print('Best Score: ',gs.best_score_)

In [None]:
best_knn = KNeighborsClassifier(n_neighbors = gs.best_params_['n_neighbors'], 
                                p = gs.best_params_['p'], 
                                weights = gs.best_params_['weights'])
best_knn.fit(X_train,y_train)

In [None]:
#best knn model on the testing dataset
y_pred_test = best_knn.predict(X_test)
KNN_Accuracy_1 = accuracy_score(y_test, y_pred_test)
print(KNN_Accuracy_1)

In [None]:
print(classification_report(y_test,y_pred_test))

In [None]:
#Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_test)
cm

In [None]:
#Confusion matrix heat map
plt.figure(figsize = (8,6))
sns.heatmap(cm, annot=True, cmap='Spectral')
plt.xlabel('Predicted', fontsize=15)
plt.ylabel('Actual', fontsize=15)
plt.show()

### KNN - Selected Features

In [None]:
# Same tuned n_neighbors / p / weights as before, trained on the
# selected-feature training set
best_knn = KNeighborsClassifier(**gs.best_params_)

best_knn.fit(X1_train, y_train)

In [None]:
#best knn model on the testing dataset
y_pred_test = best_knn.predict(X1_test)
KNN_Accuracy_2 = accuracy_score(y_test, y_pred_test)
print(KNN_Accuracy_2)

In [None]:
print(classification_report(y_test,y_pred_test))

In [None]:
#Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_test)
cm

In [None]:
#Confusion matrix heat map
plt.figure(figsize = (8,6))
sns.heatmap(cm, annot=True, cmap='Spectral')
plt.xlabel('Predicted', fontsize=15)
plt.ylabel('Actual', fontsize=15)
plt.show()

### Random Forest - Entire Features


In [None]:
%%time
gs = GridSearchCV(RandomForestClassifier(),
                  param_grid = {
                      'n_estimators': [10, 100, 200, 400, 500],
                      'max_depth': [5, 10, 15, 20, 25],
                      'min_samples_split': [2, 5, 7, 10, 15],
                      'min_samples_leaf': [2, 5, 7, 10, 15],
                      'max_features': ['auto', 'sqrt']
                  },
                  cv = 5, scoring = 'accuracy')
gs.fit(X_train, y_train)

In [None]:
# print best parameter after tuning
print('Best Parameters: ', gs.best_params_)
print('Best Score: ',gs.best_score_)

In [None]:
best_RF = RandomForestClassifier(n_estimators = gs.best_params_['n_estimators'], 
                                max_depth = gs.best_params_['max_depth'], 
                                min_samples_split = gs.best_params_['min_samples_split'],
                                min_samples_leaf = gs.best_params_['min_samples_leaf'],
                                max_features = gs.best_params_['max_features'],
                                )
best_RF.fit(X_train,y_train)

In [None]:
#best RF model on the testing dataset
y_pred_test = best_RF.predict(X_test)
RF_Accuracy_1 = accuracy_score(y_test, y_pred_test)
print(RF_Accuracy_1)

In [None]:
print(classification_report(y_test,y_pred_test))

In [None]:
#Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_test)
cm

In [None]:
#Confusion matrix heat map
plt.figure(figsize = (8,6))
sns.heatmap(cm, annot=True, cmap='Spectral')
plt.xlabel('Predicted', fontsize=15)
plt.ylabel('Actual', fontsize=15)
plt.show()

### Random Forest - Selected Features


In [None]:
# Refit the same best_RF estimator object on the selected-feature training set
best_RF.fit(X1_train,y_train)

In [None]:
#best RF model on the testing dataset
y_pred_test = best_RF.predict(X1_test)
RF_Accuracy_2 = accuracy_score(y_test, y_pred_test)
print(RF_Accuracy_2)

In [None]:
print(classification_report(y_test,y_pred_test))

In [None]:
#Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_test)
cm

In [None]:
#Confusion matrix heat map
plt.figure(figsize = (8,6))
sns.heatmap(cm, annot=True, cmap='Spectral')
plt.xlabel('Predicted', fontsize=15)
plt.ylabel('Actual', fontsize=15)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Accuracies (%) per model, in the order SVC, KNN, RF.
# NOTE(review): these values are hard-coded (presumably transcribed from the
# evaluation cells) and must be updated by hand if the models are re-run.
#Entire Features Accuracy
EFA = [92.10, 89.70, 91.38]
#Selected Features Accuracy
SFA = [92.10, 89.70, 91.18]

fig, ax = plt.subplots(figsize=(12,8))
width = 0.35

# Grouped bars: the selected-feature bar sits one bar-width to the right of
# the entire-feature bar for the same model
ax.bar(range(len(EFA)), EFA, width, label='Entire Features Accuracy')
ax.bar([i + width for i in range(len(SFA))], SFA, width, label='Selected Features Accuracy')

# Descriptive axis labels and title
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy (%)')
ax.set_title('Accuracy with entire vs. selected features')
# Centre each tick between the two bars of its group
ax.set_xticks([i + width / 2 for i in range(len(EFA))])
ax.set_xticklabels(['SVC', 'KNN', 'RF'])
ax.legend()
plt.show()


- Accuracy before and after feature selection is almost the same, but using fewer features reduces the computational cost and training time.