In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, CategoricalNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, recall_score, roc_auc_score, auc, roc_curve

In [None]:
# Load the Raisin dataset from the Excel workbook (path is relative to the notebook's
# working directory) and fix the label dtype at load time.
# 'Class' is cast to categorical here so that later cells (e.g. df.info()) report the same
# dtype on a fresh "Restart & Run All", independent of the order in which cells were run.
df = pd.read_excel("Raisin_Dataset.xlsx").astype({"Class": "category"})

In [None]:
# Preview the top five records to sanity-check column names and typical values
df.head(n=5)

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.04,Kecimen
1,75166,406.690687,243.032436,0.801805,78789,0.68413,1121.786,Kecimen
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,Kecimen
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,Kecimen
4,79408,352.19077,290.827533,0.564011,81463,0.792772,1073.251,Kecimen


In [None]:
# Report the dataset dimensions as a (rows, columns) tuple
print('Shape of Dataset(Row * Column): {}'.format(df.shape))

Shape of Dataset(Row * Column): (900, 8)


In [15]:
# Summarise the DataFrame's structure: index range, per-column non-null counts and dtypes,
# plus approximate memory usage. info() prints its report directly rather than returning it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Area             900 non-null    int64   
 1   MajorAxisLength  900 non-null    float64 
 2   MinorAxisLength  900 non-null    float64 
 3   Eccentricity     900 non-null    float64 
 4   ConvexArea       900 non-null    int64   
 5   Extent           900 non-null    float64 
 6   Perimeter        900 non-null    float64 
 7   Class            900 non-null    category
dtypes: category(1), float64(5), int64(2)
memory usage: 50.3 KB


In [None]:
# Data-quality audit, part 1: per-column tally of missing (NaN) entries
print('NAN Values:\n{}'.format(df.isna().sum()))

# Data-quality audit, part 2: count of rows that exactly repeat an earlier row
print('Duplicate Value: {}'.format(df.duplicated().sum()))

NAN Values:
Area               0
MajorAxisLength    0
MinorAxisLength    0
Eccentricity       0
ConvexArea         0
Extent             0
Perimeter          0
Class              0
dtype: int64
Duplicate Value: 0


In [16]:
# Store the 'Class' label column with pandas' categorical dtype
# (unordered; the categories are inferred from the values present in the column)
df['Class'] = df['Class'].astype(pd.CategoricalDtype())

In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter
count,900.0,900.0,900.0,900.0,900.0,900.0,900.0
mean,87804.127778,430.92995,254.488133,0.781542,91186.09,0.699508,1165.906636
std,39002.11139,116.035121,49.988902,0.090318,40769.290132,0.053468,273.764315
min,25387.0,225.629541,143.710872,0.34873,26139.0,0.379856,619.074
25%,59348.0,345.442898,219.111126,0.741766,61513.25,0.670869,966.41075
50%,78902.0,407.803951,247.848409,0.798846,81651.0,0.707367,1119.509
75%,105028.25,494.187014,279.888575,0.842571,108375.75,0.734991,1308.38975
max,235047.0,997.291941,492.275279,0.962124,278217.0,0.835455,2697.753
