In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the Raisin dataset workbook (.xlsx) into a pandas DataFrame
df = pd.read_excel(io="Raisin_Dataset.xlsx")

In [3]:
# Preview the top five rows to sanity-check the column layout and typical values
df.head(n=5)

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.04,Kecimen
1,75166,406.690687,243.032436,0.801805,78789,0.68413,1121.786,Kecimen
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,Kecimen
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,Kecimen
4,79408,352.19077,290.827533,0.564011,81463,0.792772,1073.251,Kecimen


In [4]:
# Print the shape of the DataFrame as a (rows, columns) tuple to understand dataset size
print(f'Shape of Dataset(Row * Column): {df.shape}')

Shape of Dataset(Row * Column): (900, 8)


In [5]:
# Print a concise summary of the DataFrame: index range, column names, non-null counts, dtypes, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area             900 non-null    int64  
 1   MajorAxisLength  900 non-null    float64
 2   MinorAxisLength  900 non-null    float64
 3   Eccentricity     900 non-null    float64
 4   ConvexArea       900 non-null    int64  
 5   Extent           900 non-null    float64
 6   Perimeter        900 non-null    float64
 7   Class            900 non-null    object 
dtypes: float64(5), int64(2), object(1)
memory usage: 56.4+ KB


In [6]:
# Print the number of missing (NaN) values in each column of the DataFrame
print(f'NAN Values:\n{df.isnull().sum()}')

# Print the number of duplicate rows, i.e. rows that repeat an earlier row across all columns
print(f'Duplicate Value: {df.duplicated().sum()}')

NAN Values:
Area               0
MajorAxisLength    0
MinorAxisLength    0
Eccentricity       0
ConvexArea         0
Extent             0
Perimeter          0
Class              0
dtype: int64
Duplicate Value: 0


In [7]:
# Re-type the label column using pandas' categorical dtype
class_labels = df['Class'].astype('category')
df['Class'] = class_labels

# Tally the rows per class to see how balanced the dataset is
df['Class'].value_counts()

Class
Besni      450
Kecimen    450
Name: count, dtype: int64

In [8]:
# Encode the class names as integers. LabelEncoder assigns codes in sorted
# class order, so 'Besni' becomes 0 and 'Kecimen' becomes 1.
lb = LabelEncoder()
lb.fit(df['Class'])
df['Class'] = lb.transform(df['Class'])

In [9]:
# Generate descriptive statistics of the DataFrame's numerical columns (count, mean, std, min, 25%, 50%, 75%, max)
# Note: 'Class' has already been integer-encoded above, so it is summarised here alongside the features
df.describe()

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
count,900.0,900.0,900.0,900.0,900.0,900.0,900.0,900.0
mean,87804.127778,430.92995,254.488133,0.781542,91186.09,0.699508,1165.906636,0.5
std,39002.11139,116.035121,49.988902,0.090318,40769.290132,0.053468,273.764315,0.500278
min,25387.0,225.629541,143.710872,0.34873,26139.0,0.379856,619.074,0.0
25%,59348.0,345.442898,219.111126,0.741766,61513.25,0.670869,966.41075,0.0
50%,78902.0,407.803951,247.848409,0.798846,81651.0,0.707367,1119.509,0.5
75%,105028.25,494.187014,279.888575,0.842571,108375.75,0.734991,1308.38975,1.0
max,235047.0,997.291941,492.275279,0.962124,278217.0,0.835455,2697.753,1.0


In [10]:
# List the column labels of the DataFrame
df.columns

Index(['Area', 'MajorAxisLength', 'MinorAxisLength', 'Eccentricity',
       'ConvexArea', 'Extent', 'Perimeter', 'Class'],
      dtype='object')

In [11]:
# Disabled alternative (IQR-based row removal). The code below is wrapped in a
# triple-quoted string, so it is never executed and rem_outlier is not defined;
# as the cell's last expression, the string is simply echoed as the cell output.
'''# Function to remove outliers from a specified column using the IQR method
def rem_outlier(df, col):
    q1 = df[col].quantile(.25)                   # 25th percentile (Q1) of the column
    q3 = df[col].quantile(.75)                   # 75th percentile (Q3) of the column
    iqr = q3 - q1                                # Interquartile Range (IQR)
    lower_wh = q1 - 1.5 * iqr                    # Lower bound for outliers
    upper_wh = q3 + 1.5 * iqr                    # Upper bound for outliers
    df = df[(df[col] >= lower_wh) & (df[col] <= upper_wh)]  # Keep rows within bounds
    return df                                    # Return cleaned DataFrame'''

'# Function to remove outliers from a specified column using the IQR method\ndef rem_outlier(df, col):\n    q1 = df[col].quantile(.25)                   # 25th percentile (Q1) of the column\n    q3 = df[col].quantile(.75)                   # 75th percentile (Q3) of the column\n    iqr = q3 - q1                                # Interquartile Range (IQR)\n    lower_wh = q1 - 1.5 * iqr                    # Lower bound for outliers\n    upper_wh = q3 + 1.5 * iqr                    # Upper bound for outliers\n    df = df[(df[col] >= lower_wh) & (df[col] <= upper_wh)]  # Keep rows within bounds\n    return df                                    # Return cleaned DataFrame'

In [12]:
# Disabled alternative (drop outlier rows column by column). The code below is a
# triple-quoted string, so nothing here runs; the string is echoed as the cell output.
# NOTE(review): if this is ever re-enabled, the loop walks every column of df_clean,
# including 'Class'; sel_col is computed but never used.
'''# Create a clean copy of the DataFrame and remove outliers from all columns
df_clean = df.copy()                               # Make a copy to preserve original data
sel_col = df.drop(['Class'], axis=1)              # Select all columns except 'Class' for outlier removal

# Loop through all columns and remove outliers using the rem_outlier function
for col2 in df_clean.columns:
    df_clean = rem_outlier(df_clean, col2)        # Apply outlier removal to each column

df = df_clean.copy()                               # Update original DataFrame with cleaned values'''

"# Create a clean copy of the DataFrame and remove outliers from all columns\ndf_clean = df.copy()                               # Make a copy to preserve original data\nsel_col = df.drop(['Class'], axis=1)              # Select all columns except 'Class' for outlier removal\n\n# Loop through all columns and remove outliers using the rem_outlier function\nfor col2 in df_clean.columns:\n    df_clean = rem_outlier(df_clean, col2)        # Apply outlier removal to each column\n\ndf = df_clean.copy()                               # Update original DataFrame with cleaned values"

In [None]:
# Function to cap outliers in a specified column using the IQR method
def cap_outlier(df, col, factor=1.5):
    """Return a copy of ``df`` with column ``col`` clipped to its IQR whiskers.

    The whiskers are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``. Values
    outside that range are replaced by the nearest whisker; rows are never
    dropped, so the result has the same shape as the input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``. It is left unmodified.
    col : hashable
        Label of the numeric column to cap.
    factor : float, default 1.5
        Whisker length expressed as a multiple of the IQR.

    Returns
    -------
    pandas.DataFrame
        A new frame identical to ``df`` except for the capped column.
    """
    q1 = df[col].quantile(.25)                   # 25th percentile (Q1) of the column
    q3 = df[col].quantile(.75)                   # 75th percentile (Q3) of the column
    iqr = q3 - q1                                # Interquartile Range (IQR)
    lower_wh = q1 - factor * iqr                 # Lower whisker used as the floor
    upper_wh = q3 + factor * iqr                 # Upper whisker used as the ceiling

    # Work on a copy so the caller's frame is not mutated as a side effect.
    capped = df.copy()
    capped[col] = df[col].clip(lower=lower_wh, upper=upper_wh)
    return capped

In [None]:
# Cap outliers in the feature columns only to reduce extreme value impact.
# The encoded 'Class' label is deliberately skipped: IQR whiskers on a 0/1 label
# can collapse (Q1 == Q3 when one class dominates) and would overwrite labels.
df_clean = df.copy()                               # Make a copy to preserve original data
sel_col = df.drop(['Class'], axis=1)              # Select all columns except 'Class' for outlier handling

# Apply outlier capping feature by feature
for col2 in sel_col.columns:
    df_clean = cap_outlier(df_clean, col2)        # Cap outliers in the current feature column

df = df_clean.copy()                               # Update the original DataFrame with capped values

In [15]:
# Check the dimensions after outlier handling (capping clips values; it does not remove rows)
df.shape

(900, 8)

In [16]:
# Write the preprocessed data to a CSV file, leaving out the index column
df.to_csv(path_or_buf="Raisin_preprocessed.csv", index=False)

# Tell the reader that preprocessing finished and the output file was written
print("Preprocessing done and file saved as Raisin_preprocessed")

Preprocessing done and file saved as Raisin_preprocessed
