In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.set_option("display.max_columns", None)

In [None]:
# Read the raw spreadsheet from ../datasets, resolved against the current
# working directory, using the openpyxl engine.
raw_df = pd.read_excel(
    io=f'{Path.cwd()}/../datasets/Rice_MSC_Dataset.xlsx',
    engine='openpyxl',
)

In [None]:
# Print a concise summary of the raw frame (columns, dtypes, non-null counts).
raw_df.info()

In [None]:
# Show the column labels of the raw frame.
raw_df.columns

In [None]:
# Display the raw frame as loaded.
raw_df

In [None]:
# Swap the rice class labels in CLASS for integer codes and keep a lookup from
# each original label to the code that replaced it.
# NOTE: CLASS is overwritten in place, so re-running this cell fits the encoder
# on the already-encoded integers and the lookup loses the original labels.
lel = LabelEncoder()
raw_df['CLASS'] = lel.fit_transform(raw_df['CLASS'])

# A label's position in classes_ is the code the encoder assigns to it.
label_to_index_map = {label: idx for idx, label in enumerate(lel.classes_)}
label_to_index_map

In [None]:
# Number of rows per encoded class, ordered by class code.
(
    raw_df["CLASS"]
    .value_counts()
    .sort_index()
)

In [None]:
# Preview the first rows of the frame.
raw_df.head()

In [None]:
# The ten columns with the highest count of missing values.
(
    raw_df.isnull()
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Fill gaps column by column with that column's most frequent value (its mode).
# Columns without missing values are only reported, never modified.
for col in raw_df.columns.tolist():
    column = raw_df[col]
    n_missing = column.isnull().sum()
    print(f"Number of missing values in column {col}: {n_missing}")

    if n_missing <= 0:
        continue

    # mode() may return several equally frequent values; take the first one.
    mode_value = column.mode()[0]

    print(f"Imputing missing values in column {col} with mode value {mode_value}")

    raw_df[col] = column.fillna(mode_value)

In [None]:
# Rows that still hold at least one missing value; this should come back empty
# if the imputation step filled every gap.
raw_df.loc[raw_df.isna().any(axis="columns")]

### No categorical features are present in the dataset. Only the label column is categorical.

In [None]:
# Print every column name as a quoted bullet, one per line.
for column_name in raw_df.columns:
    print(f"- \"{column_name}\"")

In [None]:
# Separate the target from the predictors. The labels stay a one-column
# DataFrame (not a Series) so that the column name CLASS is preserved.
raw_df_labels = raw_df.loc[:, ["CLASS"]]
raw_df_features = raw_df.drop("CLASS", axis="columns")

(raw_df_features.shape, raw_df_labels.shape)

In [None]:
# Stratified split: an integer test_size is an absolute row count, so exactly
# 50000 rows go to the test side, drawn so that class proportions follow the
# label column. random_state pins the shuffle for reproducibility.
split_frames = train_test_split(
    raw_df_features,
    raw_df_labels,
    test_size=50000,
    random_state=42,
    stratify=raw_df_labels.to_numpy(),
)
X_train, X_test, y_train, y_test = split_frames

# Shapes in the order X_train, X_test, y_train, y_test.
tuple(frame.shape for frame in split_frames)

In [None]:
# Peek at the first rows of the test labels and test features (shown together as a tuple).
y_test.head(), X_test.head()

In [None]:
# Rows per class in the test split, ordered by class code, to check the
# stratification.
(
    y_test["CLASS"]
    .value_counts()
    .sort_index()
)

In [None]:
from sklearn.preprocessing import StandardScaler

# All feature columns are numerical, so standardise every one of them to zero
# mean and unit variance.
#
# StandardScaler centres and scales each feature independently, so a single
# scaler fitted on the whole frame gives the same per-column standardisation as
# one scaler per column, without the Python-level loop.
#
# The result is written to a brand-new DataFrame instead of being assigned
# column by column into the frame returned by train_test_split; that in-place
# assignment can trigger pandas' SettingWithCopyWarning.
#
# NOTE(review): the scaler is fitted on X_test only and X_train is left
# unscaled in this cell. Confirm that is intended before reusing X_train.
_scaler = StandardScaler()
X_test = pd.DataFrame(
    _scaler.fit_transform(X_test),
    columns=X_test.columns,
    index=X_test.index,  # keep the original row labels so y_test still aligns
)

In [None]:
# Put the labels back next to the scaled features, aligned on the row index.
merged_df = pd.concat((X_test, y_test), axis="columns")
merged_df

In [None]:
# Persist the merged frame next to the raw dataset; the row index is not written.
merged_df.to_csv(
    path_or_buf=f'{Path.cwd()}/../datasets/rice_msc_preprocessed.csv',
    index=False,
)

In [None]:
# Pairwise correlation between all columns of the merged frame (label included).
corr = merged_df.corr()

# The matrix is symmetric, so hide the upper triangle (and the diagonal):
# cells where the mask is True are not drawn.
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Figure and axes that the heatmap is drawn on.
f, ax = plt.subplots(figsize=(11, 9))

# Diverging palette so that negative and positive correlations get distinct
# hues around the centre value 0.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

heatmap_options = {
    "mask": mask,
    "cmap": cmap,
    "center": 0,
    "square": True,          # square cells keep the aspect ratio correct
    "linewidths": .5,
    "cbar_kws": {"shrink": .5},
}
sns.heatmap(corr, ax=ax, **heatmap_options)