In [1]:
import pandas as pd
import numpy as np

csv_file = 'Chinese_MINST_Dataset/chinese_mnist.csv'
data = pd.read_csv(csv_file)


data['label'] = data.apply(lambda x: f"{x['suite_id']}_{x['sample_id']}_{x['code']}.jpg", axis=1)
data['value'] = data['value']  

final_data = data[['label', 'value', 'code']]  

train_samples = []
test_samples = []

In [2]:
min_train_per_class = 333
min_test_per_class = 66

total_train_samples = 5000
total_test_samples = 1000

class_distribution = final_data['code'].value_counts()


for code, group in final_data.groupby('code'):
    group = group.sample(frac=1, random_state=42).reset_index(drop=True)
    
    available_samples = len(group)
    train_count = min(min_train_per_class, available_samples - min_test_per_class) 
    test_count = min(min_test_per_class, available_samples - train_count)  
    
    train_samples.append(group.iloc[:train_count]) 
    test_samples.append(group.iloc[train_count:train_count + test_count]) 

train_data = pd.concat(train_samples).reset_index(drop=True)
test_data = pd.concat(test_samples).reset_index(drop=True)


In [3]:
if len(train_data) < total_train_samples or len(test_data) < total_test_samples:
    print("Insufficient samples in train or test dataset, trying to supplement...")
    
  
    remaining_data = final_data[~final_data['label'].isin(train_data['label']) & ~final_data['label'].isin(test_data['label'])]
    

    while len(train_data) < total_train_samples:
        additional_samples = remaining_data.sample(n=1, random_state=42)
        train_data = pd.concat([train_data, additional_samples]).reset_index(drop=True)
        remaining_data = remaining_data[~remaining_data['label'].isin(additional_samples['label'])]
        

    while len(test_data) < total_test_samples:
        additional_samples = remaining_data.sample(n=1, random_state=42)
        test_data = pd.concat([test_data, additional_samples]).reset_index(drop=True)
        remaining_data = remaining_data[~remaining_data['label'].isin(additional_samples['label'])]

print(f'Train set size: {len(train_data)}')
print(f'Test set size: {len(test_data)}')




Insufficient samples in train or test dataset, trying to supplement...
Train set size: 5000
Test set size: 1000


In [4]:
import cv2  
import os
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
image_directory = 'Chinese_MINST_Dataset/data/data/input_'  



def load_images_and_labels(data):
    images = []
    labels = []

    for index, row in data.iterrows():
        img_path = image_directory + row['label']  
        value = row['value'] 
        
        # Load image
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)  
        if img is not None:
            img_resized = cv2.resize(img, (64, 64)) 
            images.append(img_resized.flatten())  
            labels.append(value) 
        else:
            print(f"Warning: Image {img_path} not found.")
    
    return np.array(images), np.array(labels)



In [5]:
X_train, y_train = load_images_and_labels(train_data)  
X_test, y_test = load_images_and_labels(test_data)    

X_train = X_train.reshape(X_train.shape[0], -1)  
X_test = X_test.reshape(X_test.shape[0], -1)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train) 
X_test = scaler.transform(X_test)         


knn = KNeighborsClassifier(n_neighbors=3)  # KNN
dt = DecisionTreeClassifier()               # Decision Tree
sgd = SGDClassifier(max_iter=250)           # SGD Classifier

knn.fit(X_train, y_train)
dt.fit(X_train, y_train)
sgd.fit(X_train, y_train)

knn_pred = knn.predict(X_test)
dt_pred = dt.predict(X_test)
sgd_pred = sgd.predict(X_test)

def report_metrics(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    conf_matrix = confusion_matrix(y_true, y_pred)

    print(f"{model_name} Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("\n")

# Report metrics for each classifier
report_metrics(y_test, knn_pred, "KNN")
report_metrics(y_test, dt_pred, "Decision Tree")
report_metrics(y_test, sgd_pred, "SGD Classifier")




KNN Performance:
Accuracy: 0.3830
Precision: 0.4981
Recall: 0.3830
F1 Score: 0.3809
Confusion Matrix:
[[49  5  0  2  3  1  0  0  0  0  1  0  2  3  0]
 [ 1 62  2  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 27 25 12  0  1  0  0  1  0  0  0  0  0  0]
 [ 0 17 15 32  0  2  0  0  0  0  0  0  0  0  0]
 [ 6 20  3  7 25  1  1  0  0  0  1  0  0  1  1]
 [ 1  8 28 17  1 11  0  0  2  0  2  1  0  0  0]
 [ 2 39  0  1  1  0 18  0  2  0  0  0  1  2  0]
 [ 0 21  6  5  0  4  4 19  0  1  5  1  1  0  1]
 [ 0 10  5  0  0  1  1  0 51  0  0  0  0  0  0]
 [ 2 20  1  1  1  4 11  6  2 11  0  1  0  4  2]
 [ 0 18  3  0  0  0  3  0  0  0 26  0 14  2  0]
 [ 3 11  5  7  5  5  1  2  2  0  1 12  2  6  4]
 [ 1 20  5  3  0  0  3  0  0  1 20  0 13  0  0]
 [ 2 25  0  0  1  2 13  0  0  1  2  1  2 16  1]
 [ 1 18  5 10  3  3  2  6  2  1  0  0  3  0 13]]


Decision Tree Performance:
Accuracy: 0.2810
Precision: 0.2852
Recall: 0.2810
F1 Score: 0.2809
Confusion Matrix:
[[22  1  1  0  7  2 10  1  2  0  2  8  7  1  2]
 [ 0 45  6  2  

In [6]:
import pandas as pd
import numpy as np

csv_file = 'Chinese_MINST_Dataset/chinese_mnist.csv'
data = pd.read_csv(csv_file)


data['label'] = data.apply(lambda x: f"{x['suite_id']}_{x['sample_id']}_{x['code']}.jpg", axis=1)
data['value'] = data['value']  

final_data = data[['label', 'value', 'code']]  

train_samples = []
test_samples = []

min_train_per_class = 333
min_test_per_class = 66

total_train_samples = 10000
total_test_samples = 1000

class_distribution = final_data['code'].value_counts()


for code, group in final_data.groupby('code'):
    group = group.sample(frac=1, random_state=42).reset_index(drop=True)
    
    available_samples = len(group)
    train_count = min(min_train_per_class, available_samples - min_test_per_class) 
    test_count = min(min_test_per_class, available_samples - train_count)  
    
    train_samples.append(group.iloc[:train_count]) 
    test_samples.append(group.iloc[train_count:train_count + test_count]) 

train_data = pd.concat(train_samples).reset_index(drop=True)
test_data = pd.concat(test_samples).reset_index(drop=True)

if len(train_data) < total_train_samples or len(test_data) < total_test_samples:
    print("Insufficient samples in train or test dataset, trying to supplement...")
    
  
    remaining_data = final_data[~final_data['label'].isin(train_data['label']) & ~final_data['label'].isin(test_data['label'])]
    

    while len(train_data) < total_train_samples:
        additional_samples = remaining_data.sample(n=1, random_state=42)
        train_data = pd.concat([train_data, additional_samples]).reset_index(drop=True)
        remaining_data = remaining_data[~remaining_data['label'].isin(additional_samples['label'])]
        

    while len(test_data) < total_test_samples:
        additional_samples = remaining_data.sample(n=1, random_state=42)
        test_data = pd.concat([test_data, additional_samples]).reset_index(drop=True)
        remaining_data = remaining_data[~remaining_data['label'].isin(additional_samples['label'])]

print(f'Train set size: {len(train_data)}')
print(f'Test set size: {len(test_data)}')

import cv2  # OpenCV for image processing
import os
from sklearn.preprocessing import StandardScaler

image_directory = 'Chinese_MINST_Dataset/data/data/input_'  



def load_images_and_labels(data):
    images = []
    labels = []

    for index, row in data.iterrows():
        img_path = image_directory + row['label']  
        value = row['value'] 
        
        # Load image
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)  
        if img is not None:
            img_resized = cv2.resize(img, (64, 64)) 
            images.append(img_resized.flatten())  
            labels.append(value) 
        else:
            print(f"Warning: Image {img_path} not found.")
    
    return np.array(images), np.array(labels)


X_train, y_train = load_images_and_labels(train_data)  
X_test, y_test = load_images_and_labels(test_data)    

X_train = X_train.reshape(X_train.shape[0], -1)  
X_test = X_test.reshape(X_test.shape[0], -1)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train) 
X_test = scaler.transform(X_test)         


knn = KNeighborsClassifier(n_neighbors=3)  # KNN
dt = DecisionTreeClassifier()               # Decision Tree
sgd = SGDClassifier(max_iter=250)           # SGD Classifier

knn.fit(X_train, y_train)
dt.fit(X_train, y_train)
sgd.fit(X_train, y_train)

knn_pred = knn.predict(X_test)
dt_pred = dt.predict(X_test)
sgd_pred = sgd.predict(X_test)

def report_metrics(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    conf_matrix = confusion_matrix(y_true, y_pred)

    print(f"{model_name} Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("\n")

# Report metrics for each classifier
report_metrics(y_test, knn_pred, "KNN")
report_metrics(y_test, dt_pred, "Decision Tree")
report_metrics(y_test, sgd_pred, "SGD Classifier")


Insufficient samples in train or test dataset, trying to supplement...
Train set size: 10000
Test set size: 1000




KNN Performance:
Accuracy: 0.4370
Precision: 0.5425
Recall: 0.4370
F1 Score: 0.4451
Confusion Matrix:
[[44  4  0  2  1  1  0  1  0  0  3  2  1  7  0]
 [ 0 60  5  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 2 22 30  9  0  1  0  1  1  0  0  0  0  0  0]
 [ 1 19 13 30  0  4  0  0  0  0  0  1  1  0  0]
 [ 4 18  3  3 30  1  2  2  0  0  0  1  0  0  2]
 [ 1  9 22 15  0 14  0  1  1  0  2  0  0  1  0]
 [ 0 34  2  0  0  0 26  1  1  0  0  0  0  2  0]
 [ 0 23  5  3  1  0  3 22  0  1  4  0  1  0  3]
 [ 0  2  2  0  0  0  2  0 60  0  0  0  0  0  0]
 [ 2 14  0  1  1  5 11  9  1 16  0  0  0  3  3]
 [ 0 19  2  0  0  0  2  0  0  0 28  0 13  2  0]
 [ 1  8  5  4  5  4  3  4  0  1  2 18  2 10  3]
 [ 1 19  6  2  0  0  4  1  0  0 17  0 18  0  0]
 [ 1 30  0  1  1  2  8  0  0  1  3  2  1 16  0]
 [ 2 17  4  3  3  2  1  5  1  1  0  1  2  0 25]]


Decision Tree Performance:
Accuracy: 0.3050
Precision: 0.3026
Recall: 0.3050
F1 Score: 0.3026
Confusion Matrix:
[[28  1  1  0  1  5  1  2  0  6  1  9  5  1  5]
 [ 0 52  6  1  