## 预处理

In [1]:
import preprocess.preprocess as process
import numpy as np

file_path = "data/raw/train.csv"
label_names=["label"]

preprocesser = process.Preprocessor("data/raw/train.csv")

# 预处理
preprocesser.remove_duplicates()
preprocesser.remove_null_values()

# 输出
labels, features = preprocesser.split_features_labels(label_names=label_names)
labels = np.asarray(labels).T
features = features.to_numpy()

## KNN

In [2]:
from collections import Counter

class KNNClassifier:
    def __init__(self, k=3):
        self.k = k
    
    def fit(self, train_data, train_labels):
        self.train_data = train_data
        self.train_labels = train_labels
    
    def predict(self, test_data):
        all_dist = self._get_index(test_data)
        all_numbers = self._get_number(all_dist)
        min_number = self._get_min_number(all_numbers)
        return min_number
    
    def _get_index(self, test_data):
        # 计算测试样本与每一个训练样本的距离
        return np.sqrt(np.sum(np.square(test_data - self.train_data), axis=1))
    
    def _get_number(self, all_dist):
        # 计算查找最近的K个训练集所对应的预测值
        sorted_indices = np.argsort(all_dist)
        top_k_indices = sorted_indices[:self.k]
        top_k_labels = self.train_labels[top_k_indices]
        return top_k_labels.flatten()
    
    def _get_min_number(self, all_numbers):
        # 在K个预测值中，求众数
        counts = Counter(all_numbers)
        return counts.most_common(1)[0][0]

## 划分验证集和训练集

In [4]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.1, random_state=42)

knn = KNNClassifier(k=10)
knn.fit(X_train, y_train)

y_pred = []
for i in range(len(X_test)):
    y_pred.append(knn.predict(X_test[i]))

acc = accuracy_score(y_test, y_pred)
print("Accuracy: {acc:.2f}")

print(classification_report(y_test, y_pred))

Accuracy: {acc:.2f}
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       408
           1       0.93      1.00      0.96       471
           2       0.99      0.95      0.97       420
           3       0.96      0.96      0.96       506
           4       0.98      0.96      0.97       397
           5       0.96      0.97      0.97       339
           6       0.97      0.98      0.98       402
           7       0.95      0.95      0.95       438
           8       0.99      0.91      0.95       403
           9       0.94      0.96      0.95       416

    accuracy                           0.96      4200
   macro avg       0.96      0.96      0.96      4200
weighted avg       0.96      0.96      0.96      4200



## 并行化推理测试集

In [3]:
import pandas as pd

test = pd.read_csv("data/raw/test.csv")
test = test.to_numpy()

In [4]:
import multiprocessing

def predict(test_data):
    print("a process is predicting")
    knn = KNNClassifier(k = 10)
    knn.fit(features, labels)
    
    results = []
    for i in range(len(test_data)):
        results.append(knn.predict(test_data[i]))
    return results

def parrallel_predict(test_data, num_cores=4):
    # 划分数据集
    step = len(test_data) // num_cores

    grouped_data = [test_data[i : i+step] for i in range(0, len(test_data), step)]

    grouped_data[-1] = np.concatenate((grouped_data[-1], test_data[step * num_cores : ]))

    # 并行预测
    pool = multiprocessing.Pool(num_cores)

    results = pool.map(predict, grouped_data)

    pool.close()
    pool.join() 
    return results

y_pred = parrallel_predict(test, num_cores=8)

a process is predicting
a process is predicting
a process is predicting
a process is predicting
a process is predicting
a process is predicting
a process is predicting
a process is predicting


In [6]:
# 展平预测结果
y_pred = sum(y_pred, [])

# 保存成csv文件，（ImageId，Label）
df = pd.DataFrame({'Label': y_pred})
df.index += 1
df.index.name = "ImageId"
print(df.info)
df.to_csv("data/processed/knn_test.csv")

<bound method DataFrame.info of          Label
ImageId       
1            2
2            0
3            9
4            9
5            3
...        ...
27996        9
27997        7
27998        3
27999        9
28000        2

[28000 rows x 1 columns]>
