In [10]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [11]:
# Load the training and test sets.
# The dataset directory is defined once so relocating the data means editing a
# single line instead of every read call.
DATA_DIR = Path(r"F:\BaiduNetdiskDownload\python_ml\data\FBlocation")
train_data = pd.read_csv(DATA_DIR / "train.csv")
test_data = pd.read_csv(DATA_DIR / "test.csv")

# Preview the first 5 rows of each set
print("训练集预览：")
print(train_data.head())
print("\n测试集预览：")
print(test_data.head())



训练集预览：
   row_id       x       y  accuracy    time    place_id
0       0  0.7941  9.0809        54  470702  8523065625
1       1  5.9567  4.7968        13  186555  1757726713
2       2  8.3078  7.0407        74  322648  1137537235
3       3  7.3665  2.5165        65  704587  6567393236
4       4  4.0961  1.1307        31  472130  7440663949

测试集预览：
   row_id       x       y  accuracy    time
0       0  0.1675  1.3608       107  930883
1       1  7.3909  2.5301        35  893017
2       2  8.0978  2.3473        62  976933
3       3  0.9990  1.0591        62  907285
4       4  0.6670  9.7254        40  914399


In [12]:
# Remove the 'row_id' column from both frames, then report the resulting shapes
test_data = test_data.drop('row_id', axis=1)
train_data = train_data.drop('row_id', axis=1)
for frame in (test_data, train_data):
    print(frame.shape)



(8607230, 4)
(29118021, 5)


In [13]:
# 将'time'字段转换为星期和24小时
import numpy as np

def extract_week_hour(df):
    """Return a copy of ``df`` with ``weekday`` and ``hour`` columns derived from ``time``.

    The input frame is left untouched; the derived columns are appended to the
    returned frame (``weekday`` first, then ``hour``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``time`` column. The unit is assumed to be
        minutes -- TODO confirm against the dataset description.

    Returns
    -------
    pandas.DataFrame
        New frame with ``weekday`` in 0..6 and ``hour`` in 0..23.
    """
    minutes = df['time']
    return df.assign(
        weekday=(minutes // (60 * 24)) % 7,  # whole days elapsed, folded into a 7-day week
        hour=(minutes // 60) % 24,           # whole hours elapsed, folded into a 24-hour day
    )

# Derive the weekday/hour columns for both sets, then print the new shapes
train_data = train_data.pipe(extract_week_hour)
test_data = test_data.pipe(extract_week_hour)
for frame in (test_data, train_data):
    print(frame.shape)


(8607230, 6)
(29118021, 7)


In [14]:
# Show the first rows of each set to check the derived columns
for frame in (train_data, test_data):
    print(frame.head())



        x       y  accuracy    time    place_id  weekday  hour
0  0.7941  9.0809        54  470702  8523065625        4    21
1  5.9567  4.7968        13  186555  1757726713        3    13
2  8.3078  7.0407        74  322648  1137537235        0     1
3  7.3665  2.5165        65  704587  6567393236        6     7
4  4.0961  1.1307        31  472130  7440663949        5    20
        x       y  accuracy    time  weekday  hour
0  0.1675  1.3608       107  930883        2    10
1  7.3909  2.5301        35  893017        4     3
2  8.0978  2.3473        62  976933        6    10
3  0.9990  1.0591        62  907285        0     1
4  0.6670  9.7254        40  914399        4    23


In [15]:
# Remove the raw 'time' column from both frames (weekday/hour were derived from it earlier)
test_data = test_data.drop('time', axis=1)
train_data = train_data.drop('time', axis=1)

In [16]:
# Print the first rows of both sets, followed by their shapes
for frame in (train_data, test_data):
    print(frame.head())
for frame in (train_data, test_data):
    print(frame.shape)



        x       y  accuracy    place_id  weekday  hour
0  0.7941  9.0809        54  8523065625        4    21
1  5.9567  4.7968        13  1757726713        3    13
2  8.3078  7.0407        74  1137537235        0     1
3  7.3665  2.5165        65  6567393236        6     7
4  4.0961  1.1307        31  7440663949        5    20
        x       y  accuracy  weekday  hour
0  0.1675  1.3608       107        2    10
1  7.3909  2.5301        35        4     3
2  8.0978  2.3473        62        6    10
3  0.9990  1.0591        62        0     1
4  0.6670  9.7254        40        4    23
(29118021, 6)
(8607230, 5)


In [17]:
# Keep only training rows whose 'place_id' occurs more than 3 times
place_counts = train_data['place_id'].value_counts()
valid_places = place_counts.loc[place_counts > 3].index
is_frequent_place = train_data['place_id'].isin(valid_places)
train_data = train_data.loc[is_frequent_place].reset_index(drop=True)
from sklearn.preprocessing import StandardScaler

# Standardize every feature column (everything except the 'place_id' label).
# The scaler is fitted on the training features and then reused on the test set.
scaler = StandardScaler()
feature_cols = [col for col in train_data.columns if col != 'place_id']
train_data[feature_cols] = scaler.fit_transform(train_data[feature_cols])

# Select the test columns by name so their order matches what the scaler was
# fitted on, and wrap the result back into a DataFrame: a bare ndarray would
# drop the column labels that later cells filter on.
test_data = pd.DataFrame(
    scaler.transform(test_data[feature_cols]),
    columns=feature_cols,
    index=test_data.index,
)

print(train_data.shape)
print(test_data.shape)


(29116952, 6)
(8607230, 5)


In [18]:

print(train_data.head())

          x         y  accuracy    place_id   weekday      hour
0 -1.471750  1.412667 -0.251404  8523065625  0.510370  1.374086
1  0.334872 -0.071001 -0.608703  1757726713  0.008003  0.218022
2  1.157626  0.706106 -0.077112  1137537235 -1.499099 -1.516074
3  0.828224 -0.860714 -0.155544  6567393236  1.515105 -0.649026
4 -0.316234 -1.340644 -0.451840  7440663949  1.012737  1.229578


In [19]:
# Restrict both datasets to a small rectangular window in (x, y).
# NOTE: this cell runs after the standardization cell, so the bounds below are
# in standardized units, not raw coordinates. Adjust them to the actual
# distribution as needed.
x_min, x_max = 0, 0.3
y_min, y_max = 0, 0.3


def _filter_xy_window(frame, x_range, y_range):
    """Return the rows of ``frame`` whose x and y fall inside the closed ranges.

    ``x_range`` and ``y_range`` are ``(low, high)`` pairs; both ends are
    inclusive. The result is re-indexed from 0 so the index stays contiguous.
    """
    inside = frame['x'].between(*x_range) & frame['y'].between(*y_range)
    return frame[inside].reset_index(drop=True)


# Filter train_data
train_data = _filter_xy_window(train_data, (x_min, x_max), (y_min, y_max))

# Filter test_data. It may be a NumPy array (e.g. the raw output of
# scaler.transform); in that case label the columns temporarily so the filter
# can address x/y by name, then hand back an array again.
if isinstance(test_data, np.ndarray):
    test_data = _filter_xy_window(
        pd.DataFrame(test_data, columns=feature_cols),
        (x_min, x_max),
        (y_min, y_max),
    ).values
else:
    test_data = _filter_xy_window(test_data, (x_min, x_max), (y_min, y_max))

# .shape exists on both ndarray and DataFrame, so no type check is needed here
print("After filtering:")
print("train_data shape:", train_data.shape)
print("test_data shape:", test_data.shape)


After filtering:
train_data shape: (229185, 6)
test_data shape: (69364, 5)


In [20]:
from sklearn.model_selection import train_test_split

# 'place_id' is the label; every remaining column is treated as a feature
y = train_data['place_id']
X = train_data.drop('place_id', axis=1)

# Hold out 20% of the rows for evaluation; test_size can be adjusted as needed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

for caption, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(caption, part.shape)



X_train shape: (183348, 5)
X_test shape: (45837, 5)
y_train shape: (183348,)
y_test shape: (45837,)


In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Build a KNN classifier (n_neighbors can be tuned) and fit it on the training split
knn = KNeighborsClassifier(n_neighbors=5)
knn = knn.fit(X_train, y_train)

# Predict the held-out split and report the accuracy as a percentage
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy_pct = accuracy * 100
print("KNN模型在测试集上的准确率为: {:.2f}%".format(accuracy_pct))


KNN模型在测试集上的准确率为: 16.57%


In [22]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: neighbourhood size and vote weighting
param_grid = dict(
    n_neighbors=[3, 5, 7, 9, 11],
    weights=['uniform', 'distance'],
)

# Base estimator handed to the search
knn = KNeighborsClassifier()

# 5-fold grid search scored by accuracy; n_jobs=-1 uses all CPU cores
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# Fit on the training split only
grid_search.fit(X_train, y_train)

best_cv_pct = grid_search.best_score_ * 100
print("最佳参数:", grid_search.best_params_)
print("最佳交叉验证准确率: {:.2f}%".format(best_cv_pct))

# Evaluate the best estimator found by the search on the held-out split
best_knn = grid_search.best_estimator_
y_pred_best = best_knn.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print("GridSearchCV优化后KNN在测试集上的准确率为: {:.2f}%".format(accuracy_best * 100))




最佳参数: {'n_neighbors': 5, 'weights': 'distance'}
最佳交叉验证准确率: 19.40%
GridSearchCV优化后KNN在测试集上的准确率为: 20.80%


In [23]:
# 5-fold cross-validation over the full filtered training data (X, y).
# NOTE(review): `knn` is whichever estimator is currently bound to that name; if
# the grid-search cell ran last, that is the default-parameter
# KNeighborsClassifier(), not the tuned `best_knn` -- confirm this is the
# intended baseline.
cv_scores = cross_val_score(knn, X, y, cv=5)
print("train_data 5折交叉验证得分:", cv_scores)
print("train_data 5折交叉验证平均得分: {:.2f}%".format(cv_scores.mean() * 100))

# Cross-validate on test_data as well, but only when it carries labels.
# An ndarray is first given the feature column names so the check is by name.
if isinstance(test_data, np.ndarray):
    test_df = pd.DataFrame(test_data, columns=feature_cols)
else:
    test_df = test_data

if 'place_id' in test_df.columns:
    X_test_cv = test_df.drop(columns=['place_id'])
    y_test_cv = test_df['place_id']
    test_cv_scores = cross_val_score(knn, X_test_cv, y_test_cv, cv=5)
    print("test_data 5折交叉验证得分:", test_cv_scores)
    print("test_data 5折交叉验证平均得分: {:.2f}%".format(test_cv_scores.mean() * 100))



train_data 5折交叉验证得分: [0.1673757  0.1643214  0.16506316 0.16589218 0.16469228]
train_data 5折交叉验证平均得分: 16.55%
