# K近邻（KNN）

In [1]:
import pandas as pd

In [2]:
# 1、获取数据
data = pd.read_csv("../data_big/FBlocation/train.csv")

In [3]:
data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
0,0,0.7941,9.0809,54,470702,8523065625
1,1,5.9567,4.7968,13,186555,1757726713
2,2,8.3078,7.0407,74,322648,1137537235
3,3,7.3665,2.5165,65,704587,6567393236
4,4,4.0961,1.1307,31,472130,7440663949


### 数据处理

In [4]:
# 2、基本的数据处理
# 1）缩小数据范围
data = data.query("x < 2.5 & x > 2 & y < 1.5 & y > 1.0")

In [5]:
data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
112,112,2.236,1.3655,66,623174,7663031065
180,180,2.2003,1.2541,65,610195,2358558474
367,367,2.4108,1.3213,74,579667,6644108708
874,874,2.0822,1.1973,320,143566,3229876087
1022,1022,2.016,1.1659,65,207993,3244363975


In [6]:
# 2）处理时间特征
time_value = pd.to_datetime(data["time"], unit="s")

In [7]:
date = pd.DatetimeIndex(time_value)
date

DatetimeIndex(['1970-01-08 05:06:14', '1970-01-08 01:29:55',
               '1970-01-07 17:01:07', '1970-01-02 15:52:46',
               '1970-01-03 09:46:33', '1970-01-06 19:49:38',
               '1970-01-06 13:33:24', '1970-01-02 22:49:55',
               '1970-01-04 14:30:10', '1970-01-07 16:57:44',
               ...
               '1970-01-02 09:24:50', '1970-01-01 10:29:34',
               '1970-01-09 11:38:46', '1970-01-02 03:42:14',
               '1970-01-04 22:02:44', '1970-01-09 08:31:25',
               '1970-01-07 12:29:49', '1970-01-09 20:46:26',
               '1970-01-02 18:11:58', '1970-01-01 22:06:09'],
              dtype='datetime64[ns]', name='time', length=83197, freq=None)

In [8]:
data["day"] = date.day

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["day"] = date.day


In [9]:
data["weekday"] = date.weekday

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["weekday"] = date.weekday


In [10]:
data["hour"] = date.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["hour"] = date.hour


In [11]:
data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id,day,weekday,hour
112,112,2.236,1.3655,66,623174,7663031065,8,3,5
180,180,2.2003,1.2541,65,610195,2358558474,8,3,1
367,367,2.4108,1.3213,74,579667,6644108708,7,2,17
874,874,2.0822,1.1973,320,143566,3229876087,2,4,15
1022,1022,2.016,1.1659,65,207993,3244363975,3,5,9


In [12]:
# 3）过滤签到次数少的地点
place_count = data.groupby("place_id").count()["row_id"]

In [13]:
data.groupby("place_id").count().head()

Unnamed: 0_level_0,row_id,x,y,accuracy,time,day,weekday,hour
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1012165853,1,1,1,1,1,1,1,1
1013991737,3,3,3,3,3,3,3,3
1014605271,28,28,28,28,28,28,28,28
1015645743,4,4,4,4,4,4,4,4
1017236154,31,31,31,31,31,31,31,31


In [14]:
place_count[place_count > 3].head()

place_id
1014605271    28
1015645743     4
1017236154    31
1024951487     5
1028119817     4
Name: row_id, dtype: int64

In [15]:
data_final = data[data["place_id"].isin(place_count[place_count > 3].index.values)]

In [16]:
data_final.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id,day,weekday,hour
112,112,2.236,1.3655,66,623174,7663031065,8,3,5
367,367,2.4108,1.3213,74,579667,6644108708,7,2,17
874,874,2.0822,1.1973,320,143566,3229876087,2,4,15
1022,1022,2.016,1.1659,65,207993,3244363975,3,5,9
1045,1045,2.3859,1.166,498,503378,6438240873,6,1,19


In [17]:
# 筛选特征值和目标值
x = data_final[["x", "y", "accuracy", "day", "weekday", "hour"]]
y = data_final["place_id"]

In [18]:
x.head()

Unnamed: 0,x,y,accuracy,day,weekday,hour
112,2.236,1.3655,66,8,3,5
367,2.4108,1.3213,74,7,2,17
874,2.0822,1.1973,320,2,4,15
1022,2.016,1.1659,65,3,5,9
1045,2.3859,1.166,498,6,1,19


In [19]:
y.head()

112     7663031065
367     6644108708
874     3229876087
1022    3244363975
1045    6438240873
Name: place_id, dtype: int64

### 数据集划分

In [20]:
# 数据集划分
from sklearn.model_selection import train_test_split

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

### 特征工程

In [23]:
# 3）特征工程：标准化
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# 4）KNN算法预估器
estimator = KNeighborsClassifier()

# 加入网格搜索与交叉验证
# 参数准备
param_dict = {"n_neighbors": [3, 5, 7, 9]}
estimator = GridSearchCV(estimator, param_grid=param_dict, cv=3)
estimator.fit(x_train, y_train)

# 5）模型评估
# 方法1：直接比对真实值和预测值
y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
print("直接比对真实值和预测值:\n", y_test == y_predict)

# 方法2：计算准确率
score = estimator.score(x_test, y_test)
print("准确率为：\n", score)

# 最佳参数：best_params_
print("最佳参数：\n", estimator.best_params_)
# 最佳结果：best_score_
print("最佳结果：\n", estimator.best_score_)
# 最佳估计器：best_estimator_
print("最佳估计器:\n", estimator.best_estimator_)
# 交叉验证结果：cv_results_
print("交叉验证结果:\n", estimator.cv_results_)



y_predict:
 [2517696955 7871954785 1663763627 ... 7605688218 1326517230 7644714882]
直接比对真实值和预测值:
 19923715    False
20056425    False
5698016     False
330734      False
23166517    False
            ...  
10675165    False
7315312     False
17918130    False
5687333     False
14008537     True
Name: place_id, Length: 20228, dtype: bool
准确率为：
 0.36662052600355943
最佳参数：
 {'n_neighbors': 5}
最佳结果：
 0.3344484199973306
最佳估计器:
 KNeighborsClassifier()
交叉验证结果:
 {'mean_fit_time': array([0.05063645, 0.05199607, 0.05191223, 0.04939532]), 'std_fit_time': array([0.00070872, 0.00076641, 0.00205346, 0.00300211]), 'mean_score_time': array([1.22845928, 1.2859381 , 1.35111109, 1.42712569]), 'std_score_time': array([0.00512754, 0.03059895, 0.00896653, 0.01926852]), 'param_n_neighbors': masked_array(data=[3, 5, 7, 9],
             mask=[False, False, False, False],
       fill_value=999999), 'params': [{'n_neighbors': 3}, {'n_neighbors': 5}, {'n_neighbors': 7}, {'n_neighbors': 9}], 'split0_test_score': ar