In [24]:
import pandas as pd

In [25]:
#1.获取数据
data = pd.read_csv("./fb/train.csv")

In [26]:
data = data.query("x < 2.5 & y < 1.5 & y > 1.0")

In [27]:
# 处理时间特征
time_value = pd.to_datetime(data["time"],unit="s")

In [28]:
date = pd.DatetimeIndex(time_value)

In [29]:
data["day"] = date.day

In [30]:
data["weekday"] = date.weekday

In [31]:
data["hour"] = date.hour

In [32]:
# 3.过滤签到次数少的地点
place_count = data.groupby("place_id").count()["row_id"]

In [33]:
place_count[place_count > 50].head()

place_id
1007357594      66
1014605271     123
1015645743     895
1017236154    1255
1024951487     316
Name: row_id, dtype: int64

In [34]:
data_final = data[data["place_id"].isin(place_count[place_count > 50].index.values)]

In [35]:
data_final.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id,day,weekday,hour
12,12,0.8829,1.3445,64,574488,7652380351,7,2,15
39,39,1.2191,1.3462,743,477469,6171384989,6,1,12
109,109,0.4995,1.4831,155,769344,9841775341,9,4,21
112,112,2.236,1.3655,66,623174,7663031065,8,3,5
172,172,0.7061,1.3806,52,71867,2143257457,1,3,19


In [36]:
# 筛选特征值和目标值
x = data_final[["x", "y", "accuracy", "day", "weekday", "hour"]]
y = data_final["place_id"]

In [37]:
x.head()

Unnamed: 0,x,y,accuracy,day,weekday,hour
12,0.8829,1.3445,64,7,2,15
39,1.2191,1.3462,743,6,1,12
109,0.4995,1.4831,155,9,4,21
112,2.236,1.3655,66,8,3,5
172,0.7061,1.3806,52,1,3,19


In [38]:
y.head()

12     7652380351
39     6171384989
109    9841775341
112    7663031065
172    2143257457
Name: place_id, dtype: int64

In [39]:
# 数据集划分
from sklearn.model_selection import train_test_split

In [40]:
x_train,x_test,y_train,y_test = train_test_split(x,y)

In [41]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [42]:
# 3.特征工程：标准化
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# 4.KNN算法预估器
estimator = KNeighborsClassifier()

# 加入网格搜索和交叉验证
# 参数准备
param_dict = {"n_neighbors": [2, 3, 5 ,7]}
estimator = GridSearchCV(estimator=estimator, param_grid=param_dict, cv=5)
estimator.fit(x_train, y_train)

# 5.模型评估
# 方法一：直接比对真实值
y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
print("直接比对真实值和预测值：\n", y_test == y_predict)

# 方法二：计算准确率
score = estimator.score(x_test, y_test)
print("准确率为：\n", score)

# 最佳参数：best_params
print("最佳参数：\n", estimator.best_params_)
# 最佳结果：best_score
print("最佳结果：\n", estimator.best_score_)
# 最佳估计器：best_estimator
print("最佳估计器：\n", estimator.best_estimator_)
# 交叉验证结果：cv_results
print("交叉验证结果：\n", estimator.cv_results_)

y_predict:
 [4723615516 2367979052 4182637650 ... 2082127512 8286292454 2191807392]
直接比对真实值和预测值：
 27273679    False
3054693     False
8295402     False
2617968      True
14153083    False
            ...  
14242132     True
23047811    False
11849162    False
10241932     True
20422357    False
Name: place_id, Length: 86293, dtype: bool
准确率为：
 0.3199448390947122
最佳参数：
 {'n_neighbors': 5}
最佳结果：
 0.3040760808808915
最佳估计器：
 KNeighborsClassifier()
交叉验证结果：
 {'mean_fit_time': array([0.80736194, 0.81566744, 0.80941987, 0.82480597]), 'std_fit_time': array([0.04270669, 0.01798544, 0.02058797, 0.02687576]), 'mean_score_time': array([2.18343716, 2.34363623, 2.65644851, 2.86609797]), 'std_score_time': array([0.09662719, 0.02758012, 0.02596084, 0.03609365]), 'param_n_neighbors': masked_array(data=[2, 3, 5, 7],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_neighbors': 2}, {'n_neighbors': 3}, {'n_neighbors': 5}, {'n_neighbors': 7}], '