# 案例：Facebook签到位置预测
- 预测用户签到的位置（kaggle）

In [96]:
import pandas as pd
import numpy as np

## 1、读取数据

In [97]:
# Load the training split and preview it; place_id is the label column.
train_csv_path = './day2资料/02-代码/FBlocation/train.csv'
fb_train = pd.read_csv(train_csv_path)
fb_train.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
0,0,0.7941,9.0809,54,470702,8523065625
1,1,5.9567,4.7968,13,186555,1757726713
2,2,8.3078,7.0407,74,322648,1137537235
3,3,7.3665,2.5165,65,704587,6567393236
4,4,4.0961,1.1307,31,472130,7440663949


In [98]:
fb_train.shape  # (rows, columns) of the training frame as loaded

(29118021, 6)

In [99]:
# Load the test split and preview its first rows.
test_csv_path = './day2资料/02-代码/FBlocation/test.csv'
fb_test = pd.read_csv(test_csv_path)
fb_test.head()

Unnamed: 0,row_id,x,y,accuracy,time
0,0,0.1675,1.3608,107,930883
1,1,7.3909,2.5301,35,893017
2,2,8.0978,2.3473,62,976933
3,3,0.999,1.0591,62,907285
4,4,0.667,9.7254,40,914399


In [100]:
fb_test.shape  # (rows, columns) of the test frame as loaded

(8607230, 5)

## 2、数据处理
- 目标：得到 特征值x、标签值y
    - （缩小数据范围）
    - time -> 年月日时分秒（再从中提取 日、星期、小时 作为特征）
    - 过滤签到次数少的地点

In [101]:
# Restrict the data to a small x/y window; otherwise the later steps take too long to run.
region_filter = "x < 2.5 & x > 2 & y < 1.5 & y > 1.0"
fb_train = fb_train.query(region_filter)
fb_train.shape

(83197, 6)

### （1）time -> 年月日时分秒

In [102]:
fb_train['time'].head()  # preview the raw integer time column before conversion

112     623174
180     610195
367     579667
874     143566
1022    207993
Name: time, dtype: int64

In [103]:
# Interpret the raw integer time column as seconds and convert it to timestamps.
raw_time = fb_train['time']
time_value = pd.to_datetime(raw_time, unit='s')
time_value.head()

112    1970-01-08 05:06:14
180    1970-01-08 01:29:55
367    1970-01-07 17:01:07
874    1970-01-02 15:52:46
1022   1970-01-03 09:46:33
Name: time, dtype: datetime64[ns]

In [104]:
# Wrap the timestamps in a DatetimeIndex so the individual
# calendar fields (day, weekday, hour, ...) can be read off directly.
date_time = pd.DatetimeIndex(data=time_value)
date_time

DatetimeIndex(['1970-01-08 05:06:14', '1970-01-08 01:29:55',
               '1970-01-07 17:01:07', '1970-01-02 15:52:46',
               '1970-01-03 09:46:33', '1970-01-06 19:49:38',
               '1970-01-06 13:33:24', '1970-01-02 22:49:55',
               '1970-01-04 14:30:10', '1970-01-07 16:57:44',
               ...
               '1970-01-02 09:24:50', '1970-01-01 10:29:34',
               '1970-01-09 11:38:46', '1970-01-02 03:42:14',
               '1970-01-04 22:02:44', '1970-01-09 08:31:25',
               '1970-01-07 12:29:49', '1970-01-09 20:46:26',
               '1970-01-02 18:11:58', '1970-01-01 22:06:09'],
              dtype='datetime64[ns]', name='time', length=83197, freq=None)

In [105]:
# Derive calendar features from the converted timestamps.
# Year and month were inspected by the original author and judged not useful,
# so only day-of-month, weekday and hour are kept.
# assign() builds a new frame instead of writing columns into the frame
# returned by query(), which avoids pandas' SettingWithCopyWarning and keeps
# this cell safe to re-run.
fb_train = fb_train.assign(
    date=date_time.day,
    weekday=date_time.weekday,
    hour=date_time.hour,
)

In [106]:
fb_train.head()  # preview the frame with the added date / weekday / hour columns

Unnamed: 0,row_id,x,y,accuracy,time,place_id,date,weekday,hour
112,112,2.236,1.3655,66,623174,7663031065,8,3,5
180,180,2.2003,1.2541,65,610195,2358558474,8,3,1
367,367,2.4108,1.3213,74,579667,6644108708,7,2,17
874,874,2.0822,1.1973,320,143566,3229876087,2,4,15
1022,1022,2.016,1.1659,65,207993,3244363975,3,5,9


### （2）过滤签到次数少的地点

In [107]:
# Count check-ins per place: group the rows by place_id, then count row_id in each group.
grouped_by_place = fb_train.groupby('place_id')
place_count = grouped_by_place['row_id'].count()
place_count.head()

place_id
1012165853     1
1013991737     3
1014605271    28
1015645743     4
1017236154    31
Name: row_id, dtype: int64

In [108]:
# Keep only the places that have more than 3 check-ins.
frequent_mask = place_count > 3
place_count = place_count[frequent_mask]
place_count.head()

place_id
1014605271    28
1015645743     4
1017236154    31
1024951487     5
1028119817     4
Name: row_id, dtype: int64

In [109]:
# Boolean mask over fb_train rows: True where the row's place_id is one of the kept places.
kept_place_ids = place_count.index.values
place_get = fb_train['place_id'].isin(kept_place_ids)
place_get.head()

112      True
180     False
367      True
874      True
1022     True
Name: place_id, dtype: bool

In [110]:
# Apply the mask to drop rows whose place has too few check-ins.
fb_train_final = fb_train.loc[place_get]
fb_train_final.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id,date,weekday,hour
112,112,2.236,1.3655,66,623174,7663031065,8,3,5
367,367,2.4108,1.3213,74,579667,6644108708,7,2,17
874,874,2.0822,1.1973,320,143566,3229876087,2,4,15
1022,1022,2.016,1.1659,65,207993,3244363975,3,5,9
1045,1045,2.3859,1.166,498,503378,6438240873,6,1,19


In [111]:
fb_train_final.shape  # (rows, columns) left after filtering out rarely visited places

(80910, 9)

### （3）筛选特征值和标签值

In [112]:
# Feature matrix: position, accuracy, and the derived time columns.
feature_columns = ['x', 'y', 'accuracy', 'date', 'weekday', 'hour']
x = fb_train_final[feature_columns]
x.head()

Unnamed: 0,x,y,accuracy,date,weekday,hour
112,2.236,1.3655,66,8,3,5
367,2.4108,1.3213,74,7,2,17
874,2.0822,1.1973,320,2,4,15
1022,2.016,1.1659,65,3,5,9
1045,2.3859,1.166,498,6,1,19


In [113]:
# Label vector: the place each check-in belongs to.
y = fb_train_final.loc[:, 'place_id']
y.head()

112     7663031065
367     6644108708
874     3229876087
1022    3244363975
1045    6438240873
Name: place_id, dtype: int64

## 3、特征工程
- 数据集划分
- 标准化

### （1）数据集划分

In [114]:
from sklearn.model_selection import train_test_split

In [115]:
# Split features and labels into train and test parts; random_state is fixed at 10.
split_parts = train_test_split(x, y, random_state=10)
x_train, x_test, y_train, y_test = split_parts

### （2）标准化

In [116]:
from sklearn.preprocessing import StandardScaler

In [117]:
# Fit the scaler on the training features only, then apply
# the same fitted transform to both the training and the test features.
trans = StandardScaler().fit(x_train)
x_train = trans.transform(x_train)
x_test = trans.transform(x_test)

## 4、kNN + 网格搜索 + 交叉验证

In [118]:
from sklearn.neighbors import KNeighborsClassifier

esti = KNeighborsClassifier() # instantiate a k-nearest-neighbours estimator with default settings

In [119]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Candidate values of k to try.
param_dict = {"n_neighbors": [3, 5, 7, 9]}
# Build the search around a fresh KNN estimator instead of around `esti` itself:
# `esti = GridSearchCV(esti, ...)` would wrap the previous GridSearchCV in another
# one if this cell were run twice, and the n_neighbors grid would then be invalid.
esti = GridSearchCV(KNeighborsClassifier(), param_grid=param_dict, cv=5)
esti.fit(x_train, y_train)  # run the 5-fold grid search on the training split



## 5、模型评估

In [120]:
y_pred = esti.predict(x_test)  # predict labels for the held-out split
# Accuracy is the fraction of predictions equal to the true label. The mean of
# the boolean comparison gives that directly, without allocating an array of
# ones and summing it with the Python builtin just to count elements.
accuracy_1 = np.mean(y_pred == y_test)
print("accuracy_1 =", accuracy_1)

accuracy_1 = 0.3658295432074352


In [121]:
accuracy_2 = esti.score(x_test, y_test) # compute accuracy via the estimator's own score method
print("accuracy_2 =", accuracy_2)

accuracy_2 = 0.3658295432074352


In [122]:
# Best parameters found by the grid search: best_params_
print("最佳参数：\n", esti.best_params_)

最佳参数：
 {'n_neighbors': 5}


In [123]:
# Best cross-validation score found by the grid search: best_score_
print("最佳结果：\n", esti.best_score_)

最佳结果：
 0.35170228606837484


In [124]:
# Estimator configured with the best parameters: best_estimator_
print("最佳估计器:\n", esti.best_estimator_)

最佳估计器:
 KNeighborsClassifier()
