# 过程
1. 获取数据集
2. 数据集基本处理
3. 特征值标准化
4. 机器学习(模型训练)
5. 模型评估


In [8]:
import numpy as np
import pandas as pd

# 划分训练测试 交叉
from sklearn.model_selection import train_test_split, GridSearchCV

# 标准化处理
from sklearn.preprocessing import StandardScaler

# 邻居
from sklearn.neighbors import KNeighborsClassifier

# 1.获取数据

In [2]:
train_data = pd.read_csv("../data/FBlocation/train.csv")
train_data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
0,0,0.7941,9.0809,54,470702,8523065625
1,1,5.9567,4.7968,13,186555,1757726713
2,2,8.3078,7.0407,74,322648,1137537235
3,3,7.3665,2.5165,65,704587,6567393236
4,4,4.0961,1.1307,31,472130,7440663949


In [5]:
train_data.shape

(29118021, 6)

# 2.基本数据处理
## 缩小数据范围

In [4]:
small_train_data = train_data.query("x>2.0 & x<2.5 & y>2.0 & y<2.5")
small_train_data.shape

(71664, 6)

In [6]:
# 等价写法
small_train_data1 = train_data[
    (train_data["x"] > 2.0)
    & (train_data["x"] < 2.5)
    & (train_data["y"] > 2.0)
    & (train_data["y"] < 2.5)
]
small_train_data1.shape

(71664, 6)

In [9]:
np.all(small_train_data == small_train_data1)

True

## 选择时间特征

In [10]:
small_train_data["time"].head()

163     669737
310     234719
658     502343
1368    319822
1627    595084
Name: time, dtype: int64

In [11]:
# 时间戳转换为真实的时间       unit="s" 单位是秒
time = pd.to_datetime(small_train_data["time"], unit="s")
time.head()

163    1970-01-08 18:02:17
310    1970-01-03 17:11:59
658    1970-01-06 19:32:23
1368   1970-01-04 16:50:22
1627   1970-01-07 21:18:04
Name: time, dtype: datetime64[ns]

In [12]:
# 转换为DataFrame,让其可以读取日期和时间
time_pd = pd.DatetimeIndex(time)

In [14]:
time_pd.day, time_pd.hour, time_pd.weekday

(Int64Index([8, 3, 6, 4, 7, 2, 7, 5, 1, 9,
             ...
             9, 8, 7, 7, 6, 3, 4, 1, 3, 2],
            dtype='int64', name='time', length=71664),
 Int64Index([18, 17, 19, 16, 21,  3,  3,  3, 18,  7,
             ...
             20,  9,  4, 22, 23, 12, 15, 20,  9, 20],
            dtype='int64', name='time', length=71664),
 Int64Index([3, 5, 1, 6, 2, 4, 2, 0, 3, 4,
             ...
             4, 3, 2, 2, 1, 5, 6, 3, 5, 4],
            dtype='int64', name='time', length=71664))

In [17]:
#                    行,列
small_train_data.loc[:, "day"] = time_pd.day
small_train_data.loc[:, "hour"] = time_pd.hour
small_train_data.loc[:, "weekday"] = time_pd.weekday

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_train_data.loc[:, "day"] = time_pd.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_train_data.loc[:, "hour"] = time_pd.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_train_data.loc[:, "weekday"] = time_pd.weekday


In [18]:
small_train_data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id,day,hour,weekday
163,163,2.1663,2.3755,84,669737,3869813743,8,18,3
310,310,2.3695,2.2034,3,234719,2636621520,3,17,5
658,658,2.3236,2.1768,66,502343,7877745055,6,19,1
1368,1368,2.2613,2.3392,73,319822,9775192577,4,16,6
1627,1627,2.3331,2.0011,66,595084,6731326909,7,21,2


## 去掉签到较少的地方

In [19]:
# 按照分类获取总数
place_count = small_train_data.groupby("place_id").count()
place_count.head()

Unnamed: 0_level_0,row_id,x,y,accuracy,time,day,hour,weekday
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1006234733,1,1,1,1,1,1,1,1
1008823061,4,4,4,4,4,4,4,4
1012580558,3,3,3,3,3,3,3,3
1025585791,21,21,21,21,21,21,21,21
1026507711,220,220,220,220,220,220,220,220


In [20]:
place_count.shape

(2524, 8)

In [21]:
# 根据row_id 去掉少的值
place_count = place_count[place_count["row_id"] > 3]
place_count.head()

Unnamed: 0_level_0,row_id,x,y,accuracy,time,day,hour,weekday
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1008823061,4,4,4,4,4,4,4,4
1025585791,21,21,21,21,21,21,21,21
1026507711,220,220,220,220,220,220,220,220
1032417180,10,10,10,10,10,10,10,10
1040557418,123,123,123,123,123,123,123,123


In [22]:
place_count.shape

(929, 8)

In [24]:
# place_count的index就是facebook_data的place_id,就是说如果face的id在count中,说明去的次数多
small_train_data["place_id"].isin(place_count.index)

163         True
310         True
658         True
1368        True
1627        True
            ... 
29116142    True
29116267    True
29116295    True
29116475    True
29117203    True
Name: place_id, Length: 71664, dtype: bool

In [25]:
# row_id不是
small_train_data["place_id"].isin(place_count["row_id"])

163         False
310         False
658         False
1368        False
1627        False
            ...  
29116142    False
29116267    False
29116295    False
29116475    False
29117203    False
Name: place_id, Length: 71664, dtype: bool

In [27]:
small_train_data2 = small_train_data[
    small_train_data["place_id"].isin(place_count.index)
]
small_train_data2.shape

(69264, 9)

## 确定特征值和目标值

In [29]:
x = small_train_data2[["x", "y", "accuracy", "day", "hour", "weekday"]]
x.head()

Unnamed: 0,x,y,accuracy,day,hour,weekday
163,2.1663,2.3755,84,8,18,3
310,2.3695,2.2034,3,3,17,5
658,2.3236,2.1768,66,6,19,1
1368,2.2613,2.3392,73,4,16,6
1627,2.3331,2.0011,66,7,21,2


In [30]:
y = small_train_data2[["place_id"]]
y.head()

Unnamed: 0,place_id
163,3869813743
310,2636621520
658,7877745055
1368,9775192577
1627,6731326909


## 分割数据集

In [38]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=0)

# 特征工程--特征预处理(标准化)

In [32]:
transfer = StandardScaler()

In [39]:
transfer.fit(x_train)
x_train1 = transfer.transform(x_train)
x_val1 = transfer.transform(x_val)

# knn+cv

## 实例化KNN和CV

In [40]:
estimator = KNeighborsClassifier()

In [41]:
param_grid = {"n_neighbors": [3, 5, 7, 9]}

In [42]:
gs = GridSearchCV(estimator, param_grid=param_grid, cv=3, n_jobs=4)

## 模型训练

In [43]:
gs.fit(x_train1, y_train)

  return self._fit(X, y)


# 模型评估
## 基本评估方式

In [46]:
score = gs.score(x_val1, y_val)
print("最后预测的准确率为: ", score)

最后预测的准确率为:  0.3722659351764961


In [47]:
# predict 预测
y_predict = gs.predict(x_val1)
print("最后的预测值为:\n", y_predict)

最后的预测值为:
 [1837168749 1891783132 5737973743 ... 3455925971 4707444542 3958678140]


## 使用交叉验证后的评估方式

In [48]:
print("在交叉验证中验证的最好结果:\n", gs.best_score_)

在交叉验证中验证的最好结果:
 0.34565338871781054


In [50]:
print("最好的参数模型:\n", gs.best_params_)

最好的参数模型:
 {'n_neighbors': 5}


In [51]:
print("每次交叉验证后的验证集准确率结果和训练集准确率结果:\n", gs.cv_results_)

每次交叉验证后的验证集准确率结果和训练集准确率结果:
 {'mean_fit_time': array([0.15746665, 0.18714786, 0.19809016, 0.18892773]), 'std_fit_time': array([0.00272943, 0.02727897, 0.01070036, 0.00389571]), 'mean_score_time': array([2.23019393, 2.35163593, 2.40535355, 2.59013565]), 'std_score_time': array([0.02661491, 0.07020273, 0.01862799, 0.00307081]), 'param_n_neighbors': masked_array(data=[3, 5, 7, 9],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_neighbors': 3}, {'n_neighbors': 5}, {'n_neighbors': 7}, {'n_neighbors': 9}], 'split0_test_score': array([0.3352282 , 0.34573115, 0.34372801, 0.34020898]), 'split1_test_score': array([0.33502978, 0.34656199, 0.34547916, 0.34060639]), 'split2_test_score': array([0.33465079, 0.34466703, 0.34596643, 0.3402274 ]), 'mean_test_score': array([0.33496959, 0.34565339, 0.34505786, 0.34034759]), 'std_test_score': array([0.00023954, 0.00077557, 0.00096116, 0.00018315]), 'rank_test_score': array([4, 1, 2, 3])}
