In [None]:
# 导入必要的库
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from datetime import datetime

# Load the Facebook location training set from disk.
print("读取Facebook位置数据集...")
fb_train = pd.read_csv('./data/FBlocation/train.csv')

# Convert the raw `time` column into pandas datetimes.
print("将time列转换为年月日时分秒格式...")
# NOTE(review): this treats `time` as a Unix timestamp in seconds; the unit is
# an assumption made by the original author — confirm against the dataset docs.
fb_train['time_formatted'] = pd.to_datetime(fb_train['time'], unit='s')

# Derive weekday, hour and day-of-month features from the parsed timestamp.
# The progress message used to announce "minute" although the code extracts
# the day; the message now matches what is actually computed.
print("从时间中提取星期几、小时和天作为新特征...")
fb_train['weekday'] = fb_train['time_formatted'].dt.dayofweek  # 0-6, 0 is Monday
fb_train['hour'] = fb_train['time_formatted'].dt.hour  # 0-23
fb_train['day'] = fb_train['time_formatted'].dt.day  # day of month



In [None]:
# Quick look at the loaded frame: first five rows as plain text, then the
# column / dtype / memory summary.
print(fb_train.head(n=5))
fb_train.info()

In [4]:
# Drop the raw timestamp and the row identifier; neither is used as a feature.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time no longer raises KeyError.
fb_train = fb_train.drop(columns=['time', 'row_id'], errors='ignore')


In [None]:
# Print the column / dtype / memory summary of fb_train.
fb_train.info()


In [None]:
print("\n根据x和y特征列筛选数据...")
# Keep only the check-ins inside a small spatial window
# (1.0 < x < 1.25 and 2.5 < y < 2.75) to make the problem tractable.
filtered_fb_train = fb_train.query("x > 1.0 &  x < 1.25 & y > 2.5 & y < 2.75")

print(f"筛选后的样本数量: {filtered_fb_train.shape[0]}")
print(f"筛选前的样本数量: {fb_train.shape[0]}")
print(f"筛选比例: {filtered_fb_train.shape[0]/fb_train.shape[0]:.2%}")

# Remember the size of the spatial subset: it is the baseline for the
# "removed ratio" reported at the end of this cell. The original code used
# an undefined name here (`len(x)`), which raised NameError.
n_before_place_filter = filtered_fb_train.shape[0]

# Count how many check-ins each place_id has inside the window.
print("\n统计每个place_id的出现次数...")
place_counts = filtered_fb_train['place_id'].value_counts()

# place_ids with more than 3 check-ins are considered valid targets.
valid_places = place_counts[place_counts > 3].index

# Keep only the rows whose place_id is valid.
print("去除place_id出现次数小于等于3的数据...")
filtered_fb_train = filtered_fb_train[filtered_fb_train['place_id'].isin(valid_places)]

print(f"去除后的样本数量: {filtered_fb_train.shape[0]}")
# Guard against an empty spatial window so the ratio never divides by zero.
if n_before_place_filter:
    removed_ratio = 1 - filtered_fb_train.shape[0] / n_before_place_filter
else:
    removed_ratio = 0.0
print(f"去除的样本比例: {removed_ratio:.2%}")

In [None]:
# Build the feature matrix and label vector from the FILTERED subset.
# The original cell used the unfiltered `fb_train`, which silently discarded
# the spatial / rare-place filtering done in the previous cell and fed the
# whole dataset to KNN.
X = filtered_fb_train.drop(['place_id', 'time_formatted'], axis=1)
y = filtered_fb_train['place_id']

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
print("划分训练集和测试集...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features. The scaler is fitted on the training split only and
# then applied to the test split, so no test statistics leak into training.
print("标准化特征...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a distance-based KNN classifier that votes among the 5 nearest neighbours.
print("训练KNN模型...")
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

# Report accuracy on both splits.
print("评估模型...")
train_score = knn.score(X_train_scaled, y_train)
test_score = knn.score(X_test_scaled, y_test)

print(f"训练集准确率: {train_score:.4f}")
print(f"测试集准确率: {test_score:.4f}")
