In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns

In [3]:
# Load the raw records and convert the 'time' column to pandas datetimes.
df = pd.read_csv('mock_data.csv').assign(
    time=lambda frame: pd.to_datetime(frame['time'])
)

In [9]:
# Sort by product combination and then by time, so that diff() below measures
# the gap between consecutive records of the same combination.
group_keys = ['product_type1', 'product_type2']
df = df.sort_values(group_keys + ['time'])

# Gap to the previous record of the same combination, in hours.
# The first record of every combination has no predecessor and gets NaN.
df['time_diff'] = df.groupby(group_keys)['time'].diff().dt.total_seconds() / 3600


def _most_common_interval(intervals):
    """Return the mode of *intervals*, or NaN when there is no valid value.

    ``Series.mode()`` ignores NaN, so a combination with a single record
    (whose only ``time_diff`` is NaN) yields an empty result; indexing that
    with ``[0]`` would raise. When several values tie, the smallest one is
    returned because ``mode()`` returns its values sorted.
    """
    modes = intervals.mode()
    if modes.empty:
        return np.nan
    return modes.iloc[0]


# Expected interval of each product combination (the most frequent gap).
expected_intervals = df.groupby(group_keys)['time_diff'].agg(_most_common_interval)

# Anomaly detection helper
def detect_time_anomalies(group, expected_interval, tolerance=0.2):
    """Flag rows whose time gap deviates too much from the expected interval.

    Args:
        group: DataFrame holding a ``time_diff`` column.
        expected_interval: Reference interval, in the same unit as
            ``time_diff``.
        tolerance: Allowed deviation as a fraction of ``expected_interval``.
            Defaults to 0.2 (20%), the threshold used originally.

    Returns:
        Boolean Series indexed like ``group``; True where the absolute
        deviation exceeds ``expected_interval * tolerance``. Rows with a NaN
        ``time_diff`` (or a NaN ``expected_interval``) compare as False.
    """
    # Absolute deviation of each gap from the expected interval.
    deviation = (group['time_diff'] - expected_interval).abs()
    # A gap is anomalous when it deviates by more than the tolerated fraction.
    return deviation > (expected_interval * tolerance)

# Run the anomaly check for every product combination.
# Flags are written back through the row index rather than collected in a
# positional list: this keeps each flag attached to its own row regardless of
# group iteration order, and rows that groupby skips (NaN in a key column)
# simply stay False instead of causing a length mismatch on assignment.
is_anomaly_flags = pd.Series(False, index=df.index)
for group_key, group_data in df.groupby(['product_type1', 'product_type2']):
    group_flags = detect_time_anomalies(group_data, expected_intervals.loc[group_key])
    is_anomaly_flags.loc[group_flags.index] = group_flags

# Kept as a plain list for any later cell that still refers to `anomalies`.
anomalies = is_anomaly_flags.tolist()

df['is_anomaly'] = is_anomaly_flags

# Split the records into anomalous and normal subsets using the boolean flag.
anomaly_data = df.loc[df['is_anomaly']]
normal_data = df.loc[~df['is_anomaly']]

# Print summary counts: total, anomalous, normal.
print(f"总数据点数量: {len(df)}")
print(f"检测到的异常数据点数量: {len(anomaly_data)}")
print(f"正常数据点数量: {len(normal_data)}")

# Create a time-series scatter plot for each product combination and save it
# as 'time_series_<prod1>_<prod2>.png'.
# Only the first 5 combinations are plotted to avoid producing too many
# figures; head(5) replaces rebuilding and searching the key list on every
# iteration just to decide when to stop.
# NOTE(review): the recorded cell output shows warnings raised at
# tight_layout()/savefig(); presumably the default font lacks the CJK glyphs
# used in the title and axis labels -- confirm and configure a suitable font.
for (prod1, prod2), expected_interval in expected_intervals.head(5).items():
    group_data = df[(df['product_type1'] == prod1) & (df['product_type2'] == prod2)]

    plt.figure(figsize=(15, 5))

    # Anomalous points are drawn in red, normal points in blue.
    point_colors = group_data['is_anomaly'].map({True: 'red', False: 'blue'})
    plt.scatter(group_data['time'], group_data['price1'],
                c=point_colors,
                alpha=0.6)

    plt.title(f'产品组合 {prod1}-{prod2} 的时间序列分析\n预期时间间隔: {expected_interval:.2f}小时')
    plt.xlabel('时间')
    plt.ylabel('价格1')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f'time_series_{prod1}_{prod2}.png')
    # Close the figure so open figures do not accumulate across iterations.
    plt.close()

# Show the anomalous records, restricted to the columns of interest and
# ordered by product combination and time.
print("\n异常数据点详细信息：")
detail_columns = ['product_type1', 'product_type2', 'time', 'time_diff', 'price1', 'price2']
anomaly_details = anomaly_data.loc[:, detail_columns]
sorted_details = anomaly_details.sort_values(['product_type1', 'product_type2', 'time'])
print(sorted_details)

# Count the anomalous records of each product combination.
anomalies_by_combination = anomaly_data.groupby(['product_type1', 'product_type2'])
anomaly_counts = anomalies_by_combination.size()
print("\n各产品组合的异常数据点数量：")
print(anomaly_counts)

# Persist the anomalous records (without the index column).
anomaly_data.to_csv('detected_time_anomalies.csv', index=False)
print("\n异常数据已保存到 detected_time_anomalies.csv")

总数据点数量: 9562
检测到的异常数据点数量: 223
正常数据点数量: 9339


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(f'time_series_{prod1}_{prod2}.png')
  plt.savefig(f'time_series_{prod1}_{prod2}.png')
  plt.savefig(f'time_series_{prod1}_{prod2}.png')
  plt.savefig(f'time_series_{prod1}_{prod2}.png')
  plt.savefig(f'time_series_{prod1}_{prod2}.png')
  plt.savefig(f'time_series_{prod1}_{prod2}.png')
  plt.savefig(f'time_series_{prod1}_{prod2}.png')
  plt.savefig(f'time_series_{prod1}_{prod2}.png')
  plt.savefig(f'time_series_{prod1}_{prod2}.png')
  plt.savefig(f'time_series_{prod1}_{prod2}.png')
  plt.savefig(f'time_series_{prod1}_{prod2}.png')
  plt.savefig(f'time_series_{prod1}_{prod2}.png')
  plt.savefig(f'time_series_{prod1}_{prod2}


异常数据点详细信息：
     product_type1 product_type2                time  time_diff  price1  \
17         Type1_1       Type2_1 2024-02-19 20:00:00      260.0  187.90   
28         Type1_1       Type2_1 2024-03-16 20:00:00      104.0  344.21   
37         Type1_1       Type2_1 2024-04-07 12:00:00      104.0  204.28   
48         Type1_1       Type2_1 2024-05-07 20:00:00      208.0  672.77   
109        Type1_1       Type2_1 2024-09-19 04:00:00      104.0  430.94   
...            ...           ...                 ...        ...     ...   
2398       Type1_8       Type2_8 2024-11-06 00:00:00       80.0  955.74   
2701       Type1_9       Type2_9 2024-01-16 00:00:00       26.0  279.18   
2741       Type1_9       Type2_9 2024-01-19 10:00:00        4.0  173.30   
2797       Type1_9       Type2_9 2024-01-24 04:00:00        4.0  837.55   
2838       Type1_9       Type2_9 2024-01-27 16:00:00        4.0  679.09   

      price2  
17    357.90  
28    422.93  
37    438.40  
48    191.46  
109   169.34