In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import chardet

# 检测文件编码
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
        encoding = result['encoding']
        print(f"检测到的编码: {encoding}")
        return encoding

# 尝试读取CSV文件并处理异常
def read_csv(file_path, encoding):
    try:
        df = pd.read_csv(file_path, encoding=encoding)  # 从CSV文件读取数据
        print("文件读取成功")
        return df
    except FileNotFoundError as e:
        print(f"文件未找到: {e}")
        exit()
    except pd.errors.EmptyDataError as e:
        print(f"发现空数据: {e}")
        exit()
    except UnicodeDecodeError as e:
        print(f"编码错误: {e}。尝试使用不同的编码重新读取文件。")
        exit()

# 确保event_time列是datetime类型
def ensure_datetime(df, column_name):
    try:
        df[column_name] = pd.to_datetime(df[column_name])
    except ValueError as e:
        print(f"日期时间转换错误: {e}")
        exit()

# 分析国家和地区分布
def country_distribution(df):
    country_distribution = df['country'].value_counts()
    print("国家分布:\n", country_distribution)

# 分析城市级别分布
def city_distribution(df):
    city_distribution = df['location'].value_counts()
    print("\n城市分布:\n", city_distribution)

# 分析提交频率
def submission_frequency(df):
    submission_frequency = df['user_id'].value_counts()
    print("\n提交频率:\n", submission_frequency)

# 分析活跃时间段
def active_hours(df):
    ensure_datetime(df, 'event_time')  # 确保event_time列是datetime类型
    df['hour'] = df['event_time'].dt.hour
    active_hours = df['hour'].value_counts().sort_index()
    print("\n活跃时间段:\n", active_hours)

# 主函数
def main():
    file_path = r'D:\DATA\users_combined_info_500_part_7.csv'  # 修改为你的文件路径
    encoding = detect_encoding(file_path)

    df = read_csv(file_path, encoding)

    country_distribution(df)
    city_distribution(df)
    submission_frequency(df)
    active_hours(df)

if __name__ == "__main__":
    main()

检测到的编码: utf-8
文件读取成功
国家分布:
 country
United States           305788
Germany                 182659
China                    73011
United Kingdom           71606
France                   59570
Canada                   58600
Netherlands              52367
Czechia                  48122
Japan                    46553
Switzerland              38093
Australia                35746
Italy                    30671
Poland                   20002
New Zealand              18444
Spain                    14939
Austria                  11758
Sweden                    9851
Finland                   8815
Denmark                   7412
Bulgaria                  7357
Hungary                   7080
Belgium                   6628
Norway                    6004
India                     5689
United Arab Emirates      5264
Singapore                 5205
Brazil                    5022
Hong Kong                 4767
Malaysia                  4538
Pakistan                  4462
Indonesia                 4248
Rus

In [2]:
import pandas as pd
import chardet
import os
from pathlib import Path

# 检测文件编码
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
        encoding = result['encoding']
        print(f"检测到的编码: {encoding}")
        return encoding

# 尝试读取CSV文件并处理异常
def read_csv(file_path, encoding):
    try:
        df = pd.read_csv(file_path, encoding=encoding)  # 从CSV文件读取数据
        print("文件读取成功")
        return df
    except FileNotFoundError as e:
        print(f"文件未找到: {e}")
        exit()
    except pd.errors.EmptyDataError as e:
        print(f"发现空数据: {e}")
        exit()
    except UnicodeDecodeError as e:
        print(f"编码错误: {e}。尝试使用不同的编码重新读取文件。")
        exit()

# 保存结果到CSV文件
def save_to_csv(data, output_path):
    data.to_csv(output_path, header=['Count'], index_label=data.index.name)
    print(f"结果已保存到CSV文件：{output_path}")

# 国家和地区分布
def country_distribution(df, output_path):
    country_distribution = df['country'].value_counts()
    country_distribution.index.name = 'Country'
    save_to_csv(country_distribution, output_path)

# 城市级别分布
def city_distribution(df, output_path):
    city_distribution = df['location'].value_counts()
    city_distribution.index.name = 'City'
    save_to_csv(city_distribution, output_path)

# 提交频率
def submission_frequency(df, output_path):
    user_submission_frequency = df['user_id'].value_counts()
    user_submission_frequency.index.name = 'User ID'
    save_to_csv(user_submission_frequency, output_path)

# 活跃时间段分析
def active_hours(df, output_path):
    df['event_time'] = pd.to_datetime(df['event_time'])
    df['hour'] = df['event_time'].dt.hour
    active_hours = df['hour'].value_counts().sort_index()
    active_hours.index.name = 'Hour'
    save_to_csv(active_hours, output_path)

# 用户活跃度随时间变化
def user_activity_over_time(df, output_path):
    df['event_time'] = pd.to_datetime(df['event_time'])
    user_activity_over_time = df.resample('M', on='event_time')['user_id'].count()
    user_activity_over_time.index.name = 'Month'
    save_to_csv(user_activity_over_time, output_path)

# 主函数
def main():
    # 文件路径
    file_path = r'D:\DATA\users_combined_info_500_part_7.csv'  # 修改为实际路径
    output_dir = Path(r'D:\DATA')  # 输出文件的目录

    # 检查文件是否存在
    if not os.path.exists(file_path):
        print(f"错误：文件 '{file_path}' 不存在，请检查路径是否正确。")
        exit()

    # 检查输出目录是否存在，如果不存在则创建
    output_dir.mkdir(parents=True, exist_ok=True)

    # 检测编码并读取文件
    encoding = detect_encoding(file_path)
    df = read_csv(file_path, encoding)

    # 进行分析并保存结果
    country_distribution(df, output_dir / 'country_distribution.csv')
    city_distribution(df, output_dir / 'city_distribution.csv')
    submission_frequency(df, output_dir / 'submission_frequency.csv')
    active_hours(df, output_dir / 'active_hours.csv')
    user_activity_over_time(df, output_dir / 'user_activity_over_time.csv')

if __name__ == "__main__":
    main()

检测到的编码: utf-8
文件读取成功
结果已保存到CSV文件：D:\DATA\country_distribution.csv
结果已保存到CSV文件：D:\DATA\city_distribution.csv
结果已保存到CSV文件：D:\DATA\submission_frequency.csv
结果已保存到CSV文件：D:\DATA\active_hours.csv


  user_activity_over_time = df.resample('M', on='event_time')['user_id'].count()


结果已保存到CSV文件：D:\DATA\user_activity_over_time.csv


In [3]:
import pandas as pd
import chardet
import os
from pathlib import Path

# 检测文件编码
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
        encoding = result['encoding']
        print(f"检测到的编码: {encoding}")
        return encoding

# 尝试读取CSV文件并处理异常
def read_csv(file_path, encoding):
    try:
        df = pd.read_csv(file_path, encoding=encoding)  # 从CSV文件读取数据
        print("文件读取成功")
        return df
    except FileNotFoundError as e:
        print(f"文件未找到: {e}")
        exit()
    except pd.errors.EmptyDataError as e:
        print(f"发现空数据: {e}")
        exit()
    except UnicodeDecodeError as e:
        print(f"编码错误: {e}。尝试使用不同的编码重新读取文件。")
        exit()

# 统计每个国家的 total_influence 的和的排名
def rank_total_influence_by_country(df, output_path):
    # 检查 total_influence 列是否存在
    if 'total_influence' not in df.columns:
        print("错误：数据中缺少 'total_influence' 列。")
        exit()

    # 按用户ID分组并计算每个用户的 total_influence 总和
    df['total_influence'] = df.groupby('user_id')['total_influence'].transform('sum')
    # 按国家分组并计算 total_influence 的总和
    total_influence_by_country = df.groupby('country')['total_influence'].sum().reset_index()
    # 为 total_influence 的总和排名
    total_influence_by_country['rank'] = total_influence_by_country['total_influence'].rank(ascending=False, method='min')
    # 保存排名结果到 CSV 文件
    total_influence_by_country.to_csv(output_path, index=False)
    print(f"每个国家的 total_influence 的和的排名已保存到CSV文件：{output_path}")

# 主函数
def main():
    # 文件路径
    file_path = r'D:\DATA\users_combined_info_500_part_7.csv'  # 修改为实际路径
    output_dir = Path(r'D:\DATA')  # 输出文件的目录

    # 检查文件是否存在
    if not os.path.exists(file_path):
        print(f"错误：文件 '{file_path}' 不存在，请检查路径是否正确。")
        exit()

    # 检查输出目录是否存在，如果不存在则创建
    output_dir.mkdir(parents=True, exist_ok=True)

    # 检测编码并读取文件
    encoding = detect_encoding(file_path)
    df = read_csv(file_path, encoding)

    # 进行每个国家的 total_influence 的和的排名并保存结果
    rank_total_influence_by_country(df, output_dir / 'total_influence_by_country_ranking.csv')

if __name__ == "__main__":
    main()

检测到的编码: utf-8
文件读取成功
每个国家的 total_influence 的和的排名已保存到CSV文件：D:\DATA\total_influence_by_country_ranking.csv
