In [1]:
import json
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import itertools
from collections import Counter
from matplotlib_venn import venn3
import matplotlib.font_manager as fm
import seaborn as sns

In [2]:
import os
# Run from the project root so the relative ./data paths used below resolve.
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this directory exists.
path = "D:/Coding/Caleb"
os.chdir(path)
os.getcwd()

'D:\\Coding\\Caleb'

In [3]:
# Read the three JSON exports and stack them row-wise into one frame.
source_files = ('./data/time.json', './data/general.json', './data/popularity.json')
df_time, df_general, df_popularity = map(pd.read_json, source_files)
df_combined = pd.concat([df_time, df_general, df_popularity])

In [4]:
# One row per note: when a note_id occurs more than once in the combined
# exports, only its first occurrence is kept.
df = df_combined.drop_duplicates(subset='note_id')

In [5]:
# Work on an independent copy, then turn the epoch-millisecond columns into
# timezone-naive datetimes (parsed as UTC, tz info stripped afterwards).
df = df.copy()
for ts_col in ("time", "last_modify_ts"):
    parsed_utc = pd.to_datetime(df[ts_col], unit="ms", origin="unix", utc=True)
    df[ts_col] = parsed_utc.dt.tz_localize(None)

In [6]:
# Columns that are not used in the rest of the cleaning pipeline.
unused_columns = ["video_url", "avatar", "image_list", "last_modify_ts", "xsec_token"]
df.drop(columns=unused_columns, inplace=True)

In [7]:
# Break the creation timestamp into date, time-of-day and monthly period, and
# measure the absolute gap between creation and the last update.
created = df["time"]
df["creation_date"] = created.dt.date
df["creation_time"] = created.dt.time
df["creation_year_month"] = created.dt.to_period('M')
df["time_diff"] = (created - df["last_update_time"]).abs()

In [8]:
# Flag posts whose last update lies more than 10 minutes from their creation time.
edit_threshold = pd.Timedelta(minutes=10)
df["content_edit"] = df["time_diff"].gt(edit_threshold)

In [9]:
# Split the comma-separated tag string into a list of tags per post
# (a comma followed by any amount of whitespace is the separator).
# The pattern is a raw string: "\s" inside a plain literal is an invalid
# escape sequence, which newer Python versions warn about.
df["tags"] = df["tag_list"].str.split(r",\s*")

In [10]:
def convert_to_numeric(value):
    """Expand an abbreviated count string into a plain integer string.

    Handled forms (matched from the start of the string):
      * ``"<number>万"``  -> number * 10000   (e.g. ``"1.2万"`` -> ``"12000"``)
      * ``"<number>千…"`` -> number * 1000    (e.g. ``"3千+"`` -> ``"3000"``)
      * ``"<digits>+"``   -> digits           (e.g. ``"10+"`` -> ``"10"``)

    Parameters
    ----------
    value : Any
        Raw cell value. Non-string values are returned untouched.

    Returns
    -------
    str or the original value
        The expanded count as a string when one of the forms matches;
        otherwise ``value`` unchanged.

    Raises
    ------
    ValueError
        If the numeric part matches the pattern but is not a valid float
        (e.g. ``"1.2.3万"``).
    """
    if not isinstance(value, str):
        return value

    scaled_units = (
        (r"([0-9\.]+)万", 10000),
        (r"([0-9\.]+)千.*", 1000),
    )
    for pattern, multiplier in scaled_units:
        match = re.match(pattern, value)
        if match:
            # round() rather than int(): the float product can land just below
            # the exact integer (1.13 * 10000 == 11299.999999999998), and
            # truncation would then lose one.
            return str(round(float(match.group(1)) * multiplier))

    match = re.match(r"([0-9]+)\+", value)
    if match:
        return str(int(match.group(1)))
    return value

# Run convert_to_numeric over each raw count column and store the result in a
# matching '<name>_parsed' column.
parsed_columns = {
    'liked_count': 'liked_count_parsed',
    'collected_count': 'collected_count_parsed',
    'comment_count': 'comment_count_parsed',
    'share_count': 'share_count_parsed',
}
for raw_col, parsed_col in parsed_columns.items():
    df[parsed_col] = df[raw_col].apply(convert_to_numeric)

In [11]:
# Cast the parsed count columns to integers.
parsed_count_cols = ['liked_count_parsed', 'collected_count_parsed',
                     'comment_count_parsed', 'share_count_parsed']
df[parsed_count_cols] = df[parsed_count_cols].astype(int)

In [12]:
# Show the column labels of the frame at this point in the pipeline.
df.columns

Index(['note_id', 'type', 'title', 'desc', 'time', 'last_update_time',
       'user_id', 'nickname', 'liked_count', 'collected_count',
       'comment_count', 'share_count', 'ip_location', 'tag_list', 'note_url',
       'source_keyword', 'creation_date', 'creation_time',
       'creation_year_month', 'time_diff', 'content_edit', 'tags',
       'liked_count_parsed', 'collected_count_parsed', 'comment_count_parsed',
       'share_count_parsed'],
      dtype='object')

In [13]:
# The raw count columns are superseded by their *_parsed counterparts.
raw_count_columns = ['liked_count', 'collected_count', 'comment_count', 'share_count']
df.drop(columns=raw_count_columns, inplace=True)

In [14]:
# Keep only posts whose creation month is later than 2025-01.
# NOTE(review): the comparison is strict, so posts from January 2025 itself
# are dropped; confirm that is intended.
# .copy() makes the filtered frame independent of the unfiltered one, because
# later cells assign columns on it.
df = df.loc[df.creation_year_month > '2025-01'].copy()

In [15]:
def _as_tag_list(cell):
    """Return ``cell`` if it is already a list, otherwise a new empty list."""
    if isinstance(cell, list):
        return cell
    return []

# Guarantee that every 'tags' entry is a list (non-list entries become []).
df.loc[:, 'tags'] = df['tags'].apply(_as_tag_list)

In [16]:
# Locations kept as their own category; every other value is pooled as "Others".
ip_groups = [
    '上海', '中国台湾', '中国澳门', '中国香港', '云南', '内蒙古', '加拿大', '北京', '吉林', '四川',
    '天津', '安徽', '山东', '山西', '广东', '广西', '意大利', '挪威', '新加坡', '新疆',
    '日本', '江苏', '江西', '河北', '河南', '浙江', '海南', '湖北', '湖南', '澳大利亚',
    '甘肃', '福建', '美国', '英国', '贵州', '辽宁', '重庆', '陕西', '马来西亚', '黑龙江',
]

ip_location = df['ip_location']
df.loc[:, 'ip_location_grouped'] = ip_location.where(ip_location.isin(ip_groups), "Others")

In [17]:
exclude_tags = ['沈星回','黎深','秦彻','祁煜']
caleb_tags = ['夏以昼','恋与深空夏以昼','夏以昼回航','夏以昼x你','Caleb','爱在夏以昼','夏以昼同人','夏以昼回来','夏以昼日常']

def classify_post(tags):
    """Label a post as "Caleb Post" or "General Post" from its tag list.

    A post is a "Caleb Post" only when it carries at least one tag from
    ``caleb_tags`` and none from ``exclude_tags``; everything else is a
    "General Post". Tags are compared by exact match.
    """
    present = set(tags)
    has_caleb_tag = not present.isdisjoint(caleb_tags)
    has_excluded_tag = not present.isdisjoint(exclude_tags)
    if has_caleb_tag and not has_excluded_tag:
        return "Caleb Post"
    return "General Post"

# Attach the Caleb/General label computed from each row's tag list.
df = df.assign(post_type=lambda frame: frame['tags'].apply(classify_post))

In [18]:
# Comments per like. Posts without any likes get NaN, since the ratio is
# undefined there. Computed column-wise rather than with a row-wise apply:
# this is faster and the column is always float, never object.
likes = df['liked_count_parsed']
df['comment_like_ratio'] = df['comment_count_parsed'].div(likes).where(likes > 0)

# Only the posts for which the ratio is defined.
df_comments = df.dropna(subset=['comment_like_ratio'])

In [19]:
# (rows, columns) of the frame after cleaning.
df.shape

(668, 25)

In [20]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,note_id,type,title,desc,time,last_update_time,user_id,nickname,ip_location,tag_list,...,time_diff,content_edit,tags,liked_count_parsed,collected_count_parsed,comment_count_parsed,share_count_parsed,ip_location_grouped,post_type,comment_like_ratio
0,67c27a66000000000603d771,video,是谁的卡册来了,给大家准备的福福来啦\n﻿#恋与深空[话题]#﻿ ﻿#拆卡[话题]#﻿ ﻿#二创[话题]#﻿...,2025-03-01 03:09:26,2025-03-01 03:09:27,666c6100000000000d026ec1,芝士塔爱拆卡,河北,"恋与深空,拆卡,二创,沈星回,黎深,祁煜,秦彻,夏以昼,卡册",...,0 days 00:00:01,False,"[恋与深空, 拆卡, 二创, 沈星回, 黎深, 祁煜, 秦彻, 夏以昼, 卡册]",0,0,0,0,河北,General Post,
1,67c27a45000000002903da90,normal,哇塞，这就是你们温良哥推吗，说不过就开始举报了\n#沈星回[话题]# #夏以昼的女人是小偷...,哇塞，这就是你们温良哥推吗，说不过就开始举报了\n#沈星回[话题]# #夏以昼的女人是小偷...,2025-03-01 03:08:53,2025-03-01 03:08:53,60587996000000000101f0c5,沈珏,江苏,"沈星回,夏以昼的女人是小偷,夏以昼的女人不读书,夏以昼的女人吃拼好饭中毒",...,0 days 00:00:00,False,"[沈星回, 夏以昼的女人是小偷, 夏以昼的女人不读书, 夏以昼的女人吃拼好饭中毒]",0,0,0,0,江苏,General Post,
2,67c27a2c0000000006028cc9,normal,就这么和夏以昼叱咤风云😼,过了两年，妹也偷偷去参加了比武大会，各家很快就认出来妹用的招式是夏以昼的影子\n妹和哥一样默...,2025-03-01 03:08:28,2025-03-01 03:08:29,632510f2000000002303c666,下一周有下雨天,广西,"恋与深空夏以昼,夏以昼,恋与深空",...,0 days 00:00:01,False,"[恋与深空夏以昼, 夏以昼, 恋与深空]",0,0,0,0,广西,Caleb Post,
3,67c2783b000000000602822a,normal,模仿夏以昼签名被抓,#恋与深空夏以昼[话题]# #夏以昼[话题]# #talkmaker[话题]#,2025-03-01 03:00:11,2025-03-01 03:00:12,66067a58000000000d025499,商秋宴,湖北,"恋与深空夏以昼,夏以昼,talkmaker",...,0 days 00:00:01,False,"[恋与深空夏以昼, 夏以昼, talkmaker]",12,1,4,1,湖北,Caleb Post,0.333333
4,67c275e8000000002602dfc2,normal,DeepSeek造福人类[色色R]终于说出来了，夏以昼你不是胆小鬼#恋与深空夏以昼[话题]#,DeepSeek造福人类[色色R]终于说出来了，夏以昼你不是胆小鬼#恋与深空夏以昼[话题]#,2025-03-01 02:50:16,2025-03-01 02:58:17,5fb658300000000001005cb4,圈圈,北京,恋与深空夏以昼,...,0 days 00:08:01,False,[恋与深空夏以昼],0,0,0,0,北京,Caleb Post,


In [30]:
# Write the cleaned frame to CSV without the index column.
# "utf-8-sig" prefixes the file with a UTF-8 byte-order mark
# (presumably so spreadsheet tools detect the encoding; confirm).
df.to_csv("./data/cleaned_raw.csv", index=False, encoding="utf-8-sig")