In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')

In [2]:
path = "../data_final_project/KuaiRec 2.0/data/"
small_matrix = pd.read_csv(path + "small_matrix.csv")
big_matrix = pd.read_csv(path + "big_matrix.csv")
item_categories = pd.read_csv(path + "item_categories.csv")
item_features = pd.read_csv(path + "item_daily_features.csv")
social_network = pd.read_csv(path + "social_network.csv")
user_features = pd.read_csv(path + "user_features.csv")

In [3]:
try:
    # Method 1: Using Python engine instead of C engine
    captions = pd.read_csv(path + "kuairec_caption_category.csv", engine='python')
    print("Captions loaded successfully with Python engine")
except Exception as e:
    print(f"Method 1 failed: {e}")
    try:
        # Method 2: Increase buffer size and skip bad lines
        captions = pd.read_csv(path + "kuairec_caption_category.csv", 
                              engine='python', 
                              on_bad_lines='skip',
                              encoding='utf-8')
        print("Captions loaded successfully with skipping bad lines")
    except Exception as e:
        print(f"Method 2 failed: {e}")
        print("Unable to load captions file. Proceeding without it.")
        captions = None

Captions loaded successfully with Python engine


In [4]:
sns.set(font_scale=1.2)
pd.set_option('display.max_columns', None)

In [5]:
print(f"Small matrix shape: {small_matrix.shape}")
print(f"Big matrix shape: {big_matrix.shape}")
print(f"Item categories shape: {item_categories.shape}")
print(f"Item features shape: {item_features.shape}")
print(f"Social network shape: {social_network.shape}")
print(f"User features shape: {user_features.shape}")
print(f"Captions shape: {captions.shape}")


Small matrix shape: (4676570, 8)
Big matrix shape: (12530806, 8)
Item categories shape: (10728, 2)
Item features shape: (343341, 58)
Social network shape: (472, 2)
User features shape: (7176, 31)
Captions shape: (10732, 10)


In [6]:
def display_dataset_info(df, name):
    print(f"\n=== {name} Dataset ===")
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print("\nData Types:")
    print(df.dtypes)
    print("\nSample Data:")
    display(df.head())
    print("\nBasic Statistics:")
    display(df.describe())
    
    # Check for missing values
    missing = df.isnull().sum()
    if missing.sum() > 0:
        print("\nMissing Values:")
        display(missing[missing > 0])
    else:
        print("\nNo missing values found.")
        
    # Check for duplicates
    duplicates = df.duplicated().sum()
    print(f"\nDuplicate rows: {duplicates}")
    
    print("\n" + "="*50)

In [7]:
display_dataset_info(small_matrix, "Small Matrix")


=== Small Matrix Dataset ===
Shape: (4676570, 8)
Columns: ['user_id', 'video_id', 'play_duration', 'video_duration', 'time', 'date', 'timestamp', 'watch_ratio']

Data Types:
user_id             int64
video_id            int64
play_duration       int64
video_duration      int64
time               object
date              float64
timestamp         float64
watch_ratio       float64
dtype: object

Sample Data:


Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio
0,14,148,4381,6067,2020-07-05 05:27:48.378,20200705.0,1593898000.0,0.722103
1,14,183,11635,6100,2020-07-05 05:28:00.057,20200705.0,1593898000.0,1.907377
2,14,3649,22422,10867,2020-07-05 05:29:09.479,20200705.0,1593898000.0,2.063311
3,14,5262,4479,7908,2020-07-05 05:30:43.285,20200705.0,1593898000.0,0.566388
4,14,8234,4602,11000,2020-07-05 05:35:43.459,20200705.0,1593899000.0,0.418364



Basic Statistics:


Unnamed: 0,user_id,video_id,play_duration,video_duration,date,timestamp,watch_ratio
count,4676570.0,4676570.0,4676570.0,4676570.0,4494578.0,4494578.0,4676570.0
mean,3631.649,4974.939,8612.637,14486.45,20200770.0,1596241000.0,0.9070695
std,2043.873,3064.622,12236.61,20467.11,48.9518,1254444.0,1.362324
min,14.0,103.0,0.0,3067.0,20200700.0,1593801000.0,0.0
25%,1834.0,2369.0,5811.0,7523.0,20200720.0,1595210000.0,0.4675769
50%,3687.0,4692.0,7549.0,9600.0,20200800.0,1596224000.0,0.7691666
75%,5421.0,7474.0,9880.0,11934.0,20200810.0,1597121000.0,1.12059
max,7162.0,10595.0,7988155.0,315072.0,20200900.0,1599321000.0,571.5214



Missing Values:


time         181992
date         181992
timestamp    181992
dtype: int64


Duplicate rows: 0



In [8]:
display_dataset_info(big_matrix, "Big Matrix")


=== Big Matrix Dataset ===
Shape: (12530806, 8)
Columns: ['user_id', 'video_id', 'play_duration', 'video_duration', 'time', 'date', 'timestamp', 'watch_ratio']

Data Types:
user_id             int64
video_id            int64
play_duration       int64
video_duration      int64
time               object
date                int64
timestamp         float64
watch_ratio       float64
dtype: object

Sample Data:


Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio
0,0,3649,13838,10867,2020-07-05 00:08:23.438,20200705,1593879000.0,1.273397
1,0,9598,13665,10984,2020-07-05 00:13:41.297,20200705,1593879000.0,1.244082
2,0,5262,851,7908,2020-07-05 00:16:06.687,20200705,1593879000.0,0.107613
3,0,1963,862,9590,2020-07-05 00:20:26.792,20200705,1593880000.0,0.089885
4,0,8234,858,11000,2020-07-05 00:43:05.128,20200705,1593881000.0,0.078



Basic Statistics:


Unnamed: 0,user_id,video_id,play_duration,video_duration,date,timestamp,watch_ratio
count,12530810.0,12530810.0,12530810.0,12530810.0,12530810.0,12530810.0,12530810.0
mean,3574.377,5057.748,9027.027,14621.57,20200800.0,1596799000.0,0.9445059
std,2067.008,3089.868,15473.43,19834.74,50.80192,1514698.0,1.674601
min,0.0,0.0,0.0,140.0,20200700.0,1592872000.0,0.0
25%,1788.0,2387.0,4218.0,7434.0,20200800.0,1596339000.0,0.3148246
50%,3578.0,4822.0,7277.0,9636.0,20200810.0,1596669000.0,0.723471
75%,5343.75,7600.0,10350.0,12179.0,20200830.0,1598502000.0,1.177644
max,7175.0,10727.0,999639.0,315072.0,20200900.0,1599694000.0,573.4571



No missing values found.

Duplicate rows: 965819



In [9]:
display_dataset_info(item_categories, "Item Categories")


=== Item Categories Dataset ===
Shape: (10728, 2)
Columns: ['video_id', 'feat']

Data Types:
video_id     int64
feat        object
dtype: object

Sample Data:


Unnamed: 0,video_id,feat
0,0,[8]
1,1,"[27, 9]"
2,2,[9]
3,3,[26]
4,4,[5]



Basic Statistics:


Unnamed: 0,video_id
count,10728.0
mean,5363.5
std,3097.051178
min,0.0
25%,2681.75
50%,5363.5
75%,8045.25
max,10727.0



No missing values found.

Duplicate rows: 0



In [10]:
display_dataset_info(item_features, "Item Features")


=== Item Features Dataset ===
Shape: (343341, 58)
Columns: ['video_id', 'date', 'author_id', 'video_type', 'upload_dt', 'upload_type', 'visible_status', 'video_duration', 'video_width', 'video_height', 'music_id', 'video_tag_id', 'video_tag_name', 'show_cnt', 'show_user_num', 'play_cnt', 'play_user_num', 'play_duration', 'complete_play_cnt', 'complete_play_user_num', 'valid_play_cnt', 'valid_play_user_num', 'long_time_play_cnt', 'long_time_play_user_num', 'short_time_play_cnt', 'short_time_play_user_num', 'play_progress', 'comment_stay_duration', 'like_cnt', 'like_user_num', 'click_like_cnt', 'double_click_cnt', 'cancel_like_cnt', 'cancel_like_user_num', 'comment_cnt', 'comment_user_num', 'direct_comment_cnt', 'reply_comment_cnt', 'delete_comment_cnt', 'delete_comment_user_num', 'comment_like_cnt', 'comment_like_user_num', 'follow_cnt', 'follow_user_num', 'cancel_follow_cnt', 'cancel_follow_user_num', 'share_cnt', 'share_user_num', 'download_cnt', 'download_user_num', 'report_cnt', 'r

Unnamed: 0,video_id,date,author_id,video_type,upload_dt,upload_type,visible_status,video_duration,video_width,video_height,music_id,video_tag_id,video_tag_name,show_cnt,show_user_num,play_cnt,play_user_num,play_duration,complete_play_cnt,complete_play_user_num,valid_play_cnt,valid_play_user_num,long_time_play_cnt,long_time_play_user_num,short_time_play_cnt,short_time_play_user_num,play_progress,comment_stay_duration,like_cnt,like_user_num,click_like_cnt,double_click_cnt,cancel_like_cnt,cancel_like_user_num,comment_cnt,comment_user_num,direct_comment_cnt,reply_comment_cnt,delete_comment_cnt,delete_comment_user_num,comment_like_cnt,comment_like_user_num,follow_cnt,follow_user_num,cancel_follow_cnt,cancel_follow_user_num,share_cnt,share_user_num,download_cnt,download_user_num,report_cnt,report_user_num,reduce_similar_cnt,reduce_similar_user_num,collect_cnt,collect_user_num,cancel_collect_cnt,cancel_collect_user_num
0,0,20200705,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,3350323409,841,建筑,14665,11372,10141,7485,88729488,5657,4834,5503,4775,5503,4775,1939,1481,0.79986,6629173,573,569,315,257,87,85,11,11,8,3,0,0,112,61,284,284,0,0,2,2,8,8,0,0,3,3,,,,
1,0,20200706,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,3350323409,841,建筑,10883,8513,7321,5490,64264607,4162,3522,4039,3468,4039,3468,1340,1040,0.805253,3997498,302,301,159,142,47,47,7,7,6,1,0,0,60,32,201,200,0,0,1,1,2,2,0,0,5,5,,,,
2,0,20200707,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,3350323409,841,建筑,7842,6281,4757,3724,41338741,2734,2403,2640,2376,2640,2376,866,683,0.808821,3314323,205,205,121,84,52,50,4,3,3,1,0,0,59,26,131,131,0,0,1,1,2,2,0,0,0,0,,,,
3,0,20200708,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,3350323409,841,建筑,8916,7229,5172,3961,45281254,2950,2525,2865,2498,2865,2498,977,765,0.80168,4235579,297,293,178,119,60,59,4,4,2,2,0,0,91,46,179,179,0,0,2,2,3,3,0,0,3,3,,,,
4,0,20200709,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,3350323409,841,建筑,8502,6658,5392,3946,46952744,3058,2566,2946,2533,2945,2532,1046,752,0.805359,3862095,307,305,166,141,57,56,5,2,0,5,0,0,76,47,186,186,0,0,0,0,2,2,2,1,1,1,,,,



Basic Statistics:


Unnamed: 0,video_id,date,author_id,video_duration,video_width,video_height,music_id,video_tag_id,show_cnt,show_user_num,play_cnt,play_user_num,play_duration,complete_play_cnt,complete_play_user_num,valid_play_cnt,valid_play_user_num,long_time_play_cnt,long_time_play_user_num,short_time_play_cnt,short_time_play_user_num,play_progress,comment_stay_duration,like_cnt,like_user_num,click_like_cnt,double_click_cnt,cancel_like_cnt,cancel_like_user_num,comment_cnt,comment_user_num,direct_comment_cnt,reply_comment_cnt,delete_comment_cnt,delete_comment_user_num,comment_like_cnt,comment_like_user_num,follow_cnt,follow_user_num,cancel_follow_cnt,cancel_follow_user_num,share_cnt,share_user_num,download_cnt,download_user_num,report_cnt,report_user_num,reduce_similar_cnt,reduce_similar_user_num,collect_cnt,collect_user_num,cancel_collect_cnt,cancel_collect_user_num
count,343341.0,343341.0,343341.0,332743.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,343341.0,273658.0,273658.0,273658.0,273658.0
mean,5077.370157,20200800.0,4220.959932,12508.367091,713.117833,1218.797452,3041715000.0,1363.171585,94245.8,88599.75,94187.81,84835.03,1804975000.0,50753.86,48245.5,61956.22,59198.86,52923.14,50794.26,18751.47,16885.51,0.379127,98536390.0,2781.109,2757.48,1371.501,1406.157063,178.559278,168.600135,130.480764,114.464416,96.688115,33.792649,4.858421,4.083762,749.788988,351.127486,302.643914,302.488034,0.023158,0.02261,80.38086,74.78595,44.557484,39.704049,0.04278,0.037549,50.422289,48.942605,20.014686,19.837754,1.320338,1.297514
std,3113.616949,56.54698,2390.317222,13904.816578,122.673938,165.715482,1543376000.0,1090.364659,576170.0,551245.8,593329.8,541359.1,16058570000.0,356349.2,344270.9,417720.7,404990.3,367290.0,357991.3,115296.4,105420.9,0.275441,1199933000.0,18696.03,18508.41,9628.631,9454.005871,1557.676774,1449.526371,1565.610155,1361.543208,1225.041919,506.86229,134.200906,87.218102,7628.812713,3379.499898,2822.293469,2820.667128,0.27733,0.267823,1284.173127,1186.872229,535.57591,471.296386,0.822401,0.695511,281.581431,273.243597,356.678666,353.742019,7.581929,7.364914
min,0.0,20200700.0,0.0,40.0,270.0,320.0,0.0,-124.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2318.0,20200730.0,2126.0,6901.0,720.0,1280.0,2354487000.0,144.0,55.0,47.0,14.0,12.0,101791.0,4.0,4.0,5.0,5.0,4.0,4.0,6.0,5.0,0.117749,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5235.0,20200810.0,4402.0,9466.0,720.0,1280.0,3921367000.0,1099.0,1328.0,1126.0,628.0,526.0,5923139.0,208.0,186.0,280.0,253.0,209.0,190.0,182.0,156.0,0.353828,88859.0,9.0,9.0,4.0,5.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7725.0,20200830.0,6293.0,12241.0,720.0,1280.0,4098158000.0,2491.0,24190.0,21616.0,20671.0,18014.0,248644400.0,8954.0,8099.0,11373.0,10402.0,9127.0,8315.0,4590.0,4010.0,0.635734,3921212.0,421.0,417.0,192.0,211.0,39.0,37.0,9.0,8.0,6.0,1.0,0.0,0.0,20.0,11.0,39.0,39.0,0.0,0.0,3.0,3.0,3.0,3.0,0.0,0.0,10.0,10.0,1.0,1.0,0.0,0.0
max,10727.0,20200900.0,8369.0,315040.0,3024.0,3024.0,4431044000.0,2891.0,42181540.0,38935330.0,41167950.0,37222770.0,1985890000000.0,25234430.0,24299570.0,29185180.0,28227150.0,26497990.0,25808300.0,10428610.0,9142707.0,1.0,208471300000.0,2673037.0,2651624.0,1702560.0,965569.0,233582.0,209037.0,182959.0,171602.0,174878.0,99624.0,51898.0,18020.0,683828.0,307221.0,493711.0,493128.0,30.0,29.0,299286.0,279187.0,134099.0,133802.0,198.0,176.0,34454.0,33720.0,116971.0,115859.0,2056.0,1971.0



Missing Values:


video_duration             10598
video_tag_name             32434
collect_cnt                69683
collect_user_num           69683
cancel_collect_cnt         69683
cancel_collect_user_num    69683
dtype: int64


Duplicate rows: 0



In [11]:
display_dataset_info(social_network, "Social Network")


=== Social Network Dataset ===
Shape: (472, 2)
Columns: ['user_id', 'friend_list']

Data Types:
user_id         int64
friend_list    object
dtype: object

Sample Data:


Unnamed: 0,user_id,friend_list
0,3371,[2975]
1,24,[2665]
2,4402,[38]
3,4295,[4694]
4,7087,[7117]



Basic Statistics:


Unnamed: 0,user_id
count,472.0
mean,3403.21822
std,2079.018308
min,18.0
25%,1648.0
50%,3268.0
75%,5233.5
max,7174.0



No missing values found.

Duplicate rows: 0



In [12]:
display_dataset_info(user_features, "User Features")


=== User Features Dataset ===
Shape: (7176, 31)
Columns: ['user_id', 'user_active_degree', 'is_lowactive_period', 'is_live_streamer', 'is_video_author', 'follow_user_num', 'follow_user_num_range', 'fans_user_num', 'fans_user_num_range', 'friend_user_num', 'friend_user_num_range', 'register_days', 'register_days_range', 'onehot_feat0', 'onehot_feat1', 'onehot_feat2', 'onehot_feat3', 'onehot_feat4', 'onehot_feat5', 'onehot_feat6', 'onehot_feat7', 'onehot_feat8', 'onehot_feat9', 'onehot_feat10', 'onehot_feat11', 'onehot_feat12', 'onehot_feat13', 'onehot_feat14', 'onehot_feat15', 'onehot_feat16', 'onehot_feat17']

Data Types:
user_id                    int64
user_active_degree        object
is_lowactive_period        int64
is_live_streamer           int64
is_video_author            int64
follow_user_num            int64
follow_user_num_range     object
fans_user_num              int64
fans_user_num_range       object
friend_user_num            int64
friend_user_num_range     object
regist

Unnamed: 0,user_id,user_active_degree,is_lowactive_period,is_live_streamer,is_video_author,follow_user_num,follow_user_num_range,fans_user_num,fans_user_num_range,friend_user_num,friend_user_num_range,register_days,register_days_range,onehot_feat0,onehot_feat1,onehot_feat2,onehot_feat3,onehot_feat4,onehot_feat5,onehot_feat6,onehot_feat7,onehot_feat8,onehot_feat9,onehot_feat10,onehot_feat11,onehot_feat12,onehot_feat13,onehot_feat14,onehot_feat15,onehot_feat16,onehot_feat17
0,0,high_active,0,0,0,5,"(0,10]",0,0,0,0,107,61-90,0,1,17,638,2.0,0,1,6,184,6,3,0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,full_active,0,0,0,386,"(250,500]",4,"[1,10)",2,"[1,5)",327,181-365,0,3,25,1021,0.0,0,1,6,186,6,2,0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,full_active,0,0,0,27,"(10,50]",0,0,0,0,116,91-180,0,6,8,402,0.0,0,0,2,51,2,3,0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,full_active,0,0,0,16,"(10,50]",0,0,0,0,105,61-90,0,1,8,281,0.0,0,0,34,251,3,2,0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,full_active,0,0,0,122,"(100,150]",4,"[1,10)",0,0,225,181-365,0,1,8,316,1.0,0,1,46,99,4,2,0,0.0,0.0,0.0,0.0,0.0,0.0



Basic Statistics:


Unnamed: 0,user_id,is_lowactive_period,is_live_streamer,is_video_author,follow_user_num,fans_user_num,friend_user_num,register_days,onehot_feat0,onehot_feat1,onehot_feat2,onehot_feat3,onehot_feat4,onehot_feat5,onehot_feat6,onehot_feat7,onehot_feat8,onehot_feat9,onehot_feat10,onehot_feat11,onehot_feat12,onehot_feat13,onehot_feat14,onehot_feat15,onehot_feat16,onehot_feat17
count,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,6975.0,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,7099.0,7101.0,7101.0,7102.0,7102.0,7102.0
mean,3587.5,0.000418,0.006828,0.169593,197.327899,12.553094,4.494844,296.790691,0.39228,2.670569,15.852007,621.423634,1.051613,0.011845,0.567447,18.761148,168.661511,3.83194,2.264353,0.137124,0.298774,0.104633,0.094775,0.018586,0.017882,0.014503
std,2071.677098,0.020444,0.082357,0.375301,426.543245,181.017537,44.897861,286.38132,0.488293,1.782502,8.219267,305.770169,1.000102,0.270757,0.499945,13.118674,96.254783,1.747046,1.063131,0.500184,0.457753,0.306102,0.292925,0.135068,0.132533,0.11956
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1793.75,0.0,0.0,0.0,9.0,0.0,0.0,132.0,0.0,1.0,8.0,363.0,0.0,0.0,0.0,6.0,88.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3587.5,0.0,0.0,0.0,33.0,2.0,0.0,225.0,0.0,2.0,17.0,632.0,1.0,0.0,1.0,16.0,167.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5381.25,0.0,0.0,0.0,130.0,6.0,1.0,324.0,1.0,4.0,24.0,918.0,1.0,0.0,1.0,33.0,255.0,5.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,7175.0,1.0,1.0,1.0,2100.0,11401.0,1425.0,2245.0,1.0,7.0,29.0,1075.0,11.0,9.0,2.0,46.0,339.0,6.0,4.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0



Missing Values:


onehot_feat4     201
onehot_feat12     77
onehot_feat13     75
onehot_feat14     75
onehot_feat15     74
onehot_feat16     74
onehot_feat17     74
dtype: int64


Duplicate rows: 0



In [13]:
display_dataset_info(captions, "Captions")


=== Captions Dataset ===
Shape: (10732, 10)
Columns: ['video_id', 'manual_cover_text', 'caption', 'topic_tag', 'first_level_category_id', 'first_level_category_name', 'second_level_category_id', 'second_level_category_name', 'third_level_category_id', 'third_level_category_name']

Data Types:
video_id                       object
manual_cover_text              object
caption                        object
topic_tag                      object
first_level_category_id       float64
first_level_category_name      object
second_level_category_id      float64
second_level_category_name     object
third_level_category_id       float64
third_level_category_name      object
dtype: object

Sample Data:


Unnamed: 0,video_id,manual_cover_text,caption,topic_tag,first_level_category_id,first_level_category_name,second_level_category_id,second_level_category_name,third_level_category_id,third_level_category_name
0,0,UNKNOWN,精神小伙路难走 程哥你狗粮慢点撒,[],8.0,颜值,673.0,颜值随拍,-124.0,UNKNOWN
1,1,UNKNOWN,,[],27.0,高新数码,-124.0,UNKNOWN,-124.0,UNKNOWN
2,2,UNKNOWN,晚饭后，运动一下！,[],9.0,喜剧,727.0,搞笑互动,-124.0,UNKNOWN
3,3,UNKNOWN,我平淡无奇，惊艳不了时光，温柔不了岁月，我只想漫无目的的走走，努力发笔小财，给自己买花 自己长大.,[],26.0,摄影,686.0,主题摄影,2434.0,景物摄影
4,4,五爱街最美美女 一天1q,#搞笑 #感谢快手我要上热门 #五爱市场 这真是完美搭配啊！,"[五爱市场,感谢快手我要上热门,搞笑]",5.0,时尚,737.0,营销售卖,2596.0,女装



Basic Statistics:


Unnamed: 0,first_level_category_id,second_level_category_id,third_level_category_id
count,10728.0,10728.0,10724.0
mean,16.630593,266.66126,661.434446
std,15.853232,341.524977,1104.049489
min,-124.0,-124.0,-124.0
25%,7.0,-124.0,-124.0
50%,15.0,223.0,-124.0
75%,28.0,682.0,1524.0
max,667.0,2377.0,2676.0



Missing Values:


video_id                         1
caption                       1355
topic_tag                        4
first_level_category_id          4
first_level_category_name        4
second_level_category_id         4
second_level_category_name       4
third_level_category_id          8
third_level_category_name        8
dtype: int64


Duplicate rows: 0



In [14]:
sm_users = small_matrix['user_id'].nunique()
sm_video = small_matrix['video_id'].nunique()
print(f"Small Matrix:")
print(f"  - Unique Users: {sm_users}")
print(f"  - Unique Items: {sm_video}")

Small Matrix:
  - Unique Users: 1411
  - Unique Items: 3327


In [15]:
bg_users = big_matrix['user_id'].nunique()
bg_video = big_matrix['video_id'].nunique()
print(f"Big Matrix:")
print(f"  - Unique Users: {bg_users}")
print(f"  - Unique Items: {bg_video}")

Big Matrix:
  - Unique Users: 7176
  - Unique Items: 10728


In [16]:
print(f"User Features:")
print(f"  - Total Users: {len(user_features)}")

User Features:
  - Total Users: 7176


In [17]:
print(f"Item Features:")
print(f"  - Total Items: {item_features['video_id'].nunique()}")

Item Features:
  - Total Items: 10728


In [18]:
timestamp_cols = []
if small_matrix is not None:
    timestamp_cols.extend([col for col in small_matrix.columns if 'timestamp' in col.lower()])
if big_matrix is not None:
    timestamp_cols.extend([col for col in big_matrix.columns if 'timestamp' in col.lower()])

In [19]:
if timestamp_cols:
    print(f"Timestamp columns found: {timestamp_cols}")
    
    # Analyze timestamp range for each column
    for col in timestamp_cols:
        if 'small_matrix' in locals() and col in small_matrix.columns:
            df = small_matrix
            matrix_name = "Small Matrix"
        elif 'big_matrix' in locals() and col in big_matrix.columns:
            df = big_matrix
            matrix_name = "Big Matrix"
        else:
            continue
            
        try:
            # Convert timestamp to datetime if it's numeric
            if pd.api.types.is_numeric_dtype(df[col]):
                min_time = df[col].min()
                max_time = df[col].max()
                
                # Try to convert unix timestamp
                try:
                    min_time_dt = datetime.fromtimestamp(min_time)
                    max_time_dt = datetime.fromtimestamp(max_time)
                    print(f"{matrix_name} - {col}:")
                    print(f"  - Min Time: {min_time} ({min_time_dt})")
                    print(f"  - Max Time: {max_time} ({max_time_dt})")
                    print(f"  - Range: {max_time - min_time} seconds ({(max_time_dt - min_time_dt).days} days)")
                except:
                    print(f"{matrix_name} - {col}:")
                    print(f"  - Min: {min_time}")
                    print(f"  - Max: {max_time}")
                    print(f"  - Range: {max_time - min_time}")
            else:
                print(f"{matrix_name} - {col} is not numeric. Sample values:")
                print(df[col].sample(5).tolist())
        except Exception as e:
            print(f"Error analyzing {col}: {e}")
else:
    print("No timestamp columns identified.")
    
    # Even if no timestamp columns, check item_daily_features for date information
    if item_features is not None and 'date' in item_features.columns:
        print("\nDate information found in item_daily_features:")
        min_date = item_features['date'].min()
        max_date = item_features['date'].max()
        print(f"  - Date range: {min_date} to {max_date}")

Timestamp columns found: ['timestamp', 'timestamp']


Small Matrix - timestamp:
  - Min: 1593800606.06
  - Max: 1599321443.683
  - Range: 5520837.623000145
Small Matrix - timestamp:
  - Min: 1593800606.06
  - Max: 1599321443.683
  - Range: 5520837.623000145
