In [23]:
import pandas as pd
import numpy as np
from pyecharts.charts import Bar,Line,Funnel,Pie,Gauge
from pyecharts.globals import ThemeType
from pyecharts import options as opts 
import warnings 
warnings.filterwarnings(action='ignore')
import matplotlib.pyplot as plt
import math
import seaborn as sns

#### 1. Data preprocessing

In [24]:
# Load the raw interaction log (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='./tiktok_dataset1.csv')
# Not rendered: a notebook cell only displays the value of its final expression
df.head(n=5)
# (rows, columns) of the freshly loaded frame
df.shape

(1737312, 14)

In [25]:
# View the type of data: prints the index range, the dtype of every column and the memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1737312 entries, 0 to 1737311
Data columns (total 14 columns):
 #   Column         Dtype  
---  ------         -----  
 0   Unnamed: 0     int64  
 1   uid            int64  
 2   user_city      float64
 3   item_id        int64  
 4   author_id      int64  
 5   item_city      float64
 6   channel        int64  
 7   finish         int64  
 8   like           int64  
 9   music_id       float64
 10  duration_time  int64  
 11  real_time      object 
 12  H              int64  
 13  date           object 
dtypes: float64(3), int64(9), object(2)
memory usage: 185.6+ MB


In [26]:
# Delete useless columns: 'Unnamed: 0' is not used by any later step of the analysis
# (presumably a row index written out with the CSV — confirm against the export).
# errors='ignore' keeps the cell re-runnable; a bare `del df['Unnamed: 0']` raises
# KeyError when the cell is executed a second time on the same kernel state.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [27]:
# Per-column flag: True when the column holds at least one missing value
df.isnull().any()

uid              False
user_city        False
item_id          False
author_id        False
item_city        False
channel          False
finish           False
like             False
music_id         False
duration_time    False
real_time        False
H                False
date             False
dtype: bool

In [28]:
# Remove fully duplicated rows and report the frame's shape before and after
print('Total data before removing duplicate values：',df.shape)
# drop_duplicates() returns a new frame and leaves df untouched, so the result
# has to be assigned back; otherwise nothing is removed and the "after" shape
# is simply the "before" shape again
df = df.drop_duplicates()
print('Total data after removing duplicate values：',df.shape)

Total data before removing duplicate values： (1737312, 13)
Total data after removing duplicate values： (1737312, 13)


In [29]:
# Modifying data types
# The city and music identifiers are read as floats; drop the fractional part
# and store them as strings so they behave as labels rather than quantities
for id_col in ('user_city', 'item_city', 'music_id'):
    df[id_col] = df[id_col].astype('int').astype('str')
# Parse the two timestamp columns with pd.to_datetime: the unit-less cast
# astype('datetime64') raises TypeError on pandas 2.x
for time_col in ('real_time', 'date'):
    df[time_col] = pd.to_datetime(df[time_col])

#### 2. User profile analysis

In [30]:
# Number of distinct users, i.e. unique uid values in the data
df['uid'].nunique()

59232

In [33]:
# Top 20 cities by number of distinct users
# One row per (uid, user_city) pair, so a user is counted once per city
user_info = df[['uid','user_city']].drop_duplicates()
# Users per city, largest first; the ranking is computed once and shared by both axes
city_rank = (
    user_info.groupby('user_city')['uid']
    .count()
    .sort_values(ascending=False)
    .head(20)
)
sumnum = city_rank.to_list()
cityno = city_rank.index.to_list()
bar = (
    Bar(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add_xaxis(cityno)
    .add_yaxis('',sumnum)
    .set_global_opts(title_opts=opts.TitleOpts(title='Top 20 most frequently used cities'))
    .set_series_opts(label_opts=opts.LabelOpts(is_show=True))
)
bar.render_notebook()

In [34]:
# Number of distinct users active in each hour H (plotted in units of 10,000 users)
# One row per (uid, H) pair, so a user is counted once per hour
user_H = df[['uid','H']].drop_duplicates()
users_per_hour = user_H.groupby('H')['uid'].count()
usum = (users_per_hour / 10000).round(1).to_list()
H = users_per_hour.index.to_list()
line = (
    Line(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add_xaxis(H)
    .add_yaxis('',usum,is_smooth=False,
               areastyle_opts=opts.AreaStyleOpts(color='black',opacity=0.3),
               itemstyle_opts=opts.ItemStyleOpts(color='red'))
    .set_global_opts(title_opts=opts.TitleOpts(title='Number of people using Tiktok by time period'))
    .set_series_opts(label_opts=opts.LabelOpts(is_show=True))
)
line.render_notebook()

In [35]:
# Share of views by day of the week
# dt.dayofweek numbers Monday..Sunday as 0..6; adding 1 gives 1..7
df['week'] = df['date'].dt.dayofweek + 1
views_per_weekday = df.groupby('week').size()
week = views_per_weekday.tolist()
label = views_per_weekday.index.map(str)

# Rich-text slice label: "<weekday>: <count>  <percent>%"
pie_label_opts = opts.LabelOpts(
    position="outside",
    formatter="{b|{b}: }{c}  {per|{d}%}  ",
    background_color="#eee",
    border_color="#aaa",
    border_width=1,
    border_radius=4,
    rich={
        "a": {"color": "#999", "lineHeight": 22, "align": "center"},
        "abg": {
            "backgroundColor": "#e3e3e3",
            "width": "100%",
            "align": "right",
            "height": 22,
            "borderRadius": [4, 4, 0, 0],
        },
        "hr": {
            "borderColor": "#aaa",
            "width": "100%",
            "borderWidth": 0.5,
            "height": 0,
        },
        "b": {"fontSize": 16, "lineHeight": 33},
        "per": {
            "color": "#eee",
            "backgroundColor": "#334455",
            "padding": [2, 4],
            "borderRadius": 2,
        },
    },
)

pie = Pie(init_opts=opts.InitOpts(theme=ThemeType.INFOGRAPHIC))
pie.add('',[[day, views] for day, views in zip(label,week)],radius=[0,100])
pie.set_global_opts(title_opts=opts.TitleOpts(title='Weekly viewership'))
pie.set_series_opts(label_opts=pie_label_opts)

pie.render_notebook()

In [36]:
# Completion rate and like rate, both measured over all view records (rows of df)
total_views = df['finish'].count()   # number of view records
finish = (df['finish'] == 1).sum()   # records flagged finish == 1
like = (df['like'] == 1).sum()       # records flagged like == 1
# 30% or more completion rate are considered good
finishRate = round((finish / total_views)*100,2)
# 5% or more is considered good
likeRate = round((like / total_views)*100,2)

# Both rates are shown on one gauge; the axis is coloured green up to 30%,
# blue up to 70% and red up to 100%
gauge_axis = opts.AxisLineOpts(linestyle_opts=opts.LineStyleOpts(
    color=[(0.3, "green"), (0.7, "blue"), (1, "red")], width=20))
c = (
    Gauge(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add("", [("Completion rate",finishRate),("Like rate",likeRate)],split_number=10,
         axisline_opts=gauge_axis)
    .set_global_opts(title_opts=opts.TitleOpts(title="Completion rate and Like rate"))
    .set_series_opts(label_opts=opts.LabelOpts(is_show=True))
)
c.render_notebook()

#### 3. Correlation Analysis

In [37]:
# PV (Page View): number of view records per day; repeated views by the same user all count.
# UV (Unique Visitor): number of distinct users per day; a user who views several times on the same day counts once.
daily_traffic = df.groupby('date')['uid'].agg(['count', 'nunique'])
PV = daily_traffic['count'].to_list()
# Distinct users *within each day*. Calling drop_duplicates('uid') before the
# groupby keeps only each user's first record in the whole data set, so every
# user was counted on a single day only (and the result could contain fewer
# dates than PV, shifting the series against the x axis).
UV = daily_traffic['nunique'].to_list()
# PV, UV and the x axis all come from the same groupby, so they stay aligned
data = list(daily_traffic.index)
line = Line()
line.add_xaxis(xaxis_data=data)

line.add_yaxis('PV',PV,is_smooth=True,
                areastyle_opts=opts.AreaStyleOpts(color = 'red',opacity=0.3))
line.add_yaxis('UV',UV,is_smooth=True,
                areastyle_opts=opts.AreaStyleOpts(color = 'blue',opacity=0.3))
line.set_global_opts(legend_opts=opts.LegendOpts(pos_right='10%',pos_top='2%'),
                title_opts=opts.TitleOpts(title='Page View and Unique Visitor',pos_left='40%'))
line.render_notebook()

In [38]:
# Funnel of distinct users: all users -> users with at least one finish == 1 record
# -> users with at least one like == 1 record
index = ['View', 'Completion', 'Like']
viewers = df['uid'].nunique()
finishers = df.loc[df['finish'] == 1, 'uid'].nunique()
likers = df.loc[df['like'] == 1, 'uid'].nunique()
data = [viewers, finishers, likers]
funnel = Funnel()
funnel.add('', [[stage, users] for stage, users in zip(index, data)])

funnel.render_notebook()

In [25]:
# Start the per-user tag table: a single 'uid' column with one row per distinct user
user = df['uid'].unique()
labels = pd.DataFrame({'uid': user})

In [26]:
# Divide the hours into 'Night', 'Morning', 'Afternoon' and 'Evening'.
# Fixed 6-hour edges replace `bins=4`, whose edges are derived from the min/max
# present in the data and therefore shift if an hour is missing.
# NOTE(review): assumes H is the hour of day, 0-23 — confirm; values outside
# that range become NaN and are left out of the groupby below.
# The dtype guard keeps the cell re-runnable: once H has been recoded it is
# categorical and pd.cut cannot bin it again.
if pd.api.types.is_numeric_dtype(df['H']):
    df['H'] = pd.cut(df['H'], bins=[0, 6, 12, 18, 24], right=False,
                     labels=['Night','Morning','Afternoon','Evening'])

# Views per user and time slot. observed=True keeps only the (uid, slot) pairs
# that actually occur instead of expanding to every uid x slot combination
df_time_active = (
    df.groupby(['uid', 'H'], as_index=False, observed=True)['item_id']
    .count()
    .rename(columns={'item_id': 'H_count'})
)

# The most viewed period for each user: the highest per-slot count
df_time_active_max = (
    df_time_active.groupby('uid', as_index=False)['H_count']
    .max()
    .rename(columns={'H_count': 'read_count_max'})
)

# Put each user's maximum next to every one of their slot rows
df_time_active = pd.merge(df_time_active, df_time_active_max, how='left', on='uid')

In [27]:
# For every user keep the time slot(s) whose view count equals the user's maximum;
# slots tied for the maximum are joined into one comma-separated string
is_peak_slot = df_time_active['H_count'] == df_time_active['read_count_max']
df_time_active_H = (
    df_time_active[is_peak_slot]
    .groupby('uid')['H']
    .agg(lambda slots: ','.join(slots))
    .reset_index()
)
df_time_active_H

Unnamed: 0,uid,H
0,0,Evening
1,1,Night
2,2,Night
3,3,"Night,Evening"
4,4,Evening
...,...,...
59227,70696,Morning
59228,70697,Afternoon
59229,70703,Morning
59230,70709,Afternoon


In [28]:
# Attach each user's most active time slot(s) to the tag table as column 'View'
labels = (
    labels
    .merge(df_time_active_H, how='left', on='uid')
    .rename(columns={'H': 'View'})
)
labels

Unnamed: 0,uid,View
0,15692,Evening
1,44071,Night
2,10902,Evening
3,25300,Night
4,3656,Evening
...,...,...
59227,9247,Night
59228,51731,Morning
59229,48996,Evening
59230,41289,Evening


In [29]:
# Mean duration_time (truncated to a whole number) of the records each user finished
df_finish_duration_time  = df[df['finish'] == 1].groupby('uid',as_index=False)['duration_time'].mean()
df_finish_duration_time['duration_time'] = df_finish_duration_time['duration_time'].astype(int)

# Label every user from their own average. The former Python loop assigned a
# scalar to the whole 'Duration' column on each iteration, so only the last
# user's value decided everybody's label; in addition `i>10 & i<23` was parsed
# as `i > (10 & i) < 23`, because & binds tighter than the comparisons.
avg_duration = df_finish_duration_time['duration_time']
df_finish_duration_time['Duration'] = np.select(
    [
        (avg_duration > 0) & (avg_duration <= 10),
        (avg_duration > 10) & (avg_duration < 23),
    ],
    ['Short', 'Medium'],
    # 23 and above; as in the original else-branch, non-positive averages also land here
    default='Long',
)
#Add user's preferred video duration to the user tag list.
# Only the key and the new tag are merged, onto labels cut back to its earlier
# columns, so re-running the cell does not produce Duration_x / Duration_y.
# Users without any finished record keep a missing Duration (left join).
labels = pd.merge(labels[['uid','View']], df_finish_duration_time[['uid','Duration']], how='left', on='uid')
labels

Unnamed: 0,uid,View,Duration
0,15692,Evening,Short
1,44071,Night,Short
2,10902,Evening,Short
3,25300,Night,Short
4,3656,Evening,Short
...,...,...,...
59227,9247,Night,Short
59228,51731,Morning,Short
59229,48996,Evening,Short
59230,41289,Evening,


In [30]:
# Per user: number of view records (count of item_id) and number of distinct
# active days (nunique of date).
# The columns keep their original names 'item_id' and 'date': the former
# rename(...) call discarded its result, and the code below reads 'date'.
df_active_freq  = df.groupby('uid', as_index=False).agg({'item_id': 'count', 'date': 'nunique'})

# Label every user from their own number of active days. The former Python loop
# assigned a scalar to the whole 'Frequency' column on each iteration, and
# `i>0 & i<=10` was parsed as `i > (0 & i) <= 10`, which is true for every
# positive i, so all users ended up 'Low'.
active_days = df_active_freq['date']
df_active_freq['Frequency'] = np.select(
    [
        (active_days > 0) & (active_days <= 10),
        (active_days > 10) & (active_days < 20),
    ],
    ['Low', 'Mid'],
    default='High',   # 20 active days or more
)

# Only the key and the new tag are merged, onto labels cut back to its earlier
# columns, so re-running the cell does not produce Frequency_x / Frequency_y
labels = pd.merge(labels[['uid','View','Duration']], df_active_freq[['uid','Frequency']], how='left', on='uid')
labels

Unnamed: 0,uid,View,Duration,Frequency
0,15692,Evening,Short,Low
1,44071,Night,Short,Low
2,10902,Evening,Short,Low
3,25300,Night,Short,Low
4,3656,Evening,Short,Low
...,...,...,...,...
59227,9247,Night,Short,Low
59228,51731,Morning,Short,Low
59229,48996,Evening,Short,Low
59230,41289,Evening,,Low
