In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
from pandas.io.json import json_normalize

plt.style.use('ggplot')
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']  #解决seaborn中文字体显示问题
plt.rc('figure', figsize=(10, 10))  #把plt默认的图片size调大一点
plt.rcParams["figure.dpi"] =mpl.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题
%matplotlib inline

In [2]:
data = pd.read_csv('.\caixukun.csv')  

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102313 entries, 0 to 102312
Data columns (total 20 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   attitudes_count         102313 non-null  int64 
 1   comments_count          102313 non-null  int64 
 2   reposts_count           102313 non-null  int64 
 3   mid                     102313 non-null  int64 
 4   raw_text                102313 non-null  object
 5   source                  102188 non-null  object
 6   user.description        4569 non-null    object
 7   user.follow_count       102313 non-null  int64 
 8   user.followers_count    102313 non-null  int64 
 9   user.gender             102313 non-null  object
 10  user.id                 102313 non-null  int64 
 11  user.mbrank             102313 non-null  int64 
 12  user.mbtype             102313 non-null  int64 
 13  user.profile_url        102313 non-null  object
 14  user.profile_image_url  102313 non-n

In [4]:
data.sample(5)

Unnamed: 0,attitudes_count,comments_count,reposts_count,mid,raw_text,source,user.description,user.follow_count,user.followers_count,user.gender,user.id,user.mbrank,user.mbtype,user.profile_url,user.profile_image_url,user.screen_name,user.statuses_count,user.urank,user.verified,user.verified_reason
43728,0,0,0,4348404803079035,I have had my invitation to this world's festi...,天生会转 OPPO N3,,0,1,m,7013022868,0,0,https://m.weibo.cn/u/7013022868?uid=7013022868,https://tvax2.sinaimg.cn/crop.0.0.640.640.180/...,用户7013022868,19,4,False,
98022,0,0,0,4348657018378330,Died as the quiet beauty of autumn leaves,Android,,0,1,m,7011851850,0,0,https://m.weibo.cn/u/7011851850?uid=7011851850,https://tvax4.sinaimg.cn/crop.0.0.640.640.180/...,用户7011851850,69,3,False,
88346,0,0,0,4348640510090710,#东方风云榜让世界看见蔡徐坤# 你是眼中的星光，一笑明亮了整个世界@蔡徐坤,前置双摄vivo X9Plus,,2,1,f,6763514708,0,0,https://m.weibo.cn/u/6763514708?uid=6763514708,https://tvax1.sinaimg.cn/default/images/defaul...,用户6763514708,16,0,False,
93086,0,0,0,4348653549887968,Up late too late//@是积极努力的心心唷:#东方风云榜让世界看见蔡徐坤# 可...,Android,,0,1,m,6940170479,0,0,https://m.weibo.cn/u/6940170479?uid=6940170479,https://tvax4.sinaimg.cn/crop.0.0.536.536.180/...,坤坤屁屁hHo163,114,4,False,
14048,0,0,0,4348320639989904,这眼能所见的， 耳能听闻的不是肩上梅枝， 不是山中细雨， 不是苦酒山河。 而是你，这世间。,vivo AI智慧拍照X21,,0,1,m,7016858862,0,0,https://m.weibo.cn/u/7016858862?uid=7016858862,https://tvax1.sinaimg.cn/crop.0.0.640.640.180/...,小傻坤aSz815,31,3,False,


#### 1. 数据清洗
由于数据入库的时候没有进行清洗，所以数据多出了很多没用的字段，需要先清洗掉

In [5]:
data.columns

Index(['attitudes_count', 'comments_count', 'reposts_count', 'mid', 'raw_text',
       'source', 'user.description', 'user.follow_count',
       'user.followers_count', 'user.gender', 'user.id', 'user.mbrank',
       'user.mbtype', 'user.profile_url', 'user.profile_image_url',
       'user.screen_name', 'user.statuses_count', 'user.urank',
       'user.verified', 'user.verified_reason'],
      dtype='object')

In [6]:
in_columns = ['attitudes_count', 'comments_count', 'reposts_count', 'mid', 'raw_text', 
          'source', 'user.description', 'user.follow_count', 'user.followers_count', 
          'user.gender', 'user.id', 'user.mbrank', 'user.mbtype', 'user.profile_url', 
          'user.profile_image_url', 'user.screen_name', 'user.statuses_count', 
          'user.urank', 'user.verified', 'user.verified_reason']

In [7]:
data = data[in_columns]

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102313 entries, 0 to 102312
Data columns (total 20 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   attitudes_count         102313 non-null  int64 
 1   comments_count          102313 non-null  int64 
 2   reposts_count           102313 non-null  int64 
 3   mid                     102313 non-null  int64 
 4   raw_text                102313 non-null  object
 5   source                  102188 non-null  object
 6   user.description        4569 non-null    object
 7   user.follow_count       102313 non-null  int64 
 8   user.followers_count    102313 non-null  int64 
 9   user.gender             102313 non-null  object
 10  user.id                 102313 non-null  int64 
 11  user.mbrank             102313 non-null  int64 
 12  user.mbtype             102313 non-null  int64 
 13  user.profile_url        102313 non-null  object
 14  user.profile_image_url  102313 non-n

In [9]:
data.to_csv('.\caixukun.csv', index=False)

问题：
1. 蔡徐坤的微博转发是否存在假流量？
2. 真假流量所占的比例各有多少？
3. 假流量粉丝是如何生产出来的？
4. 真流量粉的粉丝画像

### 1. 蔡徐坤的微博转发是否存在假流量？

In [10]:
# 先来看看蔡徐坤的粉丝性别比例
fans_num = data['user.gender'].value_counts()
fans_num

m    93618
f     8695
Name: user.gender, dtype: int64

In [11]:
!pip install pyecharts==0.5.6

Collecting pyecharts==0.5.6
  Using cached pyecharts-0.5.6-py2.py3-none-any.whl (113 kB)
Installing collected packages: pyecharts
  Attempting uninstall: pyecharts
    Found existing installation: pyecharts 0.1.9.4
    Uninstalling pyecharts-0.1.9.4:
      Successfully uninstalled pyecharts-0.1.9.4
Successfully installed pyecharts-0.5.6


In [12]:
from pyecharts import Bar

bar = Bar("蔡徐坤粉丝性别比例初探", width = 600,height=500)
bar.add("(总数据102313条)", ['男', '女'], fans_num.values, is_stack=True, 
       xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True)
bar

In [13]:
np.round(fans_num/fans_num.sum()*100, 2)

m    91.5
f     8.5
Name: user.gender, dtype: float64

In [14]:
data[data['user.gender']=='m'].sample(5)

Unnamed: 0,attitudes_count,comments_count,reposts_count,mid,raw_text,source,user.description,user.follow_count,user.followers_count,user.gender,user.id,user.mbrank,user.mbtype,user.profile_url,user.profile_image_url,user.screen_name,user.statuses_count,user.urank,user.verified,user.verified_reason
101866,0,0,0,4348681593485483,The darkness is no darkness with thee。,Android,,0,1,m,7013580977,0,0,https://m.weibo.cn/u/7013580977?uid=7013580977,https://tvax2.sinaimg.cn/crop.0.0.640.640.180/...,用户7013580977,76,4,False,
43709,0,0,0,4348376889175043,愿你的星途一-片坦荡，加油蔡徐坤//@小葵的芒果冰_802:希望千千能健康成长🌈,Android,,0,1,m,7011836726,0,0,https://m.weibo.cn/u/7011836726?uid=7011836726,https://tvax3.sinaimg.cn/crop.0.0.640.640.180/...,是葵麻麻xxz112,27,3,False,
43482,0,0,0,4348375098743189,谦虚的美德，你的美德//@石头打瞌睡:#东方风云榜让世界看见蔡徐坤# | #蔡徐坤的未完成...,Android,,0,1,m,7011849137,0,0,https://m.weibo.cn/u/7011849137?uid=7011849137,https://tvax1.sinaimg.cn/crop.0.0.640.640.180/...,坤念念Eij528,73,3,False,
91038,0,0,0,4348648370670333,People blame me and call me heedless; I doubt ...,Android,,0,1,m,7012423612,0,0,https://m.weibo.cn/u/7012423612?uid=7012423612,https://tvax3.sinaimg.cn/crop.0.0.640.640.180/...,发带小葵ndf307,63,2,False,
33544,0,0,0,4348404647771039,Four#东方风云榜让世界看见蔡徐坤#//@刘爱坤Alan:[酸]摸摸千千狗头#东方风云榜让...,红米Redmi,,0,1,m,7012513573,0,0,https://m.weibo.cn/u/7012513573?uid=7012513573,https://tvax4.sinaimg.cn/crop.18.0.640.640.180...,坤不孤单82Y393,11,3,False,


### 2. 真假流量所占的比例各有多少？

In [17]:
data=data.fillna({'user.description':'wu'})

In [18]:
data_fake = data[((data['user.follow_count']<=5)|(data['user.followers_count']<=5))&
                 (data['user.description']=='wu')&
                 (data['comments_count']==0)&
                (data['attitudes_count']==0)&
                (data['reposts_count']==0)&
                (data['user.mbrank']==0)]
data_fake.sample(5)

Unnamed: 0,attitudes_count,comments_count,reposts_count,mid,raw_text,source,user.description,user.follow_count,user.followers_count,user.gender,user.id,user.mbrank,user.mbtype,user.profile_url,user.profile_image_url,user.screen_name,user.statuses_count,user.urank,user.verified,user.verified_reason
35642,0,0,0,4348403359924493,Oh how'd you'd ever do that to me,Android,wu,0,1,m,7011740068,0,0,https://m.weibo.cn/u/7011740068?uid=7011740068,https://tvax4.sinaimg.cn/crop.0.0.100.100.180/...,用户7011740068,31,3,False,
100272,0,0,0,4348663209883922,It was my part at this feast to play upon my i...,Android,wu,0,1,m,7011843777,0,0,https://m.weibo.cn/u/7011843777?uid=7011843777,https://tvax1.sinaimg.cn/crop.0.0.640.640.180/...,奎葵麻麻w9Z380,50,3,False,
1691,0,0,0,4348015164961680,Be a sensible person in complex times//@余申小姐:千千拜拜,Android,wu,0,1,m,7012048922,0,0,https://m.weibo.cn/u/7012048922?uid=7012048922,https://tvax3.sinaimg.cn/crop.0.0.640.640.180/...,旺仔坤8aZ515,8,4,False,
52880,0,0,0,4348428752111120,终于等到你的专属舞台//@Elvirababe-:健健康康的成长//@AK47-HIAHIA...,Flyme,wu,0,1,m,7018840286,0,0,https://m.weibo.cn/u/7018840286?uid=7018840286,https://tvax2.sinaimg.cn/crop.0.0.640.640.180/...,吧唧坤呀nLM765,40,3,False,
55381,0,0,0,4348450864669327,The market day is over and work is all done fo...,Android,wu,0,1,m,7011850230,0,0,https://m.weibo.cn/u/7011850230?uid=7011850230,https://tvax2.sinaimg.cn/crop.0.0.640.640.180/...,用户7011850230,3,0,False,


In [19]:
data_fake.shape

(95326, 20)

In [20]:
# 昵称里包含“用户”的，基本上可以断定是假粉丝
data_fake2_index = data[(data['user.follow_count']>5)&
                        (data['user.followers_count']>5)&
                        (data['user.screen_name'].str.contains('用户'))].index

In [21]:
# 把假的流量粉丝转发组合起来
data_fake = pd.concat([data_fake, data.iloc[data_fake2_index]])

In [22]:
data_fake.shape

(95397, 20)

In [23]:
# 取出真粉的转发
data_true = data.drop(data_fake.index)

In [24]:
data_true.shape

(6916, 20)

In [25]:
print('真粉丝转发数占总转发数的{}%'.format(np.round(data_true.shape[0]/data.shape[0]*100, 2)))
print('假粉丝转发数占总转发数的{}%'.format(np.round(data_fake.shape[0]/data.shape[0]*100, 2)))

真粉丝转发数占总转发数的6.76%
假粉丝转发数占总转发数的93.24%


In [26]:
bar = Bar("蔡徐坤真假流量的转发量", width = 600,height=500)
bar.add("(总数据102313条)", ['总转发量', '假粉丝转发量', '真粉丝转发量'], 
        [data.shape[0], data_fake.shape[0], data_true.shape[0]], is_stack=True, 
       xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True)
bar

In [27]:
real_fans_num = data_true.drop_duplicates(subset='user.id').shape[0]

In [28]:
bar = Bar("蔡徐坤真假流量的转发量与真实转发粉丝量(总数据102313条)", width = 600,height=500)
bar.add('', ['总转发量', '假粉丝转发量', '真粉丝转发量', '真实转发粉丝量'], 
        [data.shape[0], data_fake.shape[0], data_true.shape[0], real_fans_num], is_stack=True, 
       xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True, xaxis_rotate=20)
bar

In [29]:
print('真实转发粉丝量占总转发数的{}%'.format(np.round(real_fans_num/data.shape[0]*100, 2)))

真实转发粉丝量占总转发数的3.84%


-----------------吴青峰微博数据做对比-----------------

In [30]:
db = conn.get_database('WuQingFeng')  # WuQingFeng

repost = db.get_collection('repost') # repost
mon_data = repost.find()  # 查询这个集合下的所有记录

NameError: name 'conn' is not defined

In [194]:
wqf_data = json_normalize([comment for comment in mon_data])

In [195]:
wqf_data = wqf_data[in_columns]

In [196]:
wqf_data.shape

(10006, 20)

In [229]:
wqf_data_fake = wqf_data[((wqf_data['user.follow_count']<=5)|(wqf_data['user.followers_count']<=5))&
                         (wqf_data['user.description']=='')&
                         (wqf_data['comments_count']==0)&
                         (wqf_data['attitudes_count']==0)&
                         (wqf_data['reposts_count']==0)&
                         (wqf_data['user.mbrank']==0)]

wqf_data_fake2_index = wqf_data[(wqf_data['user.follow_count']>5)&
                                (wqf_data['user.followers_count']>5)&
                                (wqf_data['user.screen_name'].str.contains('用户'))].index
wqf_data_fake = pd.concat([wqf_data_fake, wqf_data.iloc[wqf_data_fake2_index]])
wqf_data_true = wqf_data.drop(wqf_data_fake.index)

In [230]:
print('吴青峰真粉丝转发数占总转发数的{}%'.format(np.round(wqf_data_true.shape[0]/wqf_data.shape[0]*100, 2)))
print('吴青峰假粉丝转发数占总转发数的{}%'.format(np.round(wqf_data_fake.shape[0]/wqf_data.shape[0]*100, 2)))

吴青峰真粉丝转发数占总转发数的96.52%
吴青峰假粉丝转发数占总转发数的3.48%


In [231]:
bar = Bar("吴青峰真假流量的转发量", width = 600,height=500)
bar.add("(总数据10006条)", ['总转发量', '假粉丝转发量', '真粉丝转发量'], 
        [wqf_data.shape[0], wqf_data_fake.shape[0], wqf_data_true.shape[0]], is_stack=True,
        xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True)
bar

In [232]:
wqf_real_fans_num = wqf_data_true.drop_duplicates(subset='user.id').shape[0]

bar = Bar("吴青峰真假流量的转发量与真实转发粉丝量(总数据10006条)", width = 600,height=500)
bar.add('', ['总转发量', '假粉丝转发量', '真粉丝转发量', '真实转发粉丝量'], 
        [wqf_data.shape[0], wqf_data_fake.shape[0], wqf_data_true.shape[0], 
         wqf_real_fans_num], is_stack=True, 
        xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True, xaxis_rotate=20)
bar

In [31]:
wqf_data.sample(5)

NameError: name 'wqf_data' is not defined

In [32]:
data.sample(5)

Unnamed: 0,attitudes_count,comments_count,reposts_count,mid,raw_text,source,user.description,user.follow_count,user.followers_count,user.gender,user.id,user.mbrank,user.mbtype,user.profile_url,user.profile_image_url,user.screen_name,user.statuses_count,user.urank,user.verified,user.verified_reason
37873,0,0,0,4348396230023448,一起走花路吧蔡//@华小葵想改变逸:#东方风云榜让世界看见蔡徐坤# 再见了您呢千千//@A...,Flyme,wu,0,1,m,7004626118,0,0,https://m.weibo.cn/u/7004626118?uid=7004626118,https://tvax4.sinaimg.cn/crop.0.0.100.100.180/...,蔡老师jx0975,28,0,False,
28399,0,0,0,4348375337898861,善待动物,vivo AI智慧拍照X21,wu,0,1,m,7011872075,0,0,https://m.weibo.cn/u/7011872075?uid=7011872075,https://tvax1.sinaimg.cn/crop.0.0.640.640.180/...,大坤超凶LxY354,33,4,False,
28118,0,0,0,4348388453483713,#东方风云榜让世界看见蔡徐坤#As much as I should,Android,wu,0,0,m,7019662207,0,0,https://m.weibo.cn/u/7019662207?uid=7019662207,https://tvax4.sinaimg.cn/crop.0.0.300.300.180/...,旺仔坤8Vz880,9,2,False,
87088,0,0,0,4348632469585329,不过惊鸿一瞥，误见你眉眼，这欢喜，够我喜欢多年。,Android,wu,0,1,m,7018266131,0,0,https://m.weibo.cn/u/7018266131?uid=7018266131,https://tvax1.sinaimg.cn/crop.0.0.300.300.180/...,大奎的腰QlJ252,16,0,False,
22574,0,0,0,4348384862892207,细数来 你曾是 孤鸿一瞥时,Android,wu,0,1,m,7014559799,0,0,https://m.weibo.cn/u/7014559799?uid=7014559799,https://tvax3.sinaimg.cn/crop.0.0.300.300.180/...,我坤貂呢Hnl494,45,3,False,


### 3. 假流量粉丝是如何生产出来的？

In [33]:
data_fake_gender = data_fake.drop_duplicates(subset='user.id')['user.gender'].value_counts()
data_fake_gender

m    38969
f     1869
Name: user.gender, dtype: int64

In [34]:
data_fake[data_fake['user.gender']=='f'].sample(5)

Unnamed: 0,attitudes_count,comments_count,reposts_count,mid,raw_text,source,user.description,user.follow_count,user.followers_count,user.gender,user.id,user.mbrank,user.mbtype,user.profile_url,user.profile_image_url,user.screen_name,user.statuses_count,user.urank,user.verified,user.verified_reason
59317,0,0,0,4348408808569658,As much as I should//@储蓄卡的光萌萌:#东方风云榜让世界看见蔡徐坤#,三星Galaxy NOTE III,wu,60,1,f,6788943265,0,0,https://m.weibo.cn/u/6788943265?uid=6788943265,https://tvax2.sinaimg.cn/crop.0.0.180.180.180/...,我的昵称呢-呢呢,16,4,False,
11424,0,0,0,4348295666972100,希望一起走过所有四季，寻着太阳升沉的轨迹，遇见你-蔡徐坤,Android,wu,203,1,f,6640363879,0,0,https://m.weibo.cn/u/6640363879?uid=6640363879,https://tvax3.sinaimg.cn/crop.0.0.180.180.180/...,巫璩计旱,114,4,False,
19494,0,0,0,4348369818082387,1//@蔡先生的小布点:#东方风云榜让世界看见蔡徐坤# //@CXK-FANSCLUB2:千...,Android客户端,wu,69,1,f,6868539583,0,0,https://m.weibo.cn/u/6868539583?uid=6868539583,https://tvax3.sinaimg.cn/crop.0.0.664.664.180/...,kun的旺仔,1185,4,False,
84829,0,0,0,4348629349176654,I'd stop if I could,Android,wu,57,1,f,6673457179,0,0,https://m.weibo.cn/u/6673457179?uid=6673457179,https://tvax2.sinaimg.cn/crop.0.0.200.200.180/...,scorpion-佳哥婳婳花花,451,4,False,
35614,0,0,0,4348401329916286,[吃瓜]//@从来没有人能在我的小号里打败我:B@蔡徐坤,Android,wu,0,1,f,6736543453,0,0,https://m.weibo.cn/u/6736543453?uid=6736543453,https://tvax2.sinaimg.cn/default/images/defaul...,用户6736543453,340,4,False,


In [35]:
bar = Bar("蔡徐坤假粉丝性别比例", width = 600,height=500)
bar.add("(假粉丝总数为40838)", ['男', '女'], data_fake_gender.values, is_stack=True, 
       xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True)
bar

In [36]:
38969/40838

0.954233801851217

In [37]:
data_fake['raw_text'].value_counts()

转发微博                                                                                        429
I am only waiting for love to give myself up at last into his hands.                        375
想你//@蔡徐坤的南岸末阴大小姐:#东方风云榜让世界看见蔡徐坤# /#蔡徐坤的未完成#祝千千在新家能快快乐乐 健健康康的@蔡徐坤                            289
我心悦你//@蔡徐坤的南岸末阴大小姐:#东方风云榜让世界看见蔡徐坤# /#蔡徐坤的未完成#祝千千在新家能快快乐乐 健健康康的@蔡徐坤                          288
爱你//@蔡徐坤的南岸末阴大小姐:#东方风云榜让世界看见蔡徐坤# /#蔡徐坤的未完成#祝千千在新家能快快乐乐 健健康康的@蔡徐坤                            278
                                                                                           ... 
Left to the East to go West, Gu, the dead must not return to//@心情不好就吃吃吃:再见千千，在新家一切顺利          1
Familiarity breeds contempt.//@CandyTlll:#东方风云榜让世界看见蔡徐坤# 要一直幸福哦 千千                            1
从头发到脚都是完美//@蔡瑶瑶诶诶诶:我也很任性的@蔡徐坤//@CXK-FANSCLUB6:see you again                                   1
The devil finds work for idle hands to do.//@柳觞儿:#蔡徐坤[超话]# 🐾#东方风云榜让世界看见蔡徐坤# 千千 快乐健康哦@蔡徐坤      1
le pietre della strada,//@大头在和坤哥约会--Augu

In [38]:
fake_source = data_fake['source'].value_counts()[:10]

In [39]:
bar = Bar("蔡徐坤假粉丝Top10转发设备", width = 600,height=600)
bar.add("", fake_source.index, fake_source.values, is_stack=True, 
       xaxis_label_textsize=11, yaxis_label_textsize=14, is_label_show=True, xaxis_rotate=30)
bar

In [40]:
data_fake['user.follow_count'].mean()

3.4412612555950397

In [41]:
data_fake['user.followers_count'].mean()

1.04576663836389

In [42]:
data_fake_sample = data_fake.sample(5)

In [43]:
data_fake_sample['user.screen_name']

92669      坤吃饱了CtG554
91996      我家坤坤bse745
91236      最爱的菜FFN031
76347      坤妻最棒dbK954
88035    用户7019454229
Name: user.screen_name, dtype: object

In [44]:
data_fake_sample['user.profile_image_url'].values

array(['https://tvax4.sinaimg.cn/crop.1.0.98.98.180/007DYbPoly8g0d0tzqvspj302s02qq32.jpg',
       'https://tvax1.sinaimg.cn/crop.0.0.640.640.180/007Ex04Zly8g0kco4v4ygj30hs0ht75p.jpg',
       'https://tvax1.sinaimg.cn/crop.0.0.640.640.180/007Ezi2oly8g0kb3xq0lkj30hs0hsjsb.jpg',
       'https://tvax3.sinaimg.cn/crop.0.0.640.640.180/007EwSyaly8g0kb4aegjuj30hs0hsjs7.jpg',
       'https://tvax2.sinaimg.cn/crop.0.0.640.640.180/007F2UHHly8g0pge0h1bsj30hs0hs0tp.jpg'],
      dtype=object)

In [45]:
data_fake.sample(5)['user.screen_name']

66341    用户6876954213
24654    用户6899308884
63134      小葵爱笑JJ4177
29746      守护困坤dqx931
19604       坤坤5BUS949
Name: user.screen_name, dtype: object

In [46]:
data_fake['user.screen_name'].str.contains('蔡|坤|葵|kun').sum()

41766

In [47]:
data_fake.shape[0]

95397

In [48]:
data_fake['user.statuses_count'].mean()

72.4942503433022

### 4. 真流量粉的粉丝画像

In [49]:
data_true.sample(5)

Unnamed: 0,attitudes_count,comments_count,reposts_count,mid,raw_text,source,user.description,user.follow_count,user.followers_count,user.gender,user.id,user.mbrank,user.mbtype,user.profile_url,user.profile_image_url,user.screen_name,user.statuses_count,user.urank,user.verified,user.verified_reason
47458,0,0,0,4348400360731569,32//@西瓜味的kk:#东方风云榜让世界看见蔡徐坤# [音乐] #蔡徐坤的未完成# 千千...,前置双摄vivo X9Plus,wu,87,13,m,6569732788,0,0,https://m.weibo.cn/u/6569732788?uid=6569732788,https://tvax3.sinaimg.cn/crop.0.0.996.996.180/...,小唐僧献,2481,9,False,
31872,0,0,0,4348398217223419,#东方风云榜让世界看见蔡徐坤# [可爱]#蔡徐坤的未完成# [太开心]//@August-小...,小米手机4,wu,122,110,f,6222219275,1,2,https://m.weibo.cn/u/6222219275?uid=6222219275,https://tvax2.sinaimg.cn/crop.0.0.996.996.180/...,Amygirl_P的坤坤,6599,14,False,
53677,0,0,0,4348436621433644,//@ikun涵宝0802:#东方风云榜让世界看见蔡徐坤# 加油[拳头]//@蔡徐坤正宫后援...,vivo X23 AI非凡摄影,披金成王，伴坤远航💛 💛,156,99,f,6605043319,1,11,https://m.weibo.cn/u/6605043319?uid=6605043319,https://tvax3.sinaimg.cn/crop.0.0.664.664.180/...,ikun涵涵0802,9116,17,False,
32341,0,0,0,4348402756657518,千千再见,vivo X23 AI非凡摄影,蔡徐坤的千军万马......中的一员😍😍😍,329,226,f,6518074126,3,11,https://m.weibo.cn/u/6518074126?uid=6518074126,https://tvax4.sinaimg.cn/crop.0.0.996.996.180/...,昏昏欲睡的小灰灰儿809,2382,20,False,
36225,0,0,0,4348406832434420,#东方风云榜让世界看见蔡徐坤# 正能量音乐人 蔡徐坤👆 👆 👆 蔡徐坤cxk,红米Redmi,花花世界，静守己心,80,14,f,6333619858,1,11,https://m.weibo.cn/u/6333619858?uid=6333619858,https://tvax2.sinaimg.cn/crop.0.0.664.664.180/...,加油加油坤,1766,9,False,


In [50]:
data_true_gender = data_true.drop_duplicates(subset='user.id')['user.gender'].value_counts()
data_true_gender

f    3287
m     639
Name: user.gender, dtype: int64

In [51]:
bar = Bar("蔡徐坤真粉丝性别比例", width = 600,height=500)
bar.add("(真粉丝总数为3926)", ['女', '男'], data_true_gender.values, is_stack=True, 
       xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True)
bar

In [52]:
data_true['raw_text'].value_counts()

转发微博                                                                                                                   1045
@蔡徐坤 我永远支持你！我们一起拿下 #明星势力榜# 第一名！                                                                                         622
#东方风云榜让世界看见蔡徐坤#                                                                                                          73
@蔡徐坤  我在#明星ALL榜[超话]#上为你加油啦，你是我今生唯一的执著哦。#蔡徐坤[超话]# 棒棒哒！快来为TA应援吧                                                            50
//@蔡徐坤工作室:#蔡徐坤[超话]#[给你小心心]#蔡徐坤的未完成#之宠物医院终于步入尾声，在短暂的相处时间里，因为工作忙碌无法养育小动物的@蔡徐坤 也获得了片刻的慰藉，感谢千千给我们带来了一段难忘的回忆[心]#蔡徐坤 ONE#      42
                                                                                                                       ... 
#东方风云榜让世界看见蔡徐坤# 天长地久有时尽，打榜绵绵无绝期！助@蔡徐坤 蔡徐坤一臂之力，争做全网数据第一!✨                                                                  1
[心]//@KUN的小喵咪:#东方风云榜让世界看见蔡徐坤#                                                                                             1
众里寻他千百度，

In [53]:
true_source = data_true['source'].value_counts()[:10]

In [54]:
bar = Bar("蔡徐坤真粉丝Top10转发设备", width = 600,height=600)
bar.add("", true_source.index, true_source.values, is_stack=True, 
       xaxis_label_textsize=11, yaxis_label_textsize=14, is_label_show=True, xaxis_rotate=30)
bar

In [55]:
data_true['user.follow_count'].mean()

222.0597165991903

In [56]:
data_true['user.followers_count'].mean()

178.9480913823019

In [57]:
data_true.sample(5)['user.screen_name']

15845    晓-岚海星1181
75379     HO-Wanna
41058        坤的lxy
45727     AprilASH
42249    奶菜的ikun鸭-
Name: user.screen_name, dtype: object

In [58]:
data_true['user.screen_name'].str.contains('蔡|坤|葵|kun').sum()

3153

In [59]:
data_true.shape[0]

6916

In [60]:
# 绘制蔡徐坤真粉丝的简介词云图
import jieba
from collections import Counter
from pyecharts import WordCloud

jieba.add_word('蔡徐坤')

swords = [x.strip() for x in open ('stopwords.txt')]

ModuleNotFoundError: No module named 'jieba'

In [61]:
def plot_word_cloud(data, swords, columns):
    text = ''.join(data[columns])
    words = list(jieba.cut(text))
    ex_sw_words = []
    for word in words:
        if len(word)>1 and (word not in swords):
            ex_sw_words.append(word)
    c = Counter()
    c = Counter(ex_sw_words)
    wc_data = pd.DataFrame({'word':list(c.keys()), 'counts':list(c.values())}).sort_values(by='counts', ascending=False).head(100)
    wordcloud = WordCloud(width=1300, height=620)
    wordcloud.add("", wc_data['word'], wc_data['counts'], word_size_range=[20, 100])
    return wordcloud

In [309]:
plot_word_cloud(data=data_true, swords=swords, columns='user.description')

In [310]:
plot_word_cloud(data=data_true, swords=swords, columns='raw_text')