# 对评论部分的数据分析
1. 对每部电影进行评论词云分析，查看特征
2. 对每部电影进行用户评分分析，查看评分占比
3. 根据用户对一部电影的综合评分对这些电影进行排名
4. 统计每位用户的评论次数，找出评论最多的前二十位用户
5. 对这几位用户的评论进行分词分析查看该结果能否一定程度上表现用户特征

 导入库

In [None]:
from pymongo import MongoClient
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import jieba
from wordcloud import WordCloud
# Plot configuration:
# 1. Chinese text rendering
# 2. Figure size and resolution
# 3. Suppress warning output

# Put CJK-capable fonts at the front of the sans-serif fallback list so the
# Chinese titles and axis labels used in this notebook can be drawn.
# Matplotlib uses the first listed font that is installed; the previous
# entries are kept as fallbacks, and re-running the cell adds no duplicates.
cjk_fonts = ['Microsoft YaHei', 'SimHei', 'Arial Unicode MS']
plt.rcParams['font.sans-serif'] = cjk_fonts + [
    name for name in plt.rcParams['font.sans-serif'] if name not in cjk_fonts
]
# Draw negative signs with the ASCII hyphen; CJK fonts often lack the
# Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.figsize'] = 20,6
plt.rcParams['figure.dpi'] = 200
warnings.filterwarnings('ignore')


连接数据库

In [None]:
# Open a client against the MongoDB server on localhost and bind the database
# plus the two collections that the following cells read from.
client = MongoClient('mongodb://localhost:27017/')
db = client.get_database('doubantop250_db')
info_collection = db.get_collection('movie_info_collection')
review_collection = db.get_collection('movie_review_collection')

设置停用词

In [None]:
# Load the stop words, one entry per line with surrounding whitespace removed.
# A set is used because the collection is only consulted through membership
# tests while filtering tokens, and a set lookup does not scan every entry.
with open('./stop_words.txt','r',encoding='utf-8') as f:
  stopwords={line.strip() for line in f}

处理、合并数据

In [None]:
# Load every movie document, keep only the id and title columns, and expose
# the id under the name ``movie_id``.
info_result = info_collection.find()
info_dataframe = pd.DataFrame(info_result)
movie_dataframe = info_dataframe.loc[:, ['id', 'title']].rename(columns={'id': 'movie_id'})
print(movie_dataframe)

In [None]:
# Load every review document and drop the _id column that comes back from
# the database.
results = review_collection.find()
review_dataframe = pd.DataFrame(results).drop(columns='_id')
# Ratings arrive as strings; convert them to float.
review_dataframe = review_dataframe.assign(
    user_rating=review_dataframe['user_rating'].astype(float)
)
print(review_dataframe)


In [None]:
# Build a view of the review table that is indexed by movie_id.
review_df_id_index = review_dataframe.set_index(keys='movie_id')
print(review_df_id_index)

In [None]:
# Attach the movie title to each review with an inner join on movie_id.
movie_review_dataframe = movie_dataframe.merge(
    review_dataframe,
    how='inner',
    left_on='movie_id',
    right_on='movie_id',
)
print(movie_review_dataframe)

# 对每部电影进行评论词云分析，查看特征

词云生成方法

In [None]:
def wc(text):
  """Segment ``text`` with jieba and display a word cloud of its tokens.

  Tokens found in the module-level ``stopwords`` collection are removed
  before the cloud is generated. The cloud is rendered with the font at
  ``./msyh.ttf`` and shown through matplotlib.

  Args:
    text: The raw text to segment and visualise.

  Returns:
    None. The figure is shown as a side effect. When the word cloud cannot
    be generated from the remaining text, a notice is printed and nothing
    is drawn.
  """
  # Build the lookup once per call so every membership test is a hash lookup
  # even when ``stopwords`` is a list.
  stopword_lookup = set(stopwords)
  kept_words = [word for word in jieba.cut(text) if word not in stopword_lookup]
  cloud = WordCloud(font_path='./msyh.ttf', width=800, height=400, background_color='white')
  try:
    cloud.generate(' '.join(kept_words))
  except ValueError as error:
    # WordCloud raises ValueError when no drawable word is left (e.g. empty
    # input or only stop words); skip this text instead of aborting the
    # caller's loop over many texts.
    print(f'无法生成词云: {error}')
    return
  plt.figure(figsize=(10,5))
  plt.imshow(cloud, interpolation='bilinear')
  plt.axis('off')
  plt.show()

In [None]:
# Concatenate all comments of each movie into one space-separated string.
comment_dataframes = movie_review_dataframe.groupby('title')['comment'].apply(' '.join)
# Draw a word cloud for the first ten movies only.
for title, joined_comments in comment_dataframes.iloc[:10].items():
  print(f"电影: {title}")
  wc(joined_comments)


对所有评分进行汇总，找出评分特征

In [None]:
# Select the ratings that are not 0 (presumably 0 marks a review without a
# rating; confirm against the scraper) and count how often each score occurs.
ratings = movie_review_dataframe.loc[movie_review_dataframe['user_rating'].ne(0), 'user_rating']
ratings_counts = ratings.value_counts()
print(ratings_counts)

In [None]:
# Bar chart: one bar per score, bar height = number of times it was given.
plt.bar(x=ratings_counts.index, height=ratings_counts.values)
plt.xlabel('分数')
plt.ylabel('次数')
plt.title('用户评分综合情况')
plt.show()

# 计算每部电影的用户评分并对电影排序

In [None]:
# Average user rating per movie, ranked from highest to lowest.
# Rows whose rating is 0 are left out, exactly as in the rating-distribution
# step of this notebook, so that they do not pull the averages down.
rated_reviews = movie_review_dataframe[movie_review_dataframe['user_rating'] != 0]
average_ratings = rated_reviews.groupby('title')['user_rating'].mean()
sorted_movies = average_ratings.sort_values(ascending=False)
print(sorted_movies)

In [None]:
# Bar chart of the per-movie average user rating, in the order of
# ``sorted_movies``.
plt.bar(x=sorted_movies.index, height=sorted_movies.values)
plt.xlabel('影片')
plt.ylabel('得分')
plt.title('用户综合评分排名')
# Rotate the tick labels to vertical.
plt.xticks(rotation=90)
plt.show()

查找出评论次数最多的前二十名用户

In [None]:
# Count how many comments each user has written and show the shares as a pie.
users = movie_review_dataframe.loc[:, ['user_id', 'comment']]
users_count = users['user_id'].value_counts()
color = ['white','blue','yellow','red','gray','orange','green','brown']
plt.pie(x=users_count, colors=color)
plt.title('用户评论次数统计')
# Equal axis scaling keeps the pie circular.
plt.axis('equal')
plt.show()

In [None]:
# Take the 20 users with the most comments and plot their comment counts.
top20_user=users_count.nlargest(20)
plt.bar(top20_user.index,top20_user.values)
plt.xlabel('用户ID')
plt.ylabel('评论次数')
plt.title('评论次数最多的前20名用户')
# The rotation is given as a number, as in the movie ranking chart: recent
# Matplotlib versions only accept a number, 'vertical' or 'horizontal' here
# and raise ValueError for the string '90'.
plt.xticks(rotation=90)
plt.show()

对这些用户的评论进行词云分析

In [None]:
# Draw one word cloud per top user from all of that user's comments.
for user in top20_user.index:
  user_comments = users.loc[users['user_id'] == user, 'comment']
  text = ' '.join(user_comments)
  print('用户：',user)
  wc(text)
