In [1]:
import pandas as pd
import re
import jieba.posseg as pseg
from snownlp import SnowNLP
import os
# Load the comment dataset from CSV; later cells read its 'content' column.
data = pd.read_csv('./captainmarvel700.csv')

In [2]:
# Coerce every entry to str, then take SnowNLP's `.han` form
# (traditional Chinese converted to simplified Chinese).
data['content'] = [SnowNLP(str(raw_text)).han for raw_text in data['content']]

In [3]:
stop_words_path='./stopwords-chinese'


def _read_stopwords(filename):
    """
        Read one stopword list from `stop_words_path`.

        filename: name of a UTF-8 text file holding one entry per line
        return a list with each line's trailing whitespace removed
    """
    # The context manager closes the file handle as soon as reading is done;
    # a bare open() inside a comprehension leaves it open until garbage collection.
    with open(os.path.join(stop_words_path, filename), 'r', encoding='utf-8') as stopword_file:
        return [line.rstrip() for line in stopword_file]


stopwords1 = _read_stopwords('中文停用词表.txt')
stopwords2 = _read_stopwords('哈工大停用词表.txt')
stopwords3 = _read_stopwords('四川大学机器智能实验室停用词库.txt')
# Combined list (duplicates across the three sources are kept).
stopwords = stopwords1 + stopwords2 + stopwords3

In [48]:
def proc_text(raw_line):
    """
        Reduce a raw comment to a space-separated string of selected words.

        Steps:
          1. drop every character outside the range \\u4E00-\\u9FD5
          2. segment and POS-tag what is left with jieba (pseg.cut)
          3. keep a word only if it is not in `stopwords` and its tag is
             exactly 'a', 'ad' or 'n'
             (NOTE(review): the original comment describes these as
             adjectives, adverbs and nouns -- confirm 'ad' is the intended
             tag for adverbs in jieba's tag set)

        return the kept words joined by single spaces ('' if none remain)
    """
    han_text = re.sub('[^\u4E00-\u9FD5]+', '', raw_line)

    wanted_tags = ('a', 'ad', 'n')
    kept_words = [
        token
        for token, tag in pseg.cut(han_text)
        if token not in stopwords and tag in wanted_tags
    ]
    return ' '.join(kept_words)

In [49]:
# Run proc_text over every comment and store the result in a new 'words' column.
data['words'] = data['content'].map(proc_text)

In [50]:
# Preview the first 20 entries of the 'content' column.
data['content'].head(20)

0                                                 马克
1                                              女Boss
2                                              一星预订。
3                           经期队长 从名字就能知道 1:是女的 2:惹不起
4             看电影呈现的效果吧，目前漫威的表现都还好，这么关键的英雄，不会砸自己招牌的。
5                                                李佩斯
6                                               顺序21
7                                         穿这么严实干嘛...
8                                              快上映啊！
9                                      三八妇女节上映吗，有点含意
10                                             支持布丽！
11                                   会是深入人心的角色的，相信影后
12                            欢迎裘德洛来到漫威宇宙，欢迎科尔森特工回归。
13                                          漫威宇宙铺的好大
14    裘花终于也来到了漫威宇宙，这是怎样的盛世啊，想看两代福华正面飙戏，想看糖球在mcu铜矿嘤嘤嘤
15                                         看漫威队长手撕灭霸
16                                     封面很令人深思啊 寇森命大
17          漫威家的粉丝认证！以后每部mcu都会去影院支持！拍个一百部吧，活该你们会赚钱:）
18                          理清了整个宇宙，虽然知道剧情走向但是

In [51]:
# Preview the first entries of the 'words' column produced by proc_text.
data['words'].head()

0               
1               
2              星
3       经期 队长 名字
4    电影 效果 漫威 关键
Name: words, dtype: object

In [52]:
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the space-joined word strings and keep the
# resulting matrix (one row per entry of data['words']).
# NOTE(review): the vectorizer's default token pattern is left in place;
# presumably it splits on the spaces inserted by proc_text -- confirm how it
# treats single-character tokens.
vectorizer = TfidfVectorizer(min_df=1)
tfidf=vectorizer.fit_transform(data['words'])

In [53]:
# Vocabulary in the column order of the TF-IDF matrix.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = [str(term) for term in vectorizer.get_feature_names_out()]
else:
    words = list(vectorizer.get_feature_names())

# Every word longer than one character starts with a score of 0.
tfidf_dict = {key: 0 for key in words if len(key) > 1}

# Walk only the stored entries of the sparse matrix instead of probing every
# (document, word) cell with tfidf[i, j]; the dense double loop performs
# len(data) * len(words) sparse lookups and is needlessly slow.
tfidf_csr = tfidf.tocsr()
row_starts = tfidf_csr.indptr
col_ids = tfidf_csr.indices
scores = tfidf_csr.data

# NOTE(review): rows are visited in document order and each hit overwrites
# the previous one, so a word ends up with the score from the LAST document
# that contains it (not the max / mean / sum). This matches the earlier
# behaviour but is probably not the intended aggregate -- confirm.
for i in range(tfidf_csr.shape[0]):
    for k in range(row_starts[i], row_starts[i + 1]):
        word = words[col_ids[k]]
        # `word in tfidf_dict` enforces the len > 1 filter from the
        # initialisation, so filtered-out words are not re-added here.
        if scores[k] > 1e-5 and word in tfidf_dict:
            tfidf_dict[word] = scores[k]

In [54]:
# One row per (word, score) pair collected in tfidf_dict.
tfidf_df = pd.DataFrame(
    [(word, score) for word, score in tfidf_dict.items()],
    columns=['word', 'TF-IDF'],
)

In [59]:
# Show the 20 rows with the largest 'TF-IDF' value, highest first.
tfidf_df.sort_values(by='TF-IDF',ascending=False).head(20)

Unnamed: 0,word,TF-IDF
601,恶臭,1.0
917,理由,1.0
720,日子,1.0
846,演员表,1.0
689,收藏夹,1.0
850,漫威,1.0
130,俱乐部,1.0
154,全村人,1.0
169,兴趣,1.0
879,爱女,1.0
