In [1]:
import re
import numpy as np
import pandas as pd
def DataframeCleanUp(dataframe):
    """Return the rows of ``dataframe`` whose ``content`` value is not missing.

    The result is an independent copy, so callers can add or overwrite
    columns on it without pandas emitting ``SettingWithCopyWarning`` and
    without touching the frame that was passed in. The original index
    labels are preserved (the index is not reset).

    Args:
        dataframe: Source DataFrame; must have a ``content`` column.

    Returns:
        A new DataFrame containing only the rows with non-null ``content``.
    """
    has_content = dataframe['content'].notna()
    # .copy() detaches the filtered rows from the source frame.
    return dataframe[has_content].copy()


def process_text(text):
    """Clean one piece of text.

    Every character matched by the regex ``\\d`` is removed, the remainder
    is lower-cased, and only characters for which ``str.isalnum`` is true
    are kept. Whitespace, punctuation and symbols are therefore dropped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    digit_free = re.sub(r'\d', '', text)
    lowered = digit_free.lower()
    return ''.join(filter(str.isalnum, lowered))


def SentenceCleanUp(dataframe, row_to_clean):
    """Add a ``clean_cmt`` column holding the cleaned text of one column.

    Each non-missing value of ``dataframe[row_to_clean]`` is passed through
    ``process_text``. Missing values are left missing in ``clean_cmt``
    instead of being dropped, so the new column always has one entry per
    row and stays aligned with the row it came from.

    Args:
        dataframe: Source DataFrame. It is modified in place (the
            ``clean_cmt`` column is added or overwritten).
        row_to_clean: Name of the column whose text should be cleaned.

    Returns:
        The same ``dataframe`` object, with the ``clean_cmt`` column set.
    """
    # na_action="ignore" propagates missing values without calling
    # process_text on them; the result keeps the frame's own index.
    dataframe["clean_cmt"] = dataframe[row_to_clean].map(
        process_text, na_action="ignore"
    )
    return dataframe


def makeStopWords(wordfile):
    """Build the stop-word list from a word table.

    Rows whose ``Unnamed: 2`` column equals 1 are selected and their
    ``Unnamed: 0`` values are taken as stop words; a fixed set of extra
    custom stop words is appended after them.

    Args:
        wordfile: DataFrame with ``Unnamed: 0`` and ``Unnamed: 2`` columns.

    Returns:
        A list of stop words.
    """
    # Extra custom stop words that are always appended.
    custom_words = ['飞吻', '笑哭', '偷笑', '哭惹', '派对', '微笑', '害羞']
    is_flagged = wordfile['Unnamed: 2'] == 1
    flagged_words = wordfile.loc[is_flagged, 'Unnamed: 0']
    return [*flagged_words, *custom_words]

In [None]:
import time
from collections import Counter
import pandas as pd
import numpy as np
import jieba
import jieba.posseg as pseg
import jieba.analyse as analyse


def brandDictionary(path='tbl_brand.xlsx'):
    """Load the brand table and return a brand-name -> brand-code mapping.

    The workbook is read with the ``openpyxl`` engine and must contain
    ``name`` and ``code`` columns. If a name occurs on several rows, the
    code of its first row is used. Rows with a missing name are skipped.

    Args:
        path: Path of the Excel workbook to read.

    Returns:
        A dict mapping each brand name to its code, in first-appearance
        order.
    """
    code_form = pd.read_excel(path, engine='openpyxl')
    # One pass over the table: keep the first row per name rather than
    # searching the whole frame again for every single name.
    first_rows = code_form.dropna(subset=['name']).drop_duplicates(
        subset='name', keep='first'
    )
    return dict(zip(first_rows['name'], first_rows['code']))


# Brand-name -> code table, filled on the first lookup so the Excel
# workbook is parsed once instead of once per call.
_BRAND_CODE_CACHE = {}


def getBrandCode(brandName):
    """Return the brand code for ``brandName``.

    Args:
        brandName: Brand name to look up.

    Returns:
        The code stored for that brand in the table returned by
        ``brandDictionary``.

    Raises:
        KeyError: If ``brandName`` is not present in the brand table.
    """
    if not _BRAND_CODE_CACHE:
        _BRAND_CODE_CACHE.update(brandDictionary())
    return _BRAND_CODE_CACHE[brandName]

def init_list_of_objects(size):
    """Create the rows of an empty 2D list.

    Args:
        size: Number of rows to create.

    Returns:
        A list containing ``size`` empty lists, each a distinct object.
    """
    # A comprehension builds a new inner list on every iteration.
    return [[] for _ in range(size)]


def getComments(brand, dataframe):
    """Collect the cleaned comments of every row that mentions a brand.

    A row is selected when ``brand`` occurs as a substring of the string
    form of that row's ``brand`` value.

    Args:
        brand: Brand name to look for.
        dataframe: Source DataFrame with ``brand`` and ``clean_cmt`` columns.

    Returns:
        A list with the ``clean_cmt`` value of each selected row, in row
        order.
    """
    return [
        record['clean_cmt']
        for _, record in dataframe.iterrows()
        if brand in str(record['brand'])
    ]

def jiebaAnalyzeContents(contents, stopwordlist):
    """Extract keywords from each comment with jieba and drop stop words.

    For every comment, all whitespace is removed and up to 50 keywords are
    requested from ``jieba.analyse.extract_tags``, restricted to the
    part-of-speech tags ``'n'`` and ``'a'`` (intended to select nouns and
    adjectives). Keywords found in ``stopwordlist`` are then discarded.

    Args:
        contents: List of comment strings for one brand.
        stopwordlist: Iterable of words to exclude from the keywords.

    Returns:
        A list with one ``[position, "kw1,kw2,..."]`` entry per comment,
        where ``position`` is the comment's position in ``contents`` and the
        keywords are joined with commas (an empty string if none remain).
    """
    # Spelled out as a real tuple; the former ('na') was just the string
    # 'na' and only worked because jieba iterates it character by character.
    allow_pos = ('n', 'a')
    stopwords = set(stopwordlist)  # O(1) membership tests
    jieba_results = []
    for position, content in enumerate(contents):
        text = "".join(content.split())
        words = jieba.analyse.extract_tags(
            text, topK=50, withWeight=False, allowPOS=allow_pos
        )
        # Build a new list instead of removing from `words` while iterating
        # over it, which skipped the element after every removed stop word.
        kept = [w for w in words if w not in stopwords]
        jieba_results.append([position, ",".join(kept)])
    return jieba_results

def countWords(jieba_results):
    """Count how often each keyword occurs across all comments.

    Args:
        jieba_results: List of entries whose second element is a
            comma-separated keyword string, one entry per comment.

    Returns:
        A tuple ``(keywords, counts)`` of two parallel lists: the distinct
        keywords in first-appearance order, and their occurrence counts.
        Both lists are empty when there are no keywords.
    """
    counter = Counter(
        word
        for entry in jieba_results
        for word in entry[1].split(',')
        # Comments without keywords yield '' after split; that is not a word.
        if word
    )
    keywords = list(counter.keys())
    counts = list(counter.values())
    return keywords, counts

def MakeNewRow(brand, results):
    """Build the word-frequency rows for one brand.

    Args:
        brand: Brand name; converted to its code with ``getBrandCode``.
        results: Keyword extraction results for this brand's comments, in
            the form accepted by ``countWords``.

    Returns:
        A list of dicts with the keys ``brand_code``, ``word`` and
        ``count``, one per distinct keyword. The list is empty when there
        are no keywords.
    """
    words, counts = countWords(results)
    if not words:
        # Nothing to report, so no brand lookup is needed either.
        return []

    # Look the code up once; it is the same for every row of this brand.
    brand_code = getBrandCode(brand)
    return [
        {'brand_code': brand_code, 'word': word, 'count': count}
        for word, count in zip(words, counts)
    ]


if __name__ == "__main__":
    output_columns = ['brand_code', 'word', 'count']  # column order of the report

    dfred = pd.read_csv('red_top1000.csv')
    # Distinct brand values. Missing brands are skipped: NaN is a float and
    # cannot be used as the left operand of the substring test in getComments.
    brands_total = dfred['brand'].dropna().unique()

    dfred = DataframeCleanUp(dfred)  # drop rows without content
    dfred = SentenceCleanUp(dfred, 'content')  # add the cleaned-text column

    wordfile = pd.read_excel('red_words.xlsx', engine='openpyxl')
    stopwordlist = makeStopWords(wordfile)

    brand_frames = []  # one DataFrame per brand that produced rows
    total_df = pd.DataFrame(columns=output_columns)
    total = len(brands_total)

    for cnt, brand in enumerate(brands_total, start=1):
        tic = time.perf_counter()  # start time for this brand
        contents = getComments(brand, dfred)  # comments mentioning the brand
        results = jiebaAnalyzeContents(contents, stopwordlist)

        data = MakeNewRow(brand, results)
        if data:
            brand_frames.append(pd.DataFrame(data, columns=output_columns))
            # DataFrame.append was removed in pandas 2.0; pd.concat keeps
            # the per-brand row index exactly as append did.
            total_df = pd.concat(brand_frames, sort=False)
        print(total_df)
        # Written after every brand so partial results survive an interruption.
        total_df.to_excel("小红书品牌1000词频统计.xlsx")
        toc = time.perf_counter()  # end time for this brand
        print(f"此品牌跑了 {toc - tic:0.4f} seconds")
        print('当前进度：', cnt/total)


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.857 seconds.
Prefix dict has been built successfully.
