In [1]:
import ast
import os
import sys

import jieba
import numpy as np
import pandas as pd

import config  # project-local configuration module

# Silence warnings. Note: category=Warning is the base class of all warning
# categories, so this hides every warning raised through the warnings module
# (including deprecation notices from pandas), not only VisibleDeprecationWarning.
import warnings
warnings.filterwarnings("ignore", category=Warning)

### 导入数据

In [2]:
# Read AFAfter.csv into a DataFrame. The file name is concatenated directly
# onto config.AF_Data_path; no path separator is inserted here.
data = pd.read_csv(config.AF_Data_path + "AFAfter.csv")

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17651 entries, 0 to 17650
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Education   17651 non-null  int64  
 1   Id          17651 non-null  object 
 2   age         17651 non-null  int64  
 3   gender      17651 non-null  int64  
 4   query       17651 non-null  object 
 5   SpaceNum    17651 non-null  float64
 6   SpaceRATIO  17651 non-null  float64
 7   LinkNum     17651 non-null  float64
 8   LinkRATIO   17651 non-null  float64
 9   TextSum     17651 non-null  float64
 10  TextMax     17651 non-null  float64
 11  TextMin     17651 non-null  float64
 12  TextMedian  17651 non-null  float64
 13  TextMean    17651 non-null  float64
 14  SearchNum   17651 non-null  float64
 15  HighWords   17651 non-null  object 
dtypes: float64(10), int64(3), object(3)
memory usage: 2.2+ MB


### 导入自定义词库

In [4]:
# Build the stop-word list
def stopwordslist(path=None, encoding='utf-8'):
    """Collect the distinct stop words from every file directly inside a directory.

    Each file is read as text with one stop word per line; surrounding
    whitespace is stripped and blank lines are ignored.

    Args:
        path: Directory containing the stop-word files. Defaults to
            ``config.stopword_path``.
        encoding: Text encoding used to read the files.

    Returns:
        Sorted list of the unique, non-empty stop words.

    Raises:
        FileNotFoundError: If the directory does not exist.
    """
    if path is None:
        path = config.stopword_path
    stopwords = set()
    # Only files at the top level of the directory are read. The previous
    # os.walk loop kept the file list of the *last* directory visited, which
    # pointed at the wrong files as soon as a sub-directory existed and left
    # the list undefined when the directory was missing.
    for filename in sorted(os.listdir(path)):
        full_path = os.path.join(path, filename)
        if not os.path.isfile(full_path):
            continue
        # Read-only mode inside a context manager: the file is never written
        # to and its handle is closed as soon as it has been read.
        with open(full_path, 'r', encoding=encoding) as handle:
            for line in handle:
                stopword = line.strip()
                if stopword:
                    stopwords.add(stopword)
    return sorted(stopwords)

In [5]:
# Build the list of custom dictionary entries
def add_Words(path=None, encoding=None):
    """Collect the distinct entries from every ``.txt`` file directly inside a directory.

    Each file is read as text with one entry per line; surrounding whitespace
    is stripped and blank lines are ignored.

    Args:
        path: Directory containing the custom dictionary files. Defaults to
            ``config.addword_path``.
        encoding: Text encoding used to read the files. ``None`` keeps the
            platform default, which is what this function has always used;
            pass an explicit value (e.g. ``"utf-8"``) for portable results.

    Returns:
        Sorted list of the unique, non-empty entries.

    Raises:
        FileNotFoundError: If the directory does not exist.
    """
    if path is None:
        path = config.addword_path
    entries = set()
    # Only top-level files are considered. The previous os.walk loop
    # overwrote the file list on every directory visited, so sub-directories
    # replaced the intended files, and a missing directory left it undefined.
    for filename in sorted(os.listdir(path)):
        full_path = os.path.join(path, filename)
        # Require a real ".txt" suffix; a bare substring test would also pick
        # up names such as "txt_notes.csv".
        if not filename.lower().endswith('.txt') or not os.path.isfile(full_path):
            continue
        # Read-only mode inside a context manager so the handle is closed.
        with open(full_path, 'r', encoding=encoding) as handle:
            for line in handle:
                entry = line.strip()
                if entry:
                    entries.add(entry)
    return sorted(entries)

In [6]:
addwordslist = add_Words() # custom dictionary entries
# The name `stopwordslist` is rebound from the loader function to the list it
# returns, and get_tokens reads that global name. The guard makes this cell
# safe to re-run: once the name already holds the list, calling it again would
# fail with "'list' object is not callable".
if callable(stopwordslist):
    stopwordslist = stopwordslist() # stop-word list

In [7]:
# jieba.add_word(word) # add a single word
# Register the custom entries with jieba. A list of strings (not a file path)
# is passed here.
# NOTE(review): load_userdict is documented for a path or file-like object;
# confirm that each list item is consumed as one dictionary line.
jieba.load_userdict(addwordslist) # load the user-defined dictionary

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\11147\AppData\Local\Temp\jieba.cache
Loading model cost 0.552 seconds.
Prefix dict has been built successfully.


In [8]:
# Stop-word removal
def Deactivate_Words(words,stopwordslist):
    """Remove stop words from a tokenised query while keeping word order.

    Queries of at most two words are returned untouched, stop words included.
    Longer queries are de-duplicated (first occurrence wins) and stripped of
    every word found in ``stopwordslist``.

    The previous implementation used a set difference, which returned the
    surviving words in arbitrary order; callers that join adjacent words into
    bigrams therefore produced meaningless, run-to-run varying pairs. The
    result here contains exactly the same words, in their original order.

    Args:
        words: List of word strings for one query.
        stopwordslist: Iterable of stop words.

    Returns:
        ``words`` itself when it has two or fewer items, otherwise a new list
        of the unique non-stop words in order of first appearance.
    """
    if len(words) <= 2:
        return words
    # Build a set once for O(1) membership tests; reuse it if one was passed.
    if isinstance(stopwordslist, (set, frozenset)):
        stop_set = stopwordslist
    else:
        stop_set = set(stopwordslist)
    # dict.fromkeys de-duplicates while preserving insertion order.
    return [word for word in dict.fromkeys(words) if word not in stop_set]

### 分词

#### jieba

In [9]:
# Tokenisation
def get_tokens(wordtxt):
    """Turn a tab-separated string of queries into unigram and bigram tokens.

    Every tab-separated piece is cut with jieba and passed through
    Deactivate_Words together with the module-level ``stopwordslist``. For
    each piece the remaining words are emitted first one at a time and then
    as adjacent pairs joined with "_*_".

    Args:
        wordtxt: String holding one or more queries separated by tab characters.

    Returns:
        Flat list of token strings for all pieces, in processing order.
    """
    collected = []
    for segment in wordtxt.split('\t'):
        kept = Deactivate_Words(list(jieba.cut(segment)), stopwordslist)
        # Window width 1 yields the single words, width 2 the adjacent pairs.
        for width in (1, 2):
            window_count = len(kept) - width + 1
            collected.extend(
                "_*_".join(kept[start:start + width])
                for start in range(window_count)
            )
    return collected

In [14]:
# One row per query, each cell holding that query's list of tokens.
# The frame is built in a single step: DataFrame.append copied the whole frame
# on every iteration (quadratic in the number of rows) and was removed in
# pandas 2.0.
PJ_df = pd.DataFrame(
    {"token": [get_tokens(query_text) for query_text in data["query"]]},
    columns=["token"],
)

In [20]:
# Print the row count and dtype of the jieba token frame.
PJ_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17651 entries, 0 to 17650
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   token   17651 non-null  object
dtypes: object(1)
memory usage: 138.0+ KB


#### FoolNLTK

In [18]:
def _parse_high_words(text):
    """Turn the stored string form of a word list back into a list of words.

    The value is parsed as a Python literal, which copes with an empty list
    and with items quoted using double quotes or containing escapes. If the
    text is not a valid list/tuple literal, fall back to the original slicing
    (strip the leading "['" and trailing "']", split on "', '").
    """
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]
    return text[2:-2].split("', '")


# One row per record, each cell holding the word list parsed from HighWords.
# Built in a single step instead of DataFrame.append in a loop, which was
# quadratic and was removed in pandas 2.0.
FL_df = pd.DataFrame(
    {"token": [_parse_high_words(HighWords) for HighWords in data["HighWords"]]},
    columns=["token"],
)

In [19]:
# Print the row count and dtype of the HighWords token frame.
FL_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17651 entries, 0 to 17650
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   token   17651 non-null  object
dtypes: object(1)
memory usage: 138.0+ KB


### 分词结果持久化

In [27]:
# Write the jieba tokens to an .xlsx workbook. The `encoding` keyword is no
# longer passed: pandas did not use it for this writer, deprecated it in 1.5
# and removed it in 2.0, where supplying it raises TypeError.
PJ_df.to_excel(config.WordData_path + "Participle-Jieba.xlsx")

In [28]:
# Write the HighWords tokens to an .xlsx workbook. The `encoding` keyword is no
# longer passed: pandas did not use it for this writer, deprecated it in 1.5
# and removed it in 2.0, where supplying it raises TypeError.
FL_df.to_excel(config.WordData_path + "Participle-FoolNLTK-HW.xlsx")