In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter("ignore")
#nltk
import nltk
#stopwords
from nltk.corpus import stopwords
#tokenizing
from nltk import word_tokenize,sent_tokenize
#Beautiful Soup
from bs4 import BeautifulSoup 
# regex
import re
#word2vec
from gensim.models import Word2Vec
#machine learning
from sklearn.feature_extraction.text import CountVectorizer

### 读入评论文本

In [2]:
# Load the raw review table (path is relative to the working directory).
data = pd.read_csv('employee_reviews.csv')
# Keep only the columns used below. The explicit copy makes `reviews` an
# independent frame, so later column assignments on it are unambiguous and do
# not raise pandas' SettingWithCopyWarning about writing into a slice of `data`.
reviews = data[['company', 'pros', 'cons']].copy()

### 评论文本预处理

In [3]:
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading the corpus once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE


def clean_reviews(review):
    """Normalise one raw review text.

    Processing steps, in order: strip HTML markup, replace every character
    that is not an ASCII letter with a space, lower-case, split on whitespace,
    drop English stop words, and join the remaining words with single spaces.

    Parameters
    ----------
    review : str or missing value
        Raw review text. ``None`` / NaN cells are treated as empty text; any
        other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned review (possibly the empty string).
    """
    # Missing cells would otherwise make BeautifulSoup fail on a non-string.
    if not isinstance(review, str):
        if pd.isna(review):
            return ""
        review = str(review)
    # Removing html tags. Naming the parser explicitly keeps the result
    # independent of which optional parsers happen to be installed.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # Retaining only alphabets.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # Lower-case, split, and remove stop words (set is cached across calls,
    # since this function is applied once per row).
    stop_words = _english_stop_words()
    word_tokens = [w for w in review_text.lower().split() if w not in stop_words]
    # From words back to a single review string.
    return " ".join(word_tokens)

In [4]:
# Replace each free-text column of `reviews` with its cleaned version
# (one clean_reviews call per cell).
for text_column in ('pros', 'cons'):
    reviews[text_column] = reviews[text_column].apply(clean_reviews)

### 期望交叉熵

In [5]:
# Build the class-label -> index dictionary
def get_class_dict(doc_class_list):
    """Map every distinct class label to its position in sorted order.

    Parameters
    ----------
    doc_class_list : iterable
        One class label per document; duplicates are expected.

    Returns
    -------
    dict
        ``{label: index}`` with indices ``0 .. n_classes - 1`` assigned in
        ascending label order.
    """
    unique_labels = set(doc_class_list)
    return {label: position for position, label in enumerate(sorted(unique_labels))}

# Build the term -> index dictionary
def get_term_dict(doc_terms_list):
    """Map every distinct term found in the documents to its sorted position.

    Parameters
    ----------
    doc_terms_list : iterable of iterables
        One collection of terms per document.

    Returns
    -------
    dict
        ``{term: index}`` with indices ``0 .. n_terms - 1`` assigned in
        ascending term order.
    """
    vocabulary = set()
    for doc_terms in doc_terms_list:
        vocabulary.update(doc_terms)
    return {term: position for position, term in enumerate(sorted(vocabulary))}

def stats_class_df(doc_class_list, class_dict):
    """Count how many documents carry each class label.

    Parameters
    ----------
    doc_class_list : iterable
        One class label per document.
    class_dict : dict
        ``{label: index}`` mapping; every label in ``doc_class_list`` must be
        a key.

    Returns
    -------
    list of int
        ``counts[class_dict[label]]`` is the number of documents with that
        label; the list has ``len(class_dict)`` entries.
    """
    counts = [0 for _ in class_dict]
    for label in doc_class_list:
        slot = class_dict[label]
        counts[slot] = counts[slot] + 1
    return counts

def stats_term_class_df(doc_terms_list, doc_class_list, term_dict, class_dict):
    """Build the term-by-class document-frequency matrix.

    Entry ``[t, c]`` is the number of documents of class ``c`` that contain
    term ``t`` at least once (repeated occurrences inside one document count
    once).

    Documents and labels are paired strictly by position. Iterating instead
    of indexing with ``0 .. n-1`` matters when ``doc_class_list`` is a pandas
    Series: integer indexing on a Series is a label lookup and fails (or pairs
    the wrong rows) whenever the index is not the default ``0 .. n-1``.

    Parameters
    ----------
    doc_terms_list : sequence of iterables
        Terms of each document.
    doc_class_list : sequence
        Class label of each document, same length and order as
        ``doc_terms_list``.
    term_dict : dict
        ``{term: row index}``.
    class_dict : dict
        ``{label: column index}``.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(len(term_dict), len(class_dict))``.

    Raises
    ------
    ValueError
        If the two document sequences differ in length.
    KeyError
        If a term or label is missing from its dictionary.
    """
    if len(doc_terms_list) != len(doc_class_list):
        raise ValueError(
            "doc_terms_list and doc_class_list must have the same length "
            "(got %d and %d)" % (len(doc_terms_list), len(doc_class_list))
        )
    term_class_df_mat = np.zeros((len(term_dict), len(class_dict)), np.float32)
    for doc_terms, doc_class in zip(doc_terms_list, doc_class_list):
        class_index = class_dict[doc_class]
        # set(): document frequency counts each term once per document.
        for term in set(doc_terms):
            term_class_df_mat[term_dict[term], class_index] += 1
    return term_class_df_mat

# Expected cross entropy (ECE) feature ranking
def calculateKL(class_df_list, term_set, term_class_df_mat, class_index):
    """Rank all terms for one class by a weighted expected-cross-entropy score.

    With ``A[t, c]`` the number of class-``c`` documents containing term ``t``,
    ``df(t)`` the row sum of ``A``, ``N = sum(class_df_list)`` and
    ``i = class_index``, the score of term ``t`` is::

        P(t)   = df(t) / N
        P(c|t) = (A[t, c] + 1) / (df(t) + n_classes)        # add-one smoothing
        P(c)   = class_df_list[c] / N
        ECE(t) = P(t) * sum_c P(c|t) * log(P(c|t) / P(c))
        score  = ECE(t) * (A[t, i] / df(t)) * (A[t, i] / sum_t' A[t', i])

    Parameters
    ----------
    class_df_list : sequence of numbers
        Number of documents per class, ordered by class index.
    term_set : sequence
        Terms ordered so that ``term_set[r]`` corresponds to row ``r`` of
        ``term_class_df_mat``.
    term_class_df_mat : array-like, shape (n_terms, n_classes)
        Term-by-class document-frequency matrix.
    class_index : int
        Column of the class the ranking is computed for.

    Returns
    -------
    list
        All terms of ``term_set``, highest score first. The relative order of
        terms with equal scores is unspecified. An empty vocabulary yields
        ``[]``.
    """
    # Work in float64 throughout so every ratio is computed in one precision
    # (the counts themselves are small integers and convert exactly).
    df_mat = np.asarray(term_class_df_mat, dtype=np.float64)
    if df_mat.shape[0] == 0:
        return []
    class_df = np.asarray(class_df_list, dtype=np.float64)
    n_docs = class_df.sum()            # total number of documents
    n_classes = len(class_df_list)

    term_df = df_mat.sum(axis=1)       # documents containing each term

    p_t = term_df / n_docs
    p_c_t = (df_mat + 1) / (term_df[:, np.newaxis] + n_classes)
    p_c = class_df / n_docs
    ece = p_t * np.sum(p_c_t * np.log(p_c_t / p_c), axis=1)

    # Only the target class column is needed for the two weighting factors.
    target_df = df_mat[:, class_index]
    # Share of a term's documents that belong to the target class. A term with
    # no documents gets weight 0 instead of 0/0 = NaN, which the descending
    # argsort below would otherwise place at the very top of the ranking.
    concentration = np.divide(
        target_df, term_df, out=np.zeros_like(target_df), where=term_df > 0
    )
    # NOTE(review): the denominator is the column sum of the df matrix (total
    # term/document incidences in the class), not the class document count;
    # kept as in the original - confirm this is the intended normalisation.
    class_total = target_df.sum()
    if class_total > 0:
        dispersion = target_df / class_total
    else:
        dispersion = np.zeros_like(target_df)

    scores = concentration * dispersion * ece
    ranking = np.argsort(scores)[::-1]
    return [term_set[index] for index in ranking]

def feature_selection(doc_str_list, doc_class_list, company):
    """Rank the vocabulary of a document collection for one target class.

    Each document string is tokenized, term/class document frequencies are
    collected, and the terms are ordered by the score computed in
    ``calculateKL`` for the class ``company``.

    Parameters
    ----------
    doc_str_list : iterable of str
        Document texts.
    doc_class_list : iterable
        Class label of each document, in the same order as ``doc_str_list``.
    company : hashable
        Class label to rank terms for; must occur in ``doc_class_list``.

    Returns
    -------
    list of str
        Every distinct term, most characteristic for ``company`` first.

    Raises
    ------
    KeyError
        If ``company`` does not occur in ``doc_class_list``.
    """
    # Materialise both inputs as plain lists so documents and labels are
    # paired strictly by position. A pandas Series indexed with 0..n-1 is a
    # label lookup and breaks when its index is not the default range.
    doc_str_list = list(doc_str_list)
    doc_class_list = list(doc_class_list)

    # Only the vectorizer's tokenizer is used; no count matrix is built here.
    word_tokenizer = CountVectorizer(binary=True).build_tokenizer()
    doc_terms_list = [word_tokenizer(doc_str) for doc_str in doc_str_list]

    class_dict = get_class_dict(doc_class_list)
    if company not in class_dict:
        raise KeyError("class %r does not occur in doc_class_list" % (company,))
    class_index = class_dict[company]

    term_dict = get_term_dict(doc_terms_list)
    class_df_list = stats_class_df(doc_class_list, class_dict)  # documents per class
    term_class_df_mat = stats_term_class_df(
        doc_terms_list, doc_class_list, term_dict, class_dict
    )
    # Order terms by their matrix row index so term_set[r] labels row r.
    term_set = sorted(term_dict, key=term_dict.get)
    return calculateKL(class_df_list, term_set, term_class_df_mat, class_index)

### 提取各类特征词

In [6]:
TOP_N_TERMS = 20  # how many of the highest-ranked feature terms to keep
company = 'google'  # class (company) whose characteristic terms are extracted
doc_str_list = reviews['cons']
doc_class_list = reviews['company']
term_fs = feature_selection(doc_str_list, doc_class_list, company)
term_set_fs = term_fs[:TOP_N_TERMS]  # keep only the top-ranked terms

In [7]:
# Show the selected top-ranked feature terms as this cell's output.
term_set_fs

['google',
 'big',
 'company',
 'work',
 'large',
 'projects',
 'politics',
 'mountain',
 'bureaucracy',
 'hours',
 'many',
 'impact',
 'slow',
 'long',
 'view',
 'cons',
 'life',
 'people',
 'political',
 'time']