In [1]:
import os
import re
import jieba
import pandas as pd
import numpy as np
from gensim import corpora, models
import pprint
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
import seaborn as sns
import collections  
import itertools
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# Shared WordNet lemmatizer instance; get_cut_content() uses it for both of its lemmatization passes.
lemmatizer = WordNetLemmatizer()

In [2]:
# Load the project spreadsheet into a DataFrame and preview its first rows.
PROJECT_DATA_FILE = '850-project_data.xlsx'
data = pd.read_excel(PROJECT_DATA_FILE)
data.head()

Unnamed: 0,project_id,abstract,project_description
0,2183,Crows have been observed to gather and vocaliz...,Wild American crows gather around their dead t...
1,2622,I will be searching caves across northeastern ...,"Widespread Bat White-Nose Syndrome Fungus, Nor..."
2,2108,"On May 16, Dicty and HL60 cells will race for ...",When a healthy person suffers from a cut or a ...
3,1003,What makes some people react to the sight of t...,What do two men kissing and a bucket of maggot...
4,3637,This study will use mobile technology to inves...,We've analyzed some data regarding eating beha...


# 预处理

In [3]:
# Map NLTK POS tags onto WordNet POS constants.
def get_wordnet_pos(treebank_tag):
    """Translate an NLTK POS tag (as returned by pos_tag) into a WordNet POS.

    Tags starting with 'J', 'V', 'N' or 'R' map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively. Every other tag falls back to
    wordnet.NOUN.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
# Text-cleaning function applied to every document.
def get_cut_content(x):
    """Clean an English text and return its lemmatized, stopword-free tokens.

    Steps: replace every run of non-letter characters with a single space,
    lowercase, tokenize with NLTK, drop tokens present in the notebook-level
    ``stopwords`` list, lemmatize each token using its POS tag, then lemmatize
    once more with the lemmatizer's default POS.

    Note: ``stopwords`` and ``lemmatizer`` are notebook globals. ``stopwords``
    is created in a later cell, so that cell must have run before this
    function is called.

    Args:
        x: Raw text (str).

    Returns:
        list[str]: The processed tokens; empty if nothing survives cleaning.
    """
    # A single pass suffices: digits and punctuation are non-letters too, so
    # separate digit / punctuation substitutions followed by space-collapsing
    # yield exactly the same text as replacing each non-letter run by one space.
    x = re.sub(r"[^a-zA-Z]+", ' ', x)
    x = x.lower()
    words = nltk.word_tokenize(x)  # tokenize
    # Set lookup is O(1) per token; testing against the raw list costs
    # O(len(stopwords)) for every token of every document.
    stopword_set = set(stopwords)
    words = [word for word in words if word not in stopword_set]
    tagged_tokens = pos_tag(words)  # POS tags drive the first lemmatization pass
    words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged_tokens]
    # Second pass with the lemmatizer's default POS.
    words = [lemmatizer.lemmatize(word) for word in words]
    return words

In [4]:
# Stopword lists
def _read_stopword_file(path):
    """Return the lines of a UTF-8 text file with trailing whitespace stripped."""
    # The context manager guarantees the file handle is closed after reading.
    with open(path, encoding='utf-8') as handle:
        return [line.rstrip() for line in handle]

hit_stopwords = _read_stopword_file('stopwords.txt')  # English stopwords
mystopwords = _read_stopword_file('mystopwords.txt')  # custom stopwords
stopwords = hit_stopwords + mystopwords

In [5]:
# Target text for LDA modelling. Missing descriptions are replaced with '' first:
# astype(str) alone turns NaN into the literal string 'nan', which would pass
# through preprocessing as a bogus token instead of yielding an empty token list.
text = data['project_description'].fillna('').astype(str)
# Preprocess every document into a list of tokens.
text_cut = text.map(get_cut_content)

In [6]:
# Drop documents whose token list is empty after preprocessing, keeping
# text, text_cut and data row-aligned and renumbered from 0.
kept_rows = text_cut.loc[text_cut.map(len) > 0].index
# reset_index(drop=True) returns a new object with a fresh 0..n-1 index, so the
# column assignment below does not target a slice of the original frame
# (which can raise SettingWithCopyWarning when rows were dropped).
text_cut = text_cut.loc[kept_rows].reset_index(drop=True)
text = text.loc[kept_rows].reset_index(drop=True)
data = data.loc[kept_rows].reset_index(drop=True)
data['text_cut'] = text_cut

In [7]:
# Display the token lists stored in the 'text_cut' column.
data['text_cut']

0      [wild, american, crow, gather, dead, learn, da...
1      [widespread, bat, white, nose, syndrome, fungu...
2      [healthy, person, suffers, cut, burn, immune, ...
3      [kiss, bucket, maggot, common, heterosexual, i...
4      [analyze, data, eat, behavors, mother, project...
                             ...                        
845    [joy, start, family, pregnant, healthy, baby, ...
846    [hawai, coral, reef, fish, specie, saddle, wra...
847    [albatross, scour, mile, ocean, forage, food, ...
848    [fresco, paint, medium, trace, ancient, civili...
849    [zika, virus, zikv, transmit, aedes, mosquito,...
Name: text_cut, Length: 850, dtype: object

In [8]:
# == Word-frequency statistics
all_words = list(itertools.chain.from_iterable(text_cut))  # every token from every document
word_counts = collections.Counter(all_words)  # token -> number of occurrences
word_counts_top = word_counts.most_common()  # all (word, count) pairs, most frequent first
# Save the frequency table to Excel without the row index.
freq_table = pd.DataFrame(word_counts_top, columns=['word', 'count'])
freq_table.to_excel('词频统计结果.xlsx', index=False)