# 1. 与 google drive 链接，导入文件

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

dir_path = '/content/drive/MyDrive/2023NLPCourse/Assignment1/Part_B'
file_name = 'Data/data.csv'

%cd /content/drive/MyDrive/2023NLPCourse/Assignment1/Part_B

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/2023NLPCourse/Assignment1/Part_B


# 2. 使用pandas读取评论数据文件 data.csv

In [2]:
import pandas as pd

# Load the review CSV and discard every row that has a missing value in any column.
df = pd.read_csv(file_name).dropna()

# 3. Data cleaning: split each review into sentences and run clean-text on every sentence (emoji removal enabled)

In [3]:
!pip install clean-text
import nltk
nltk.download('punkt')

from nltk.tokenize import sent_tokenize
from cleantext import clean

new_df = pd.DataFrame()

review = []
review_id = []

for index in df.index:
    sentences = sent_tokenize(df.loc[index, 'review'])
    for sentence in sentences:
        review.append(clean(sentence, no_emoji=True))
        review_id.append(index)

new_df['review_id'] = review_id
new_df['review'] = review

# 4. 定义文本预处理函数

In [4]:
import string
import emoji
import re
import nltk
# Fetch the NLTK resources used below: WordNet (antonym lookup in
# AntonymReplacer) and the stop-word lists (used by preprocess()).
nltk.download('wordnet')
nltk.download('stopwords')
from textblob import TextBlob
from nltk.corpus import wordnet
from nltk.metrics import edit_distance
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import spacy
# spaCy English pipeline loaded with the 'parser' and 'ner' components disabled;
# lemmatization() reads token.pos_ and token.lemma_ from it.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# NLTK English stop-word collection, consulted by preprocess().
stopwords = nltk.corpus.stopwords.words('english')

def preprocess(text):
    """Turn one raw sentence into a list of lemmatized tokens.

    The steps run in this order: TextBlob word extraction, digit removal,
    lower-casing, punctuation removal, NLTK word tokenization, alphabetic
    token filter, "not X" -> antonym replacement, stop-word removal, and
    lemmatization restricted to NOUN / ADJ / VERB / ADV tokens.

    Args:
        text: The sentence to process, as a string.

    Returns:
        The list of lemmas produced by ``lemmatization``.
    """
    # 1. Let TextBlob extract the words, then re-join them with single spaces.
    text = ' '.join(TextBlob(text).words)

    # 2. Remove ASCII digits.
    text = re.sub(r'[0-9]', '', text)

    # 3. Lower-case the text.
    text = text.lower()

    # 4. Remove every character in string.punctuation in a single pass
    #    (equivalent to replacing each punctuation character one by one).
    text = text.translate(str.maketrans('', '', string.punctuation))

    # 5. Tokenize.
    tokens = word_tokenize(text)

    # 6-7. Keep purely alphabetic tokens. str.isalpha() is False for the empty
    #      string, so blank tokens are dropped by the same test. Note that
    #      isalpha() accepts any Unicode letter, not only English ones.
    tokens = [token for token in tokens if token.isalpha()]

    # 8. Replace "not <word>" with an unambiguous antonym of <word> where one
    #    exists, so the negation is carried by a single token.
    tokens = AntonymReplacer().replace_negations(tokens)

    # 9. Remove stop words; a set makes each membership test O(1) instead of
    #    scanning the stop-word list once per token.
    stop_set = set(stopwords)
    tokens = [token for token in tokens if token not in stop_set]

    # 10. Lemmatize, keeping only content-word parts of speech.
    return lemmatization(tokens, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])


def lemmatization(sent, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize a token list, keeping only tokens with an allowed POS tag.

    Args:
        sent: Iterable of token strings; they are joined with spaces and run
            through the module-level spaCy pipeline ``nlp``.
        allowed_postags: Container of coarse POS tags (compared against
            ``token.pos_``) to keep. The default is an immutable tuple so it
            cannot be mutated across calls; any container supporting ``in``
            (such as a list) is still accepted.

    Returns:
        A list with ``token.lemma_`` for every token whose ``pos_`` is allowed.
    """
    doc = nlp(" ".join(sent))
    return [token.lemma_ for token in doc if token.pos_ in allowed_postags]

class AntonymReplacer:
    """Rewrite "not <word>" pairs as a single antonym using WordNet."""

    def replace(self, word, pos=None):
        """Return the unique WordNet antonym of ``word``, or None.

        All antonyms across every synset of ``word`` (optionally restricted to
        part of speech ``pos``) are collected; a result is returned only when
        exactly one distinct antonym name was found.
        """
        candidates = {
            opposite.name()
            for synset in wordnet.synsets(word, pos=pos)
            for lemma in synset.lemmas()
            for opposite in lemma.antonyms()
        }
        return candidates.pop() if len(candidates) == 1 else None

    def replace_negations(self, sent):
        """Return a copy of ``sent`` with "not X" collapsed into X's antonym.

        A 'not' is only rewritten when a following word exists and
        ``replace`` yields an antonym for it; otherwise the 'not' is kept and
        the following word is processed normally.
        """
        result = []
        final_pos = len(sent) - 1
        consumed_next = False

        for pos, token in enumerate(sent):
            # The previous 'not' already absorbed this word.
            if consumed_next:
                consumed_next = False
                continue

            if token == 'not' and pos < final_pos:
                opposite = self.replace(sent[pos + 1])
                if opposite:
                    result.append(opposite)
                    consumed_next = True
                    continue

            result.append(token)

        return result

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 5. 对输入数据进行预处理

In [None]:
# Run preprocess() over every sentence, logging a 1-based row number as progress.
cleaned_text = []
for row_number, sentence in enumerate(new_df["review"], start=1):
    print(f'Preprocessing row {row_number}')
    cleaned_text.append(preprocess(str(sentence)))

# Attach the token lists, drop rows with missing values, and persist the
# result (including the DataFrame index) for the later modelling cells.
new_df["Clean"] = cleaned_text
new_df = new_df.dropna()
new_df.to_excel("Data/preprocessed_data.xlsx")

print('Preprocessing done.')

# 6. LDA模型训练

In [None]:
# extracted_data.xlsx is a random sample prepared for BERTopic. Per the original
# author's note, BERTopic's input limit is 100K rows and preprocessing yields
# about 120K rows, so only ~20K rows are dropped by sampling.
# The preprocessed file is read once and reused; the sample size is capped at
# the number of available rows so sample() cannot raise on a smaller dataset.
preprocessed = pd.read_excel("Data/preprocessed_data.xlsx")
bertopic_rows = min(100000, len(preprocessed))
preprocessed.sample(n=bertopic_rows).to_excel("Data/extracted_data.xlsx", index=False)

# Despite its name, `extracted_data` is the FULL preprocessed set (not the
# BERTopic sample); the LDA model below is trained on all rows.
extracted_data = preprocessed


import gensim.corpora as corpora
import gensim.models as models

from ast import literal_eval
from pprint import pprint

# The 'Clean' column holds token lists serialized as text in the Excel file;
# parse each cell back into a Python list.
data_words = [literal_eval(cell) for cell in extracted_data['Clean']]

# Build the dictionary (token <-> id mapping).
id2word = corpora.Dictionary(data_words)
# Build the bag-of-words corpus.
texts = data_words
corpus = [id2word.doc2bow(text) for text in texts]

# Number of LDA topics, and number of words reported per topic.
num_topics = 36
num_words = 10

# Train the LDA topic model from the corpus, dictionary and topic count.
lda_model1 = models.LdaMulticore(corpus=corpus,
                                 id2word=id2word,
                                 num_topics=num_topics)

# 7. 保存模型

In [None]:
import pickle

# Export the topics: keep the second element of each print_topics() entry
# (presumably the formatted word/weight string - confirm against gensim docs).
topics = lda_model1.print_topics(num_topics=num_topics, num_words=num_words)
topic_list = [topic[1] for topic in topics]

df_topics = pd.DataFrame(topic_list, columns=['topics'])
df_topics.to_excel("Data/LDA_topics.xlsx")

# Persist the model, dictionary and corpus. `with` guarantees each file is
# flushed and closed; the bare open() calls used before left handles open.
with open('Models/LDA_model.model', 'wb') as model_file:    # save lda model
    pickle.dump(lda_model1, model_file)
with open('Models/LDA_model.dict', 'wb') as dict_file:      # save dictionary
    pickle.dump(id2word, dict_file)
with open('Models/LDA_model.corpus', 'wb') as corpus_file:  # save corpus
    pickle.dump(corpus, corpus_file)

# 8.LDA主题模型结果的可视化结果展示

In [None]:
import pickle

loaded_lda_model = pickle.load(open('Models/LDA_model.model', 'rb'))  # load trained model
loaded_dictionary = pickle.load(open('Models/LDA_model.dict', 'rb'))  # load dictionary 
loaded_corpus = pickle.load(open('Models/LDA_model.corpus', 'rb'))    # load corpus

!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

pyLDAvis.enable_notebook()

In [6]:
# Build the pyLDAvis visualization data from the reloaded model, corpus and dictionary.
vis = gensimvis.prepare(loaded_lda_model, loaded_corpus, loaded_dictionary)
# A bare expression on the last line of a cell makes the notebook display it.
vis

  and should_run_async(code)
