# In [None]:  (Jupyter notebook cell marker left over from the export)
import re   
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt  
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer   
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer  
from gensim import corpora  
from gensim.models import LdaModel   
from textblob import TextBlob  
from wordcloud import WordCloud  
from bs4 import BeautifulSoup   

#一、数据预处理 
#1.文本预处理函数  
_TEXT_TOOLS = {}  # lazily filled cache shared by every preprocess_text call


def _get_text_tools():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['lemmatizer']


def preprocess_text(text):
    """Clean one raw article and return it as a space-separated token string.

    The text is stripped of HTML markup, reduced to ASCII letters and
    whitespace, lower-cased, filtered against the NLTK English stop-word
    list and finally lemmatized word by word.

    Args:
        text: Raw article text. ``None`` and NaN (how pandas represents an
            empty CSV cell) are treated as an empty document; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned text as a single string; ``''`` for missing input.
    """
    # ``text != text`` is only true for NaN, so this catches missing cells
    # before BeautifulSoup is handed a float.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = BeautifulSoup(text, "html.parser").get_text()  # strip HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # drop everything but letters/whitespace
    text = text.lower()

    # The stop-word set and the lemmatizer are built once and reused instead
    # of being recreated for every document.
    stop_words, lemmatizer = _get_text_tools()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )
#2.应用CSV文本文件  
def load_data(file_path):
    """Read the CSV at ``file_path`` with pandas and return the resulting DataFrame."""
    frame = pd.read_csv(file_path)
    return frame
def preprocess_csv(file_path):
    """Load a CSV and add a cleaned copy of its ``text`` column.

    Args:
        file_path: Location of the CSV file; it must contain a ``text`` column.

    Returns:
        The loaded DataFrame with an extra ``processed_text`` column holding
        the result of ``preprocess_text`` for each row.
    """
    articles = load_data(file_path)
    cleaned_column = articles['text'].apply(preprocess_text)
    articles['processed_text'] = cleaned_column
    return articles

if __name__ == "__main__":  
    # Script entry point: clean the raw article CSV and save the result.
    # NOTE(review): both paths are hard-coded, machine-specific locations, and the
    # output file name differs from the processed CSV that the LDA step reads
    # further down -- confirm which file is meant to feed that step.
    file_path = r"C:\Users\丁月甜\Desktop\wikipedia_articles.csv"
    output_file_path=r"C:\Users\丁月甜\Desktop\1processed_wikipedia_articles.csv"
    # preprocess_csv returns the loaded frame with a 'processed_text' column added.
    processed_df = preprocess_csv(file_path) 
    # 'utf-8-sig' writes a byte-order mark (presumably so spreadsheet tools detect UTF-8).
    processed_df.to_csv(output_file_path, index=False, encoding='utf-8-sig')  

# Part 2: LDA topic classification and word clouds
df = pd.read_csv(r"C:\Users\丁月甜\Desktop\数据可视化\数据集\processed_wikipedia_articles.csv")
# Articles whose cleaned text was empty are read back from the CSV as NaN;
# turn them into '' so the vectorizer and the tokenizer below do not fail
# on float values.
df['processed_text'] = df['processed_text'].fillna('')
# 1. TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['processed_text'])
# 2. fit_transform already returns a SciPy sparse matrix; keep it sparse
#    instead of materialising a dense documents x vocabulary array that
#    nothing below uses.
sparse_matrix = tfidf_matrix
# 3. Build the gensim dictionary (from the TF-IDF vocabulary) and the
#    bag-of-words corpus; tokens outside the dictionary are ignored by doc2bow.
dictionary = corpora.Dictionary([vectorizer.get_feature_names_out()])
corpus = [dictionary.doc2bow(text.split()) for text in df['processed_text']]
# 4. Fit the LDA model (fixed random_state for reproducible topics)
num_topics = 5
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15, random_state=10)
# 5. Assign every document the id of its most probable topic
topic_distribution = lda_model.get_document_topics(corpus)
df['Topic'] = [max(dist, key=lambda x: x[1])[0] for dist in topic_distribution]
# 6. Extract each topic's top keywords from the LDA model and give the topics readable names
# Request all topics explicitly: print_topics returns at most 20 by default.
topics = lda_model.print_topics(num_topics=num_topics, num_words=5)
print(topics)
# Fallback label for every topic that matches none of the rules below.
topic_names = {i: f"Topic {i}" for i in range(num_topics)}
# (required keywords, label) rules, checked in order; the first rule whose
# keywords all occur in the topic description wins. Matching is a plain
# substring test on the formatted description string.
topic_name_rules = [
    (("state", "university"), "Politics and Global Affairs"),
    (("game", "century"), "Sports and Entertainment"),
    (("displaystyle", "study"), "Science and Technology"),
    (("philosophy", "theory"), "Philosophy and Theory"),
    (("water", "energy"), "Health and Environmental Studies"),
]
# Each entry is (topic_id, description); use the id the model reports rather
# than the position in the list so labels always attach to the right topic.
for topic_id, description in topics:
    print(f"主题 {topic_id}: {description}")
    for keywords, label in topic_name_rules:
        if all(keyword in description for keyword in keywords):
            topic_names[topic_id] = label
            break
df['Topic_name'] = df['Topic'].map(topic_names)
print(df.head())

#6.绘制词云图  
def plot_wordcloud(lda_model, topic_names):
    """Show one word-cloud figure per topic.

    Args:
        lda_model: Fitted topic model exposing ``show_topic(topic_id, topn)``.
        topic_names: Titles indexed by the topic ids ``0 .. len(topic_names) - 1``.
    """
    topic_total = len(topic_names)
    for topic_id in range(topic_total):
        plt.figure(figsize=(10, 5))
        # The 50 highest-weighted terms of the topic, as a term -> weight mapping.
        term_weights = dict(lda_model.show_topic(topic_id, topn=50))
        cloud = WordCloud(width=800, height=400, background_color='white')
        cloud_image = cloud.generate_from_frequencies(term_weights)
        plt.imshow(cloud_image, interpolation='bilinear')
        plt.axis('off')
        plt.title(topic_names[topic_id], fontsize=20)
        plt.show()
# Draw the word clouds for the fitted LDA model, titled with the topic names assigned above.
plot_wordcloud(lda_model, topic_names) 

#三、情感分析并绘制柱状堆积图、雷达图
#1.基于主题分类结果进行情感分析
def analyze_sentiment(text):
    """Score the sentiment of ``text`` with TextBlob.

    Args:
        text: Text to analyse. Missing values (``None``, NaN or any other
            non-string) and blank strings are treated as neutral instead of
            being passed to TextBlob, which only accepts strings.

    Returns:
        A ``(polarity, label)`` tuple where ``polarity`` is TextBlob's
        polarity score and ``label`` is ``'positive'``, ``'negative'`` or
        ``'neutral'`` depending on the sign of the score.
    """
    # Empty CSV cells arrive as NaN floats; there is nothing to analyse.
    if not isinstance(text, str) or not text.strip():
        return 0.0, 'neutral'

    sentiment_value = TextBlob(text).sentiment.polarity
    if sentiment_value > 0:
        sentiment_label = 'positive'
    elif sentiment_value < 0:
        sentiment_label = 'negative'
    else:
        sentiment_label = 'neutral'
    return sentiment_value, sentiment_label
# Score every article once and store the two parts of each result in their
# own column. Assigning plain lists avoids the per-row ``apply(pd.Series)``
# expansion and keeps Sentiment_Value a numeric column.
sentiment_results = [analyze_sentiment(text) for text in df['processed_text']]
df['Sentiment_Value'] = [value for value, _ in sentiment_results]
df['Sentiment'] = [label for _, label in sentiment_results]

# 2. Stacked bar chart: number of articles per sentiment within each topic
custom_colors = ['#ADD8E6','#F7DC6F','#FFB6C1']
topic_sentiment_counts = df.groupby(['Topic', 'Sentiment']).size().unstack(fill_value=0)
topic_sentiment_counts.index = [topic_names[topic_id] for topic_id in topic_sentiment_counts.index]
# Each column (sentiment) is one stacked series, so the colour list must be
# sized by the number of columns, not by the number of topic rows.
bar_colors = [
    custom_colors[position % len(custom_colors)]
    for position in range(len(topic_sentiment_counts.columns))
]
ax = topic_sentiment_counts.plot(kind='bar', stacked=True, color=bar_colors)

plt.title('Sentiment Distribution by Topic')
plt.xlabel('Topic')
plt.ylabel('Number of Articles')
plt.legend(title='Sentiment')
plt.xticks(rotation=0, fontsize=8)
plt.tight_layout()
plt.show()

# 3. Radar chart
# 3.1 Prepare the data: mean polarity for every (topic name, sentiment label)
#     pair, with 0 where a topic has no article of that sentiment.
sentiment_means = (
    df.groupby(['Topic_name', 'Sentiment'])['Sentiment_Value']
    .mean()
    .unstack(fill_value=0)
)
sentiments = sentiment_means.columns
num_vars = sentiment_means.index.size
colors = dict(positive='green', negative='orange', neutral='red')
# 3.2 One evenly spaced angle per topic; the first angle is repeated at the
#     end so each outline closes on itself.
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
angles = angles + angles[:1]
# 3.3 Draw the radar chart: one filled outline per sentiment, plus one marker
#     per topic whose shape and size reflect that topic's mean polarity.
fig, ax = plt.subplots(figsize=(10, 6), subplot_kw=dict(polar=True))
for sentiment in sentiments:
    values = sentiment_means[sentiment].values.flatten().tolist()
    values += values[:1]  # repeat the first value so the outline closes
    ax.fill(angles, values, color=colors[sentiment], alpha=0.1, label=sentiment)
    ax.plot(angles, values, color=colors[sentiment], linewidth=1, alpha=0.3)
    # Draw every topic point exactly once, choosing marker and size from that
    # point's own value (the closing duplicate at the end is skipped).
    for angle, value in zip(angles[:num_vars], values[:num_vars]):
        if value > 0:
            marker, size = 'o', value * 200
        elif value < 0:
            marker, size = '^', abs(value) * 200
        else:
            marker, size = '*', 20
        ax.scatter(angle, value, s=size, color=colors[sentiment], alpha=0.6, marker=marker)
# 3.4 Radial ticks and axis labels
y_tick_values = np.linspace(-0.2, 0.1, num=4)
ax.set_yticks(y_tick_values)
ax.tick_params(axis='y', colors='#4A235A', labelsize=12)
ax.set_yticklabels([])  # hide the default labels; custom ones are drawn below
# Write each radial tick value as text along the angle-0 spoke.
angle = 0
tick_text_style = dict(color='black', ha='center', va='center', fontsize=10)
for y_tick in y_tick_values:
    ax.text(angle, y_tick, f"{round(y_tick,2)}", **tick_text_style)
ax.axvline(x=0, color='#4A235A', linewidth=3, alpha=0.5)

# One angular tick per topic (the closing duplicate angle is dropped).
ax.set_xticks(angles[:-1])
ax.set_xticklabels(sentiment_means.index)
# 3.5 Legend: one colour swatch per sentiment, then one marker sample per
#     radial tick value (circle = positive, star = zero, triangle = negative).
handles = [plt.Line2D([0], [0], color=colors[name], lw=8, alpha=0.1) for name in sentiments]
labels = list(sentiments)
for y_val in y_tick_values:
    rounded = round(y_val, 2)
    scatter_size = abs(y_val) * 200
    if rounded > 0:
        marker, marker_size, text = 'o', scatter_size / 3, f'Value {rounded}'
    elif rounded == 0:
        marker, marker_size, text = '*', 20 / 3, 'Value 0'
    elif rounded < 0:
        marker, marker_size, text = '^', scatter_size / 3, f'Value {rounded}'
    else:
        continue
    sample = plt.Line2D([0], [0], marker=marker, markersize=marker_size,
                        color='w', markerfacecolor='gray', alpha=0.8)
    handles.append(sample)
    labels.append(text)
ax.legend(handles=handles, labels=labels, loc='upper right',
          bbox_to_anchor=(1.4, 1), fontsize=9, title='Topic and Sentiment')

plt.title('Sentiment Analysis Radar Chart', fontsize=15)
plt.show()