In [None]:
import pandas as pd
from sklearn.manifold import MDS

# Load the cleaned NYT comments from a CSV file (read with pd.read_csv, not an Excel reader).
# NOTE(review): the variable names say "bbc" but the file loaded here is "NYT clean.csv".
file_path_bbc_csv = 'C:/Users/75800/Desktop/COM5507/1212/csv/NYT clean.csv'
df_bbc_csv = pd.read_csv(file_path_bbc_csv)
# Preview the first rows of the dataframe to understand its structure.
# NOTE(review): this is not the last expression of the cell, so the notebook
# does not render it; move it to the end of a cell to see the preview.
df_bbc_csv.head()

from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF document-term matrix from the 'Comment' column.
# English stop words are removed; terms appearing in more than 70% of the
# comments or in fewer than 10 comments are discarded.
tfidf_vectorizer_no_lang_filter = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
    min_df=10,
)
# Missing comments are dropped and the remainder coerced to str before fitting.
X_tfidf_no_lang_filter = tfidf_vectorizer_no_lang_filter.fit_transform(
    df_bbc_csv['Comment'].dropna().astype(str)
)

from sklearn.decomposition import TruncatedSVD

# Fit LSA (Truncated SVD) with 5 components on the TF-IDF matrix.
# random_state is fixed so repeated runs give the same decomposition;
# fit() returns the estimator itself, so construction and fitting are chained.
lsa_no_lang_filter = TruncatedSVD(n_components=5, random_state=0).fit(X_tfidf_no_lang_filter)

# Function to display the top words in each topic
def display_topics(model, feature_names, no_top_words):
    """Print every topic's index followed by its highest-weighted terms.

    Args:
        model: Object exposing ``components_``, an iterable of 1-D weight
            arrays (one per topic) that support ``argsort()``.
        feature_names: Sequence mapping a column index to its term.
        no_top_words: How many of the largest-weight terms to print per topic.

    Output is written to stdout: a ``Topic <idx>:`` line, then one line with
    the terms separated by single spaces, largest weight first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it to rank the largest weights first.
        ranked = weights.argsort()[::-1]
        best_terms = [feature_names[pos] for pos in ranked[:no_top_words]]
        print(f"Topic {topic_idx}:")
        print(" ".join(best_terms))

# Print the 10 strongest terms of every LSA topic.
# no_top_words is kept at module level because later cells reuse it.
no_top_words = 10
display_topics(
    model=lsa_no_lang_filter,
    feature_names=tfidf_vectorizer_no_lang_filter.get_feature_names_out(),
    no_top_words=no_top_words,
)


In [85]:
# Storing the results of LSA topic modeling analysis in a DataFrame for output

# Fetch the vocabulary once instead of calling get_feature_names_out()
# again for every single term of every topic.
feature_names = tfidf_vectorizer_no_lang_filter.get_feature_names_out()

# One space-joined string per topic holding its `no_top_words` strongest
# terms, largest weight first (argsort is ascending, hence the reversed slice).
topics = [
    " ".join(feature_names[index] for index in topic.argsort()[:-no_top_words - 1:-1])
    for topic in lsa_no_lang_filter.components_
]

# Creating a DataFrame to store the topics, labelled "Topic 1", "Topic 2", ...
topics_df = pd.DataFrame(
    topics,
    columns=["Topic Words"],
    index=[f"Topic {i+1}" for i in range(len(topics))],
)

# Saving the DataFrame to a CSV file (the topic labels are written as the index column)
output_file_path = 'C:/Users/75800/Desktop/COM5507/1212/topic model/NYT topic model.csv'
topics_df.to_csv(output_file_path, index=True)

# Displaying the DataFrame: it must be the last expression of the cell to be rendered
topics_df



In [None]:
# Storing the results of LSA topic modeling analysis in a DataFrame for output
# NOTE(review): this cell repeats the export done in the preceding cell (In [85])
# and writes the same CSV file again; one of the two cells can be deleted.

# Fetch the vocabulary once instead of calling get_feature_names_out()
# again for every single term of every topic.
feature_names = tfidf_vectorizer_no_lang_filter.get_feature_names_out()

# One space-joined string per topic holding its `no_top_words` strongest
# terms, largest weight first (argsort is ascending, hence the reversed slice).
topics = [
    " ".join(feature_names[index] for index in topic.argsort()[:-no_top_words - 1:-1])
    for topic in lsa_no_lang_filter.components_
]

# Creating a DataFrame to store the topics, labelled "Topic 1", "Topic 2", ...
topics_df = pd.DataFrame(
    topics,
    columns=["Topic Words"],
    index=[f"Topic {i+1}" for i in range(len(topics))],
)

# Saving the DataFrame to a CSV file (the topic labels are written as the index column)
output_file_path = 'C:/Users/75800/Desktop/COM5507/1212/topic model/NYT topic model.csv'
topics_df.to_csv(output_file_path, index=True)

# Displaying the DataFrame: it must be the last expression of the cell to be rendered
topics_df



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns  # Import seaborn for more color options

# Draw one horizontal bar chart per LSA topic (top words by weight) and save the figure.

# Set the file path for the output image.
# The drive prefix is spelled out so this path matches the other C:/ paths in
# the notebook instead of depending on the current working drive.
output_image_path = 'C:/Users/75800/Desktop/COM5507/1212/topic model/NYT topic model.png'

# Number of topics and words
# Taken from the fitted model so it cannot drift from the actual number of components.
n_components = lsa_no_lang_filter.components_.shape[0]
num_words = 10    # Number of top words to display in each topic

# Fetch the vocabulary once rather than once per plotted word.
feature_names = tfidf_vectorizer_no_lang_filter.get_feature_names_out()

# One subplot per topic, laid out horizontally.
# squeeze=False always returns a 2-D array of Axes, so a single-topic model
# still yields something iterable; `axes` is the (only) row of that grid.
fig, axes_grid = plt.subplots(1, n_components, figsize=(15, 10), squeeze=False)
axes = axes_grid[0]

# Define a color palette with one distinct color per topic
palette = sns.color_palette("husl", n_components)

for i, ax in enumerate(axes):
    # Sorting the components by weight and getting the top words.
    # Reversing before slicing keeps "largest first" and yields an empty
    # selection (not the whole vocabulary) if num_words is 0.
    components = lsa_no_lang_filter.components_[i]
    top_indices = components.argsort()[::-1][:num_words]
    top_values = components[top_indices]
    top_words = [feature_names[j] for j in top_indices]

    # Creating a bar plot for each topic with different colors
    ax.barh(top_words, top_values, color=palette[i])
    ax.set_title(f'Topic {i+1}')
    ax.invert_yaxis()  # strongest word at the top

# tight_layout only accounts for what is already drawn, so it has to run after
# the bars and their word tick labels exist.
fig.tight_layout(pad=6.0)

# Save the figure
fig.savefig(output_image_path)

# Provide the path for downloading
output_image_path

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Heatmap of the top words of every topic, with each word's weights scaled by
# that word's largest weight among the topics where it is a top word.

# Taken from the fitted model so it cannot drift from the actual number of components.
n_components = lsa_no_lang_filter.components_.shape[0]
num_words = 10

# Fetch the vocabulary once rather than once per word inside the loops.
feature_names = tfidf_vectorizer_no_lang_filter.get_feature_names_out()

# One Series per topic: its top words (largest weight first) mapped to their weights.
topic_columns = {}
for i in range(n_components):
    weights = lsa_no_lang_filter.components_[i]
    top_indices = weights.argsort()[::-1][:num_words]
    topic_columns[f'Topic {i+1}'] = pd.Series(
        weights[top_indices], index=feature_names[top_indices]
    )

# Rows in order of first appearance across topics (dict.fromkeys de-duplicates
# while preserving order).
word_order = list(
    dict.fromkeys(word for column in topic_columns.values() for word in column.index)
)

# DataFrame to store the distribution of top words across topics.
# Built in one step instead of growing an empty frame cell by cell; a word
# that is not among a topic's top words is NaN in that topic's column.
word_distribution = pd.DataFrame(topic_columns).reindex(word_order)

# Normalizing the distribution: divide every row by its own maximum.
# NOTE(review): LSA weights can be negative; a row whose maximum is <= 0 would
# not end up in the 0..1 range - confirm this cannot happen for the top words.
word_distribution = word_distribution.div(word_distribution.max(axis=1), axis=0)

# Plotting the distribution
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(word_distribution, annot=True, cmap='YlGnBu', ax=ax)
ax.set_title("Word Distribution Across Topics")
ax.set_ylabel("Words")
ax.set_xlabel("Topics")

# Save the heatmap to a file before calling plt.show().
# The drive prefix is spelled out so this path matches the other C:/ paths in
# the notebook instead of depending on the current working drive.
output_path = 'C:/Users/75800/Desktop/COM5507/1212/heat map/NYT topic heat map.png'
fig.savefig(output_path)

# Now show the plot
plt.show()