<a href="https://colab.research.google.com/github/MK316/Workingpapers/blob/main/DMETA/240222_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exploratory Data Analysis (EDA) as of 24.02.22

[data](https://raw.githubusercontent.com/MK316/Workingpapers/main/DMETA/data/data1.csv)

## Step 1. Data to read

In [None]:
# Raw GitHub URL of the source dataset (data1.csv); read into `df` in the next cell
url = "https://raw.githubusercontent.com/MK316/Workingpapers/main/DMETA/data/data1.csv"

In [None]:
import pandas as pd

# Load the dataset from `url` into the working DataFrame used by the following cells
df = pd.read_csv(url, encoding="utf-8")

## Step 2. Preliminary data check

In [None]:
# Preview the first rows to check that the file was parsed as expected
df.head()

## Keyword preprocessing

Keywords: Keywords1 + Keywords2

In [None]:
import pandas as pd

# Assuming df is your DataFrame

# Combine 'Keywords1' and 'Keywords2' into a single 'Keywords' column
# Here we assume you want to separate keywords from both columns by a comma
# We also handle NaN values to avoid 'nan' strings in the combined column
df['Keywords'] = df[['Keywords1', 'Keywords2']].apply(lambda x: ', '.join(x.dropna()), axis=1)

# Now, you can drop the original 'Keywords1' and 'Keywords2' columns if they are no longer needed
df.drop(['Keywords1', 'Keywords2'], axis=1, inplace=True)

# Save the updated DataFrame to a CSV file
filename = 'data2.csv'
df.to_csv(filename, index=False)

# To ensure the file is saved, you can list the files in the current directory
!ls

# If you need to download the file to your local system from Colab, you can use:
from google.colab import files
files.download(filename)


In [None]:
# Reload the merged-keyword dataset (data2.csv) written by the preprocessing cell
df=pd.read_csv("data2.csv", encoding="utf-8")

## Descriptive stats

Year info (as string)

In [None]:
# Treat the publication year as a label rather than a number
df['Year'] = df['Year'].astype(str)

# On a string column describe() reports count, number of unique values,
# the most common value (top) and how often it occurs (freq) instead of
# numeric summaries such as mean or quartiles.
year_descriptive_stats = df['Year'].describe()
print(year_descriptive_stats)


Bar plot of publications by year

In [None]:
import pandas as pd

# Number of rows per publication year, ordered by the year label
year_counts = df['Year'].value_counts().sort_index()

# Two-column table (Year, Frequency) built directly from the counts
year_counts_df = pd.DataFrame({
    'Year': year_counts.index,
    'Frequency': year_counts.values,
})

print(year_counts_df)


In [None]:
import matplotlib.pyplot as plt

# Rows per publication year, ordered by the year label
year_counts = df['Year'].value_counts().sort_index()

# Draw on an explicit Axes instead of relying on pyplot's implicit current figure
fig, ax = plt.subplots(figsize=(10, 6))
year_counts.plot(kind='bar', ax=ax)
ax.set_title('Frequency of Publications by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Frequency')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


In [None]:
# With frequency

import matplotlib.pyplot as plt

# Rows per publication year, ordered by the year label
year_counts = df['Year'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
year_counts.plot(kind='bar', ax=ax)
ax.set_title('Frequency of Publications by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Frequency')
ax.set_ylim(0, 60)  # fixed y-range; the same limit is used for the 'digital' mentions plot
plt.setp(ax.get_xticklabels(), rotation=45)

# Write each bar's count just above its top edge
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(f'{int(height)}',
                (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center',
                xytext=(0, 10), textcoords='offset points')

plt.show()


## Step 3. Keyword Analysis for 'Digital'
To analyze the frequency of 'digital', you can start by creating a new column in your DataFrame that flags the presence of the word 'digital' in the 'Abstract', 'Title', or 'Keywords' columns.

In [None]:
# Lowercase all relevant columns to standardize the search
df['Abstract'] = df['Abstract'].str.lower()
df['Title'] = df['Title'].str.lower()
df['Keywords'] = df['Keywords'].str.lower()

# Flag rows that mention 'digital' in the abstract, title or keywords.
# na=False makes a missing (NaN) cell count as "no mention"; without it
# str.contains returns NaN for such cells and the flag is not a clean boolean
# column, which is unsafe to use as a row mask in the following cells.
df['mentions_digital'] = (
    df['Abstract'].str.contains('digital', na=False)
    | df['Title'].str.contains('digital', na=False)
    | df['Keywords'].str.contains('digital', na=False)
)


## Step 4: Frequency of 'Digital' Over Time
Now, calculate the frequency of abstracts mentioning 'digital' each year.

In [None]:
# Keep only the rows flagged as mentioning 'digital', then count them per publication year
digital_rows = df[df['mentions_digital']]
digital_mentions_per_year = digital_rows.groupby('Year').size()


## Step 5: Plotting the Trends
To visualize how the mention of 'digital' has changed over time, you can plot the frequency per year.

In [None]:
import matplotlib.pyplot as plt

# digital_mentions_per_year: Series indexed by year holding the number of flagged rows
fig, ax = plt.subplots(figsize=(10, 6))
digital_mentions_per_year.plot(kind='bar', ax=ax)
ax.set_title('Frequency of "Digital" Mentions Over Time')
ax.set_ylim(0, 60)  # same fixed y-range as the publications-per-year plot
ax.set_xlabel('Year')
ax.set_ylabel('Number of Mentions')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y', linestyle='--')

# Label every bar with its count, placed slightly above the bar
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(f'{int(height)}',
                (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='bottom',
                xytext=(0, 5),
                textcoords='offset points')

plt.show()


## Step 6: Co-occurrence Analysis (Optional)
For a simple co-occurrence analysis, you might start with identifying the most common words in abstracts that mention 'digital'. This requires more advanced text processing and is not covered in depth here, but you can begin with a basic approach using Counter from the collections module.

In [None]:
# Without stopwords removal
from collections import Counter
import re

# Combine all abstracts of rows mentioning 'digital' into one large text.
# A row can be flagged through its title or keywords while its abstract is
# missing; dropna() skips those so that str.join does not fail on a NaN float.
digital_abstracts_text = ' '.join(df.loc[df['mentions_digital'], 'Abstract'].dropna())

# Tokenize the text into words (runs of letters, digits and underscores)
words = re.findall(r'\w+', digital_abstracts_text)

# Count the words, excluding the search term itself
# (deleting a missing key from a Counter is a no-op, so this is safe even if it is absent)
word_counts = Counter(words)
del word_counts['digital']

# Display the most common words co-occurring with 'digital'
print(word_counts.most_common(10))


Stop words are removed

Additional stopwords: high-frequency (HF) words that are specific to abstracts (e.g., 'findings', 'results', 'research').

In [None]:
from collections import Counter
import re
import nltk

# Download NLTK's stopword corpus, then import the reader for it
# (the import is placed after the download call in this cell)
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# Start from NLTK's English stopword list and additionally exclude the search term 'digital' itself
stop_words = set(stopwords.words('english')) | {'digital'}

In [None]:
# Add further words to the stopword set, to be filtered out of the counts
# ('digital' is not in this set; it was already added when stop_words was created)
additional_stopwords = {'findings', 'based', 'research','second','language','english',"two","results"}
stop_words.update(additional_stopwords)


In [None]:




# Combine all abstracts of rows mentioning 'digital' into one large lowercase text.
# dropna() skips rows that were flagged via title/keywords but have no abstract;
# joining a NaN float would otherwise raise a TypeError.
digital_abstracts_text = ' '.join(df.loc[df['mentions_digital'], 'Abstract'].dropna()).lower()

# Tokenize the text into words
words = re.findall(r'\w+', digital_abstracts_text)

# Remove stopwords ('digital' itself is part of stop_words)
filtered_words = [word for word in words if word not in stop_words]

# Count the remaining words
word_counts = Counter(filtered_words)

# Display the most common words co-occurring with 'digital'
print(word_counts.most_common(20))


These steps will give you a preliminary analysis of how the term 'digital' has been used and its co-occurrence with other terms in your dataset. Depending on your findings, you might refine your analysis to focus on specific years, topics, or co-occurring terms.

## Lemmatize and find co-occurrence

Abbreviations are expanded before counting, e.g., DMC to 'digital multimodal composition'.

In [None]:
from collections import Counter
import re
import nltk

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Regex pattern -> replacement for abbreviations that should be counted as one term.
# \b word boundaries make sure only the whole abbreviation is replaced; the
# underscores keep each expansion together as a single \w+ token.
abbreviation_expansions = {
    '\\bdmc\\b': 'digital_multimodal_composition',
    # NOTE(review): WTC commonly stands for "willingness to communicate" - confirm the intended label
    '\\bwtc\\b': 'willing_to_communicate'
    # Add more abbreviations and their expansions as needed
}

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Words to ignore in addition to NLTK's English stopwords (includes the search term 'digital')
additional_stopwords = {'l2','digital','findings', 'based', 'research', 'second', 'language', 'english', "two", "results"}
stop_words = set(stopwords.words('english')) | additional_stopwords

# Combine all abstracts of rows mentioning 'digital' into one large text.
# dropna() skips rows flagged via title/keywords whose abstract is missing;
# joining a NaN float would otherwise raise a TypeError.
digital_abstracts_text = ' '.join(df.loc[df['mentions_digital'], 'Abstract'].dropna())

# Expand abbreviations (case-insensitively, so this works whether or not the text is already lowercase)
for abbr, expansion in abbreviation_expansions.items():
    digital_abstracts_text = re.sub(abbr, expansion, digital_abstracts_text, flags=re.IGNORECASE)

# Lower the case after expansions are done
digital_abstracts_text = digital_abstracts_text.lower()

# Tokenize the text into words
words = re.findall(r'\w+', digital_abstracts_text)

# Drop stopwords, then lemmatize what is left.
# The stopword check is applied to the surface form, before lemmatization.
filtered_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

# Count the lemmatized words
word_counts = Counter(filtered_words)

# Display the most common words co-occurring with 'digital'
print(word_counts.most_common(20))


## Visualize co-occurrence

In [None]:
import matplotlib.pyplot as plt

# word_counts is the Counter built in the previous step; take its 30 most frequent entries
most_common_words = word_counts.most_common(30)

# Unpack the words and their frequencies for plotting
words, frequencies = zip(*most_common_words)

# Create a bar chart
plt.figure(figsize=(16, 8))
plt.bar(words, frequencies, color='skyblue')
plt.xlabel('Words')
plt.ylabel('Frequency')
# The title is derived from the number of bars actually drawn, so it cannot
# drift out of sync with the most_common() argument (it used to say "Top 10"
# while 30 words were plotted).
plt.title(f'Top {len(words)} Most Common Words Co-occurring with "Digital"')
plt.xticks(rotation=45)
plt.show()


---
# 2. Trend analysis

## Part 1: Keyword Frequency Over Time
First, calculate the frequency of 'digital' and its co-occurring terms per year.

In [None]:
import pandas as pd
from collections import defaultdict

# Normalise abstracts to lowercase strings; missing abstracts become '' so they can never match
df['Abstract'] = df['Abstract'].fillna('').astype(str).str.lower()

# Flag abstracts containing the whole word 'digital' (each abstract counts once).
# A word-boundary regex is used instead of a whitespace split so that
# occurrences next to punctuation ("digital," / "digital.") are counted too;
# this matches the \w+ tokenisation used in the earlier word-count cells.
has_digital = df['Abstract'].str.contains(r'\bdigital\b', regex=True)

# Number of matching abstracts per year, as a {year: count} mapping
digital_frequency_per_year = df.loc[has_digital].groupby('Year').size().to_dict()

# Convert the mapping to a DataFrame for easier handling; years without a match are not listed
digital_freq_df = pd.DataFrame(list(digital_frequency_per_year.items()), columns=['Year', 'Frequency']).sort_values(by='Year')

print(digital_freq_df)


## Part 2: Co-occurrence Matrix for 'Digital' and Other Terms
Next, create a co-occurrence matrix. This matrix will show how often other words occur with 'digital' in the same abstracts across all years. This part is a bit more involved and requires careful consideration of the entire dataset.

The lemmatized abstracts are saved as data3.csv.

In [None]:
import pandas as pd
# Start again from the original dataset so that earlier in-place edits do not carry over
url = "https://raw.githubusercontent.com/MK316/Workingpapers/main/DMETA/data/data1.csv"
df = pd.read_csv(url, encoding="utf-8")

# Merge the two keyword columns (skipping missing cells) and discard the originals
keyword_columns = ['Keywords1', 'Keywords2']
df['Keywords'] = df[keyword_columns].apply(lambda row: ', '.join(row.dropna()), axis=1)
df = df.drop(columns=keyword_columns)

# Round-trip through data2.csv so that `df` matches what is stored on disk
filename = 'data2.csv'
df.to_csv(filename, index=False, encoding='utf-8')
df = pd.read_csv(filename, encoding='utf-8')

In [None]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import wordnet
from nltk import pos_tag

# Ensure that NLTK's resources are downloaded.
# Recent NLTK releases look up 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# instead of the older 'punkt' / 'averaged_perceptron_tagger' packages, so both
# generations are requested to keep word_tokenize / pos_tag working on either version.
for resource in ('punkt', 'punkt_tab',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng',
                 'wordnet'):
    nltk.download(resource)

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Helper: translate a Penn Treebank POS tag into the matching WordNet POS constant
def get_wordnet_pos(treebank_tag):
    """Return the WordNet POS constant for a tag, based on the tag's leading letter.

    Tags starting with J/V/N/R map to ADJ/VERB/NOUN/ADV; anything else falls
    back to NOUN.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN  # Default to noun

# Lemmatize one abstract using POS-aware WordNet lemmatization
def lemmatize_abstract(text):
    """Tokenize `text`, POS-tag the tokens, lemmatize each one and re-join with single spaces."""
    tagged_tokens = pos_tag(word_tokenize(text))
    return ' '.join(
        lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        for token, tag in tagged_tokens
    )

# Missing abstracts become empty strings so that every value can be tokenized
abstract_strings = df['Abstract'].fillna('').astype(str)
df['Abstract'] = abstract_strings

# Store the lemmatized version of each abstract in a new column, 'Abstract_lemmatized'
df['Abstract_lemmatized'] = df['Abstract'].map(lemmatize_abstract)


In [None]:
# Remove Parenthesized words from 'Abstract_lemmatized'
import pandas as pd
import re

# `df` must already contain the 'Abstract_lemmatized' column.

# Matches one innermost "( ... )" group: an opening parenthesis, any run of
# characters that are not parentheses, and the closing parenthesis.
parenthesized_text_pattern = r'\([^()]*\)'

# A single replace pass only removes innermost groups and would leave the outer
# shell of nested parentheses behind, e.g. "a (b (c) d) e" -> "a (b  d) e".
# Repeat until no group is left so that nested content is removed completely.
cleaned_abstracts = df['Abstract_lemmatized']
while cleaned_abstracts.str.contains(parenthesized_text_pattern, regex=True, na=False).any():
    cleaned_abstracts = cleaned_abstracts.str.replace(parenthesized_text_pattern, '', regex=True)

df['Abstract_lemmatized'] = cleaned_abstracts


In [None]:
# Persist the lemmatized dataset, read it back from disk and preview it
lemmatized_csv = 'data3.csv'
df.to_csv(lemmatized_csv, index=False, encoding="utf-8")
df = pd.read_csv(lemmatized_csv, encoding="utf-8")
df.head()

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Make sure 'Abstract_lemmatized' column exists in the DataFrame
if 'Abstract_lemmatized' in df.columns:
    # Fill NaN values with empty string
    df['Abstract_lemmatized'] = df['Abstract_lemmatized'].fillna('')

    # Keep abstracts containing 'digital'. The match is case-insensitive because
    # the lemmatization cells do not lowercase the text, whereas CountVectorizer
    # lowercases by default; a case-sensitive filter would silently drop
    # abstracts that only contain "Digital".
    is_digital = df['Abstract_lemmatized'].str.contains('digital', case=False)
    digital_abstracts = df.loc[is_digital, 'Abstract_lemmatized']

    # Count all bi- and tri-grams in those abstracts (restricted to 'digital' n-grams further below)
    vectorizer = CountVectorizer(ngram_range=(2,3), stop_words='english')
    X = vectorizer.fit_transform(digital_abstracts)

    # Term-by-term co-occurrence: (terms x docs) @ (docs x terms).
    # `@` is always a matrix product, independent of the sparse container type.
    co_occurrence_matrix = X.T @ X
    ngram_names = vectorizer.get_feature_names_out()
    co_occurrence_df = pd.DataFrame(co_occurrence_matrix.toarray(), index=ngram_names, columns=ngram_names)

    # Keep only n-grams that contain 'digital' as a whole word, on both axes.
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    digital_term_pattern = r'(^|\s)digital(\s|$)'
    digital_co_occurrences = (
        co_occurrence_df
        .filter(regex=digital_term_pattern, axis=0)
        .filter(regex=digital_term_pattern, axis=1)
    )

    print(digital_co_occurrences)
else:
    print("The 'Abstract_lemmatized' column is not in the DataFrame.")


## 1. Heatmap of the Top N Co-occurring Terms
Focus on a subset of the most interesting terms that co-occur with 'digital'. You can select these based on the highest frequencies or relevance to your research question.

The data file is read again here.

In [None]:
import pandas as pd
# Reload the original dataset; this replaces `df`, so columns added earlier
# (e.g. 'Abstract_lemmatized') are no longer present afterwards
url = "https://raw.githubusercontent.com/MK316/Workingpapers/main/DMETA/data/data1.csv"
df = pd.read_csv(url, encoding="utf-8")

# Merge the two keyword columns (skipping missing cells) and discard the originals
keyword_columns = ['Keywords1', 'Keywords2']
df['Keywords'] = df[keyword_columns].apply(lambda row: ', '.join(row.dropna()), axis=1)
df = df.drop(columns=keyword_columns)

# Round-trip through data2.csv so that `df` matches what is stored on disk
filename = 'data2.csv'
df.to_csv(filename, index=False)
df = pd.read_csv(filename)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pick the 20 'digital' n-grams with the largest total co-occurrence count
top_terms = digital_co_occurrences.sum(axis=1).nlargest(20).index.tolist()

# Filter the co-occurrence matrix to keep only the top terms
filtered_matrix = digital_co_occurrences.loc[top_terms, top_terms]

plt.figure(figsize=(12, 10))
# fmt='d' prints the integer counts as-is; the default format switches to
# scientific notation for larger counts. This also matches the later
# consolidated-terms heatmap.
sns.heatmap(filtered_matrix, annot=True, fmt='d', cmap='viridis')
plt.title('Co-occurrence Matrix of Top 20 Terms with Digital')
plt.show()


## Dimensionality reduction

Apply dimensionality reduction techniques (e.g., PCA, t-SNE) to the co-occurrence matrix to visualize the relationships between terms in a 2D or 3D space.

In [None]:
from sklearn.decomposition import PCA

# Project each term's co-occurrence row onto its first two principal components
pca = PCA(n_components=2)
reduced_matrix = pca.fit_transform(filtered_matrix)

plt.figure(figsize=(10, 8))
# One scatter call per term, with the term label placed just beside its point
for term, (pc1, pc2) in zip(filtered_matrix.index, reduced_matrix):
    plt.scatter(pc1, pc2)
    plt.text(pc1 + 0.01, pc2 + 0.01, term, fontsize=9)
plt.title('PCA of Terms Co-occurring with Digital')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Replace NaN values with an empty string to avoid errors with str.contains
df['Abstract'] = df['Abstract'].fillna('')

# Keep abstracts containing 'digital'. The match is case-insensitive because
# `df` was reloaded without lowercasing, whereas CountVectorizer lowercases by
# default; a case-sensitive filter would drop abstracts that only contain "Digital".
is_digital = df['Abstract'].str.contains('digital', case=False)
digital_abstracts = df.loc[is_digital, 'Abstract']

# Count all bi- and tri-grams in those abstracts (restricted to 'digital' n-grams further below)
vectorizer = CountVectorizer(ngram_range=(2,3), stop_words='english')
X = vectorizer.fit_transform(digital_abstracts)

# Term-by-term co-occurrence: (terms x docs) @ (docs x terms).
# `@` is always a matrix product, independent of the sparse container type.
co_occurrence_matrix = X.T @ X
ngram_names = vectorizer.get_feature_names_out()
co_occurrence_df = pd.DataFrame(co_occurrence_matrix.toarray(), index=ngram_names, columns=ngram_names)

# Keep only n-grams that contain 'digital' as a whole word, on both axes.
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
digital_term_pattern = r'(^|\s)digital(\s|$)'
digital_co_occurrences = (
    co_occurrence_df
    .filter(regex=digital_term_pattern, axis=0)
    .filter(regex=digital_term_pattern, axis=1)
)

# Print or use the digital_co_occurrences as needed


The final heatmap provides a visual representation of the relationships between the most prominent terms in the corpus after consolidating similar terms and lemmatization. This can help in understanding the structure of the text data and identifying patterns of term co-occurrence.

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns
import matplotlib.pyplot as plt

# Make sure all necessary NLTK resources are downloaded.
# Recent NLTK releases look up 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# instead of the older 'punkt' / 'averaged_perceptron_tagger' packages, so both
# generations are requested to keep word_tokenize / pos_tag working on either version.
for resource in ('punkt', 'punkt_tab',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng',
                 'stopwords', 'wordnet'):
    nltk.download(resource)

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Helper: translate a Penn Treebank POS tag into the matching WordNet POS constant
def get_wordnet_pos(treebank_tag):
    """Return the WordNet POS constant selected by the first letter of `treebank_tag`.

    J -> ADJ, V -> VERB, N -> NOUN, R -> ADV; any other (or empty) tag -> NOUN.
    """
    first_letter_to_pos = {
        'J': nltk.corpus.wordnet.ADJ,
        'V': nltk.corpus.wordnet.VERB,
        'N': nltk.corpus.wordnet.NOUN,
        'R': nltk.corpus.wordnet.ADV,
    }
    return first_letter_to_pos.get(treebank_tag[:1], nltk.corpus.wordnet.NOUN)

# Lemmatize a text using POS-aware WordNet lemmatization
def lemmatize_text(text):
    """Tokenize `text`, POS-tag the tokens, lemmatize each one and re-join with single spaces."""
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    return ' '.join(
        lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        for token, tag in tagged_tokens
    )

# Hand-maintained mapping from plural phrases to the singular form they should be counted under
consolidation_map = dict([
    ('digital games', 'digital game'),
    ('digital literacies', 'digital literacy'),
    # Add more terms as needed
])

# Rewrite every phrase listed in consolidation_map to its consolidated form
def consolidate_terms(text):
    """Return `text` with each key of `consolidation_map` replaced by its value (plain substring replacement)."""
    consolidated = text
    for plural_phrase in consolidation_map:
        consolidated = consolidated.replace(plural_phrase, consolidation_map[plural_phrase])
    return consolidated

# Normalise the abstracts: missing values become '', everything is a lowercase string
df['Abstract'] = df['Abstract'].fillna('').astype(str).str.lower()

# Lemmatize first, then fold the phrases listed in consolidation_map
df['Processed_Abstract'] = df['Abstract'].map(
    lambda abstract: consolidate_terms(lemmatize_text(abstract))
)

# Unigram document-term counts; X.T @ X gives the term-by-term co-occurrence matrix.
# NOTE(review): the matrix is built from all abstracts, not only those mentioning
# 'digital', although the plot title says "with Digital" - confirm the intent.
vectorizer = CountVectorizer(ngram_range=(1, 1), stop_words='english')
X = vectorizer.fit_transform(df['Processed_Abstract'])
features = vectorizer.get_feature_names_out()
co_occurrence_matrix = X.T @ X

# Dense, labelled version of the matrix
co_occurrence_df = pd.DataFrame(co_occurrence_matrix.toarray(), index=features, columns=features)

# Restrict to the 20 terms with the largest column totals
top_terms = co_occurrence_df.sum(axis=0).nlargest(20).index
filtered_co_occurrence_df = co_occurrence_df.loc[top_terms, top_terms]

# Heatmap with integer annotations
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(filtered_co_occurrence_df, annot=True, fmt='d', cmap='viridis', ax=ax)
ax.set_title('Co-occurrence Matrix of Top 20 Consolidated Terms with Digital')
plt.show()


## 2. Network Graph
Create a network graph where nodes represent terms and edges represent co-occurrences. This can help visualize how terms cluster around 'digital'.

In [None]:
import networkx as nx

# Create a graph from the co-occurrence matrix: one node per term, one edge per non-zero cell
G = nx.from_pandas_adjacency(filtered_matrix)

# The matrix diagonal (a term "co-occurring" with itself) turns into self-loops,
# which carry no co-occurrence information and only clutter the drawing.
G.remove_edges_from(list(nx.selfloop_edges(G)))

# Force-directed layout; a fixed seed makes the figure reproducible across runs
pos = nx.spring_layout(G, k=0.1, seed=42)

plt.figure(figsize=(16, 12))
nx.draw(G, pos, with_labels=True, node_size=4000, node_color='skyblue', font_size=10, edge_color='gray')
plt.title('Network Graph of Top Terms Co-occurring with Digital')
plt.show()


# Part III. Thematic analysis

+ Topic Modeling: Use techniques such as Latent Dirichlet Allocation (LDA) to identify prevailing topics within the abstracts. Analyze how topics related to 'digital' emerge, evolve, or decline over the years.
+ Temporal Topic Trends: For topics strongly associated with 'digital', plot their prevalence over time to observe shifts in focus areas.

In [None]:
# Data for the thematic analysis: data4.csv
# Original author note: "6 articles from 2024 > 2023"
# NOTE(review): presumably six articles dated 2024 were re-labelled as 2023 in this file - confirm

import pandas as pd

url="https://raw.githubusercontent.com/MK316/Workingpapers/main/DMETA/data/data4.csv"

# Replace the working DataFrame with data4.csv and preview it
df = pd.read_csv(url, encoding="utf-8")
df.head()

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt

# Make every lemmatized abstract a string (missing values become '')
df['Abstract_lemmatized'] = df['Abstract_lemmatized'].fillna('').astype(str)

# Documents and their publication years, as parallel arrays
text_data = df['Abstract_lemmatized'].values
years = df['Year'].values

# Bag-of-words document-term matrix with English stopwords removed
vectorizer = CountVectorizer(stop_words='english')
doc_term_matrix = vectorizer.fit_transform(text_data)

# Fit the LDA topic model; random_state is fixed so the topics are reproducible
num_topics = 10  # Adjust the number of topics as needed
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42).fit(doc_term_matrix)


# Print the highest-weighted words of every topic in a fitted topic model
def display_topics(model, feature_names, no_top_words):
    """Print a "Topic i:" header and the `no_top_words` top words for each row of `model.components_`.

    Words are listed from highest to lowest weight, separated by spaces.
    `feature_names` is indexed by the column positions of `model.components_`.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries
        ranked_indices = topic.argsort()[::-1][:no_top_words]
        print(f"Topic {topic_idx}:")
        print(" ".join(feature_names[i] for i in ranked_indices))

# Show the ten highest-weighted words of every topic
no_top_words = 10
display_topics(lda, vectorizer.get_feature_names_out(), no_top_words)

# Per-document topic weights, one column per topic, with the document's year in front
topic_weights = lda.transform(doc_term_matrix)
topic_columns = [f'Topic {i}' for i in range(num_topics)]
topic_over_time = pd.DataFrame(topic_weights, columns=topic_columns)
topic_over_time.insert(0, 'Year', years)

# Average weight of each topic within every year
topic_over_time = topic_over_time.groupby('Year').mean().reset_index()

# One line per topic showing its yearly mean weight
fig, ax = plt.subplots(figsize=(15, 10))
for column in topic_columns:
    ax.plot(topic_over_time['Year'], topic_over_time[column], label=column)

ax.set_title('Temporal Topic Trends')
ax.set_xlabel('Year')
ax.set_ylabel('Mean Topic Weight')
ax.legend()
plt.show()


Remove Less Relevant Topics: If some topics are not as important or relevant, consider removing them from the plot. This will reduce clutter.

Aggregate Data: Instead of plotting every single year, you could aggregate the data over a larger span, such as 2-5 years, to smooth out fluctuations and make trends clearer.

Focus on Major Topics: Instead of showing all topics, focus on a few major ones that are most relevant to your research question.

Interactive Plot: Consider creating an interactive plot that allows you to hover over lines to see more details or to toggle the visibility of individual topics.

Separate Plots: Create separate plots for each topic or group of related topics.

Highlight Digital Topics: If you're specifically interested in topics related to 'digital', you could highlight these lines and use a subdued color for unrelated topics.

Use Mean/median: Rather than plotting individual topic weights, consider plotting the mean or median topic weight across all topics for each year to get a sense of the overall trend.

Annotations: Annotate specific points of interest, like peaks or changes in trends, to draw attention to them.

This code will plot lines for topics related to 'digital' in brighter colors with a thicker line, while other topics will be plotted in light grey. You can adjust the digital_related_topics list to include the topics you're most interested in.

In [None]:
# topic_over_time holds one row per year and one 'Topic i' column per topic
fig, ax = plt.subplots(figsize=(15, 10))

# Topic numbers treated as related to 'digital' (example selection; adjust after inspecting the topics)
digital_related_topics = [0, 2, 4, 7]
background_topics = [i for i in range(num_topics) if i not in digital_related_topics]

# Highlighted topics: default colour cycle, thicker lines, listed in the legend
for topic in digital_related_topics:
    ax.plot(topic_over_time['Year'], topic_over_time[f'Topic {topic}'],
            label=f'Topic {topic}', linewidth=2)

# Remaining topics: faint grey context lines without legend entries
for topic in background_topics:
    ax.plot(topic_over_time['Year'], topic_over_time[f'Topic {topic}'],
            color='lightgrey', alpha=0.5)

ax.set_title('Temporal Topic Trends Related to Digital')
ax.set_xlabel('Year')
ax.set_ylabel('Mean Topic Weight')
ax.legend(title='Topic Number')
plt.show()


In [None]:
# Print the top words of the digital-related topics only
def display_topic_keywords(lda_model, feature_names, num_top_words):
    """Print "Topic i:" and its `num_top_words` highest-weighted words, comma-separated.

    Only topics whose index is listed in the module-level `digital_related_topics`
    are printed; all other topics are skipped.
    """
    for topic_idx, topic in enumerate(lda_model.components_):
        if topic_idx not in digital_related_topics:
            continue
        # argsort is ascending, so reverse it and keep the leading entries
        ranked_indices = topic.argsort()[::-1][:num_top_words]
        print(f"Topic {topic_idx}:")
        print(", ".join(feature_names[i] for i in ranked_indices))

# Vocabulary of the CountVectorizer that produced the LDA input (index i = column i of lda.components_)
feature_names = vectorizer.get_feature_names_out()

# How many top words to list per topic
num_top_words = 20

# Print the keywords of the digital-related topics
display_topic_keywords(lda_model=lda, feature_names=feature_names, num_top_words=num_top_words)


In [None]:
# Unique words

# Assuming lda is your fitted LDA model and vectorizer is your CountVectorizer

def get_unique_keywords(lda_model, feature_names, num_keywords=20):
    """Return, per topic, the top keywords that appear in no other topic's top list.

    For every row of `lda_model.components_` the `num_keywords` highest-weighted
    entries of `feature_names` are collected; the result maps each topic index
    to the set of its keywords that are absent from all other topics' sets.
    """
    # Top keywords of every topic, as sets
    keyword_sets = {}
    for topic_idx, topic in enumerate(lda_model.components_):
        ranked_indices = topic.argsort()[::-1][:num_keywords]
        keyword_sets[topic_idx] = {feature_names[i] for i in ranked_indices}

    # Remove from each topic everything that also occurs in any other topic
    unique_keywords = {}
    for topic_idx, keywords in keyword_sets.items():
        shared_elsewhere = set()
        for other_idx, other_keywords in keyword_sets.items():
            if other_idx != topic_idx:
                shared_elsewhere |= other_keywords
        unique_keywords[topic_idx] = keywords - shared_elsewhere

    return unique_keywords

# Get the unique keywords
unique_keywords_per_topic = get_unique_keywords(lda, feature_names)

# Display the unique keywords for the specified topics.
# Each topic's keywords are a set, whose iteration order is not guaranteed to be
# the same between runs; sorting keeps the printed output reproducible.
for topic in digital_related_topics:
    print(f"Topic {topic}:")
    print(", ".join(sorted(unique_keywords_per_topic[topic])))
    print("\n")


## Sample size in each year to weight

In [None]:
# Calculate the number of documents for each year
document_counts = df.groupby('Year').size()

# Rebuild the per-year mean topic weights from the per-document weights
# (`topic_weights`, `years` from the LDA cell) instead of dividing
# `topic_over_time` in place. The in-place version divided the values again on
# every re-run of this cell; recomputing makes the cell idempotent while giving
# the same result as a single run of the original.
topic_columns = [f'Topic {topic}' for topic in range(num_topics)]
yearly_mean_weights = (
    pd.DataFrame(topic_weights, columns=topic_columns)
    .assign(Year=years)
    .groupby('Year')
    .mean()
)

# Divide each year's mean weights by that year's document count (aligned on Year).
# NOTE(review): the values are already per-year means, so this second division
# rescales them by sample size rather than weighting them - confirm this is intended.
topic_over_time = yearly_mean_weights.div(document_counts, axis=0).reset_index()

# Plot the rescaled trends for the digital-related topics
plt.figure(figsize=(15, 10))
for topic in digital_related_topics:
    plt.plot(topic_over_time['Year'], topic_over_time[f'Topic {topic}'], label=f'Topic {topic}')

plt.title('Weighted Temporal Topic Trends Related to Digital')
plt.xlabel('Year')
plt.ylabel('Weighted Mean Topic Weight')
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Flag documents whose lemmatized abstract mentions 'digital'.
# case=False also catches capitalised occurrences ("Digital"); the lemmatization
# cells of this notebook do not lowercase the text.
# na=False treats a missing abstract as "no mention" and keeps the column boolean.
df['Mentions_Digital'] = df['Abstract_lemmatized'].str.contains('digital', case=False, na=False)

# Share of documents mentioning 'digital' in each year (mean of a boolean column)
digital_proportion_per_year = df.groupby('Year')['Mentions_Digital'].mean()

# Plot the yearly proportion as a bold black line
plt.figure(figsize=(15, 10))
plt.plot(digital_proportion_per_year.index, digital_proportion_per_year.values, label='Proportion of "Digital" Mentions', color='black', linewidth=2.5)

# Overlay the topic trends for the digital-related topics (values as currently stored in topic_over_time)
for topic in digital_related_topics:
    plt.plot(topic_over_time['Year'], topic_over_time[f'Topic {topic}'], label=f'Topic {topic}')

plt.title('Trend of "Digital" Mentions and Related Topics Over Time')
plt.xlabel('Year')
plt.ylabel('Proportion / Weight')
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Number of documents per year that mention 'digital' (sum of the boolean flag)
digital_count_per_year = df.groupby('Year')['Mentions_Digital'].sum()

# Left axis: topic values; right axis: raw mention counts
fig, ax1 = plt.subplots(figsize=(15, 10))
ax2 = ax1.twinx()

# Topic lines on the primary (left) y-axis
for topic in digital_related_topics:
    topic_label = f'Topic {topic}'
    ax1.plot(topic_over_time['Year'], topic_over_time[topic_label], label=topic_label)

# Raw counts on the secondary (right) y-axis, drawn as a dashed black line
ax2.plot(digital_count_per_year.index, digital_count_per_year.values,
         label='Raw Count of "Digital" Mentions',
         color='black', linestyle='--', linewidth=2)

# Axis labels and title
ax1.set_title('Digital Mentions and Topic Proportions Over Time')
ax1.set_xlabel('Year')
ax1.set_ylabel('Topic Proportion')
ax2.set_ylabel('Raw Count of "Digital" Mentions', color='black')

# Each axis gets its own legend, placed in opposite corners
ax1_legend = ax1.legend(loc='upper left', title='Topic Number')
ax2_legend = ax2.legend(loc='upper right', title='Digital Mentions')

# Move the right-hand axis label outwards
ax2.yaxis.set_label_coords(1.1, 0.5)

# Show the plot
plt.show()
