# Analysis of TikTok data

In [None]:
import pandas as pd

In [None]:
# Load the detailed TikTok video export and print a summary of its columns
# (names, non-null counts and dtypes) as a first sanity check.
dataset = pd.read_csv("data_csv/old_tiktok_videos_detailed.csv")
dataset.info()

In [None]:
# Preview the first rows of the loaded dataset.
dataset.head()

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter
import gensim
from gensim import corpora, models
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
from wordcloud import WordCloud


In [None]:

# Download necessary NLTK data.
# 'punkt_tab' is the tokenizer resource word_tokenize loads on NLTK >= 3.8.2
# (it replaced the pickled 'punkt' models); without it word_tokenize raises
# LookupError. Older releases that do not know the package only report an
# error message for it, so requesting both names is safe.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Load your data
df = pd.read_csv('data_csv/old_tiktok_videos_detailed.csv')  # Replace with your file path

# Text cleaning function
# Lazily-filled cache so the stop-word list is read and the lemmatizer is
# created once, instead of once per caption.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        # Build both before storing either, so a failed lookup (e.g. missing
        # NLTK data) leaves the cache empty and the next call retries.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _CLEAN_TEXT_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _CLEAN_TEXT_RESOURCES['stop_words'], _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(text):
    """
    Normalize a caption into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase; strip URLs and @mentions; drop the '#' sign
    (the hashtag word itself is kept); remove every character that is not an
    ASCII letter or whitespace; tokenize; drop English stop words; lemmatize.

    Args:
        text: The raw caption. Anything that is not a ``str`` (e.g. NaN for a
            missing caption) is treated as empty.

    Returns:
        The cleaned caption, or "" for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs and mentions; strip '#' but keep the hashtag word
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)

    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    stop_words, lemmatizer = _clean_text_resources()

    # Tokenize, drop stop words, then lemmatize what is left
    tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]

    return ' '.join(tokens)

# Apply cleaning to captions
df['cleaned_caption'] = df['caption'].apply(clean_text)

# Create tokens for LDA (one token list per caption)
texts = [text.split() for text in df['cleaned_caption']]

# Create dictionary (token <-> id mapping) and bag-of-words corpus
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Perform LDA
num_topics = 5  # You can adjust this based on your data
# A fixed random_state makes the fitted topics reproducible between runs;
# without it every execution can yield different topics and topic numbering.
lda_model = models.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

# Display topics
print("LDA Topics:")
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

# Visualize topics (optional but helpful)
# NOTE(review): Jupyter only renders the value of a cell's last expression; if
# more code follows in the same cell, this visualization will not be shown
# unless the call is moved to the end of a cell or passed to display(...).
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

# Supplementary method: N-gram analysis
def get_ngrams(text, n=2):
    """Tokenize ``text`` and return all of its word n-grams as a list.

    ``n`` is the n-gram size (2 for bigrams, 3 for trigrams, ...).
    """
    return [*ngrams(word_tokenize(text), n)]

# Generate and display most common bigrams and trigrams.
# N-grams are built caption by caption: running them over one joined string
# would also count word sequences that straddle two neighbouring captions,
# which never actually occur together in any caption.
captions = df['cleaned_caption'].tolist()
bigrams = [gram for caption in captions for gram in get_ngrams(caption, 2)]
trigrams = [gram for caption in captions for gram in get_ngrams(caption, 3)]

print("\nMost common bigrams:")
print(Counter(bigrams).most_common(10))

print("\nMost common trigrams:")
print(Counter(trigrams).most_common(10))

# Generate a word cloud for visual exploration (word frequencies do not
# depend on caption boundaries, so the joined text is fine here)
all_text = ' '.join(captions)
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of TikTok Captions')
plt.show()

# Key Word in Context (KWIC) function
def kwic(text, keyword, window=5):
    """
    Key Word in Context: list the surroundings of every exact match of
    ``keyword`` among the tokens of ``text``.

    Each result is the matching token together with up to ``window`` tokens
    on either side (clipped at the start and end of the text), joined by
    single spaces. Returns an empty list when the keyword does not occur.
    """
    words = word_tokenize(text)
    total = len(words)

    return [
        ' '.join(words[max(0, pos - window):min(total, pos + window + 1)])
        for pos, word in enumerate(words)
        if word == keyword
    ]

# Example: Find contexts for key migration-related terms
migration_terms = ['uk', 'visa', 'work', 'life', 'nurse', 'ghana']
for term in migration_terms:
    # Gather the context windows for this term across all cleaned captions
    contexts = [
        snippet
        for caption in df['cleaned_caption']
        for snippet in kwic(caption, term)
    ]

    # Terms that never occur are skipped without printing a header
    if not contexts:
        continue

    print(f"\nContexts for '{term}':")
    for position, context in enumerate(contexts[:3], start=1):  # Show first 3 examples
        print(f"  {position}. {context}")

In [None]:
#  import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
import seaborn as sns
from gensim.models import CoherenceModel
# from textblob import TextBlob

# Assuming you've already run the previous code and have these variables
# df, lda_model, corpus, dictionary, texts

# Step 1: Refine the topic model by finding the optimal number of topics
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):
    """
    Train one LDA model per candidate topic count and score it with c_v coherence.

    Candidate topic counts are ``range(start, limit, step)``, so ``limit`` is
    exclusive. Every model is trained with 15 passes and random_state=42.

    Returns:
        (model_list, coherence_values): two parallel lists, ordered by
        candidate topic count; both are empty when the range is empty.
    """
    scored_models = []

    for topic_count in range(start, limit, step):
        candidate = models.LdaModel(
            corpus,
            num_topics=topic_count,
            id2word=dictionary,
            passes=15,
            random_state=42,
        )
        scorer = CoherenceModel(
            model=candidate,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v',
        )
        scored_models.append((candidate, scorer.get_coherence()))

    model_list = [candidate for candidate, _ in scored_models]
    coherence_values = [score for _, score in scored_models]
    return model_list, coherence_values

# Evaluate candidate topic counts 2..9 (limit is exclusive)
start, limit, step = 2, 10, 1
model_list, coherence_values = compute_coherence_values(
    dictionary, corpus, texts, limit, start=start, step=step
)

# Plot coherence scores against the number of topics
x = range(start, limit, step)
plt.figure(figsize=(10, 5))
plt.plot(x, coherence_values)
plt.title("Coherence Scores by Number of Topics")
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score")
plt.show()

# Keep the model whose coherence score is highest
best_index = np.argmax(coherence_values)
optimal_num_topics = x[best_index]
best_lda_model = model_list[best_index]

print(f"Optimal number of topics: {optimal_num_topics}")
print(f"Best coherence score: {coherence_values[best_index]:.4f}")

# Step 2: Assign descriptive labels to each topic
def format_topics_sentences(ldamodel, corpus, texts):
    """
    Assign each document its dominant topic and return the result as a dataframe.

    Args:
        ldamodel: Topic model; ``ldamodel[corpus]`` must yield, per document,
            a list of ``(topic_id, probability)`` pairs, and
            ``ldamodel.show_topic(topic_id)`` a list of ``(word, weight)`` pairs.
        corpus: The bag-of-words corpus the model is applied to.
        texts: One text per document, in corpus order (list or Series).

    Returns:
        A DataFrame with one row per document and the columns
        'Dominant_Topic', 'Perc_Contribution' (rounded to 4 decimals) and
        'Topic_Keywords' (comma-separated top words of the dominant topic),
        followed by one column holding ``texts``. That last column keeps the
        Series name when ``texts`` is a named Series, otherwise it is labelled 0.
    """
    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    rows = []

    for doc_topics in ldamodel[corpus]:
        if not doc_topics:
            # Keep one row per document so rows stay aligned with `texts`.
            rows.append((float('nan'), float('nan'), ""))
            continue

        # max() returns the first maximal pair, the same pick as taking the
        # head of a stable descending sort.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        rows.append((topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0
    # (and copied the whole frame on every call before that).
    sent_topics_df = pd.DataFrame(
        rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the dataframe; the index is reset so the texts line
    # up with the rows positionally even if `texts` carries a custom index.
    contents = pd.Series(texts).reset_index(drop=True)
    return pd.concat([sent_topics_df, contents], axis=1)

# Create topic assignment dataframe
df_topic_sents_keywords = format_topics_sentences(best_lda_model, corpus, df['cleaned_caption'])

# Merge with original dataframe.
# Only the three topic columns are taken: the helper's output also carries the
# caption text, and df already has 'cleaned_caption', so concatenating
# everything would create a duplicate column label and make
# df_merged['cleaned_caption'] return a two-column DataFrame.
topic_columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
df_merged = pd.concat([df, df_topic_sents_keywords[topic_columns]], axis=1)

# Manually assign descriptive names based on topic keywords
# NOTE(review): these names describe one particular 5-topic model; re-check
# them against the printed topic keywords whenever the model is refitted.
topic_names = {
    0: "Career Advancement & Nursing Process",
    1: "Education & Exam Preparation",
    2: "Financial Benefits & Opportunities",
    3: "Daily Life & Cultural Experience",
    4: "Nostalgia & Connection to Home"
}

# Add topic names to dataframe. The selected model may have more topics than
# were named by hand; those get a generic "Topic <id>" label instead of NaN so
# their videos are not silently dropped from the statistics below.
topic_labels = {
    topic_id: topic_names.get(topic_id, f"Topic {topic_id}")
    for topic_id in range(best_lda_model.num_topics)
}
df_merged['Topic_Name'] = df_merged['Dominant_Topic'].map(topic_labels)

# Step 3: Calculate topic prevalence
topic_prevalence = df_merged['Topic_Name'].value_counts().reset_index()
topic_prevalence.columns = ['Topic', 'Count']
topic_prevalence['Percentage'] = (topic_prevalence['Count'] / topic_prevalence['Count'].sum()) * 100

print("Topic Prevalence:")
print(topic_prevalence)

# Visualize topic prevalence
plt.figure(figsize=(12, 6))
sns.barplot(x='Percentage', y='Topic', data=topic_prevalence, palette='viridis')
plt.title('Prevalence of Migration Narrative Topics')
plt.xlabel('Percentage of Videos')
plt.tight_layout()
plt.show()

# Step 4: Cross-validate with qualitative analysis
# Sample up to 5 captions from each topic for manual review
for topic in df_merged['Topic_Name'].dropna().unique():
    print(f"\n=== {topic} ===")
    topic_rows = df_merged[df_merged['Topic_Name'] == topic]
    # sample() raises if asked for more rows than exist, so cap the size for
    # topics with fewer than 5 captions.
    sample_captions = topic_rows.sample(min(5, len(topic_rows)), random_state=42)['caption']
    for i, caption in enumerate(sample_captions):
        print(f"{i+1}. {caption}")

# Step 5: Connect topics to engagement metrics
# First, compute an engagement score (likes + shares, used as a proxy)
df_merged['engagement'] = df_merged['likes'] + df_merged['shares']

# Calculate average engagement by topic
topic_engagement = df_merged.groupby('Topic_Name')['engagement'].agg(['mean', 'std', 'count']).reset_index()
topic_engagement.columns = ['Topic', 'Mean_Engagement', 'Std_Engagement', 'Count']

print("\nEngagement by Topic:")
print(topic_engagement)

# Visualize engagement by topic
plt.figure(figsize=(12, 6))
sns.barplot(x='Mean_Engagement', y='Topic', data=topic_engagement, palette='magma')
plt.title('Average Engagement by Topic')
plt.xlabel('Average Engagement (Likes + Shares)')
plt.tight_layout()
plt.show()

# Step 6: Perform sentiment analysis for RQ2
def get_sentiment(text):
    """
    Get sentiment polarity using TextBlob.

    Args:
        text: The text to score.

    Returns:
        ``TextBlob(text).sentiment.polarity`` for a non-blank string, or 0 for
        non-string or empty/whitespace-only input.
    """
    if not isinstance(text, str) or not text.strip():
        return 0

    # Imported here because the notebook's module-level textblob import is
    # commented out, which left TextBlob undefined (NameError) at this point.
    # Placed after the guard so blank input never needs the package.
    from textblob import TextBlob

    return TextBlob(text).sentiment.polarity

# Apply sentiment analysis: score every cleaned caption with get_sentiment
# and store the result in a new 'sentiment' column
df_merged['sentiment'] = df_merged['cleaned_caption'].apply(get_sentiment)

# Categorize sentiment
def categorize_sentiment(score):
    """Map a polarity score to 'Positive' (> 0.1), 'Negative' (< -0.1) or 'Neutral'.

    Scores of exactly 0.1 or -0.1 fall in the 'Neutral' band.
    """
    return 'Positive' if score > 0.1 else ('Negative' if score < -0.1 else 'Neutral')

# Turn each polarity score into a Positive / Negative / Neutral label
df_merged['sentiment_label'] = df_merged['sentiment'].apply(categorize_sentiment)

# Analyze sentiment by topic: row-normalized shares, expressed in percent
sentiment_by_topic = pd.crosstab(
    index=df_merged['Topic_Name'],
    columns=df_merged['sentiment_label'],
    normalize='index',
).mul(100)
print("\nSentiment Distribution by Topic (%):")
print(sentiment_by_topic)

# Visualize sentiment by topic as stacked bars
sentiment_by_topic.plot(kind='bar', stacked=True, figsize=(12, 6), colormap='coolwarm')
plt.title('Sentiment Distribution by Topic')
plt.xlabel('Topic')
plt.ylabel('Percentage')
plt.xticks(rotation=45)
plt.legend(title='Sentiment')
plt.tight_layout()
plt.show()

# Correlation between sentiment and engagement
correlation = df_merged['sentiment'].corr(df_merged['engagement'])
print(f"\nCorrelation between sentiment and engagement: {correlation:.3f}")

# Step 7: Prepare for RQ3 (Interpretation of migration bias)
# Share of videos whose caption was labelled Positive
positive_rows = df_merged[df_merged['sentiment_label'] == 'Positive']
positive_content = len(positive_rows) / len(df_merged) * 100
print(f"\nPercentage of videos with positive sentiment: {positive_content:.1f}%")

# Mean engagement for each sentiment label
sentiment_engagement = df_merged.groupby('sentiment_label')['engagement'].mean()
print("\nAverage engagement by sentiment:")
print(sentiment_engagement)

# Save the enriched dataframe for further analysis
df_merged.to_csv('tiktok_data_enriched.csv', index=False)
print("\nEnriched data saved to 'tiktok_data_enriched.csv'")