This notebook prepares the data on child objectification for further processing and performs statistical tests.

**GET DATA**

In [None]:
import pandas as pd

In [None]:
# Read the dataset into a DataFrame.
# The path is a placeholder and must be replaced with the real CSV location before running.
file_path = "/INSERT-DATA-PATH.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

**EXPLORE DATA**

In [None]:
# Preview the first five rows to sanity-check the load
df.head(n=5)

In [None]:
# List the available column names (later cells read 'sex', 'videoid', 'videoauthor', 'commid' and 'text')
df.columns

DATA OVERVIEW

In [None]:
# Identifier columns whose distinct values are counted
columns_of_interest = ['videoid', 'videoauthor', 'commid']

# Distinct values per 'sex' group, and across the whole dataset
unique_counts = df.groupby('sex')[columns_of_interest].nunique()
overall_counts = df[columns_of_interest].nunique()

# Each group's distinct count as a share (in %) of the overall distinct count
percentage_unique = unique_counts.div(overall_counts) * 100

# Counts first, then one "<column>_percentage" column per identifier (same order)
summary_df = unique_counts.join(percentage_unique.add_suffix("_percentage"))

print(summary_df)


Clean Data First: remove comments that consist only of @-mentions (emoji-only comments are kept)

In [None]:
pip install emoji

In [None]:
import pandas as pd
import re
import emoji

# Ensure the "text" column holds strings and missing comments become "".
# The fill must happen BEFORE the cast: casting first turns NaN into the literal
# string "nan", which fillna() can no longer recognise as missing.
df["text"] = df["text"].fillna("").astype(str)

# Function to check if a comment consists only of mentions
def is_only_mentions(text):
    """Tell whether ``text`` is made up of @-mentions and nothing else.

    Surrounding whitespace is ignored and mentions may be separated by
    optional whitespace. Always returns a plain ``bool``.
    """
    # One "@name" token followed by any number of further "@name" tokens
    mention_pattern = r"^@\w+(\s*@\w+)*$"
    candidate = text.strip()
    match = re.fullmatch(mention_pattern, candidate)
    return match is not None

# Function to check if a comment consists only of emojis
def is_only_emojis(text):
    """Return True when ``text`` (ignoring surrounding whitespace) is made of emojis only.

    Empty or whitespace-only text is NOT emoji-only: ``all()`` over an empty
    string is vacuously True, so the empty case is rejected explicitly.
    Emojis are removed as whole sequences, so emojis that span several code
    points are recognised instead of failing a per-character lookup.
    """
    stripped = text.strip()
    if not stripped:
        return False
    # Nothing may remain once every emoji has been removed
    return emoji.replace_emoji(stripped, replace="") == ""

# Flag mention-only comments once; the same mask is used for counting and filtering
mention_only_mask = df["text"].apply(is_only_mentions).astype(bool)

# Count mention-only comments (guard against an empty dataset)
total_comments = df.shape[0]
mention_only_count = mention_only_mask.sum()
mention_only_percentage = (mention_only_count / total_comments) * 100 if total_comments else 0.0

print(f"Number of mention-only comments: {mention_only_count} ({mention_only_percentage:.2f}%)")

# Remove mention-only comments but keep emoji-only ones.
# .copy() gives an independent frame, so adding columns later does not
# trigger SettingWithCopyWarning on a filtered view.
df = df[~mention_only_mask].copy()

# Save cleaned dataset
output_path = ""  # TODO: set the destination CSV path
if output_path:
    df.to_csv(output_path, index=False, encoding="utf-8")
else:
    # An empty path cannot be opened for writing, so say so instead of failing inside to_csv
    print("output_path is empty - cleaned dataset was not saved.")



**Frequency Plots** (for general appearance and objectification)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import emoji
import re

# Function to remove emojis from text
def remove_emojis(text):
    """Return ``text`` with all emojis removed.

    Emojis are stripped as whole sequences rather than one code point at a
    time. A per-character filter drops the visible glyph but leaves
    invisible joiners / variation selectors behind, and those leftovers are
    not whitespace, so they would survive a later ``strip()`` check.

    Empty or missing input yields an empty string.
    """
    if not text:
        return ""
    return emoji.replace_emoji(text, replace="")


# Extract matched words, remove emojis, and filter out excluded words
def clean_matches(column, frame=None, excluded=None):
    """Collect the cleaned, lower-cased match words stored in ``column``.

    Parameters
    ----------
    column : str
        Name of a column whose cells hold collections of matched words.
    frame : DataFrame, optional
        Frame to read from; defaults to the notebook-level ``df``.
    excluded : iterable of str, optional
        Words to drop (compared case-insensitively). Defaults to the
        notebook-level ``excluded_words`` if it exists, otherwise nothing is
        excluded. The notebook never defines that name itself, so an
        unconditional lookup raises NameError on a fresh kernel.

    Returns
    -------
    list of str
        Every remaining word, in row order, with emojis removed.
    """
    import ast

    if frame is None:
        frame = df
    if excluded is None:
        excluded = globals().get("excluded_words", ())
    excluded = {str(word).lower() for word in excluded}

    words = []
    for cell in frame[column].dropna():
        if isinstance(cell, str):
            # A list that went through CSV comes back as its repr, e.g. "['a', 'b']".
            # Iterating such a string would yield single characters, so parse it;
            # anything unparsable is treated as one word.
            try:
                parsed = ast.literal_eval(cell)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                parsed = cell
            cell = parsed if isinstance(parsed, (list, tuple, set)) else [cell]
        for word in cell:
            cleaned = remove_emojis(str(word)).lower()
            # Skip words that were emoji-only (now blank) or explicitly excluded
            if cleaned.strip() and cleaned not in excluded:
                words.append(cleaned)
    return words

# Cleaned match words for both categories
appearance_words = clean_matches("appearance_matches")
negative_appearance_words = clean_matches("negative_appearance_matches")

# Count occurrences
appearance_counter = Counter(appearance_words)
negative_appearance_counter = Counter(negative_appearance_words)

# Get 10 most common words/expressions
appearance_top10 = appearance_counter.most_common(10)
negative_appearance_top10 = negative_appearance_counter.most_common(10)

# Split the (word, count) pairs into parallel tuples for plotting.
# zip(*[]) yields nothing to unpack, so fall back to empty tuples when a
# category has no matches at all.
appearance_labels, appearance_counts = zip(*appearance_top10) if appearance_top10 else ((), ())
negative_labels, negative_counts = zip(*negative_appearance_top10) if negative_appearance_top10 else ((), ())

# Both panels share one x-axis limit, which is computed right before plotting,
# so no per-panel limits are defined here.

# Plotting
fig, axes = plt.subplots(1, 2, figsize=(13, 5), dpi=120, gridspec_kw={'wspace': 0.1})  # Slightly wider figure

# Bar colors, one per panel
color1 = "#4C72B0"  # Muted blue
color2 = "#B22222"  # Deep red

# Function to plot bars with labels positioned after the bars
def plot_bars(ax, labels, counts, title, color, max_x):
    """Draw a horizontal bar chart on ``ax`` with each label written beside its bar.

    ``labels`` and ``counts`` are parallel sequences; both are reversed before
    drawing, so the first entry ends up at the highest y position. The x-axis
    runs from 0 to ``max_x`` and the y-axis ticks are hidden because the labels
    are drawn as text inside the plot area. Returns ``None``.
    """
    positions = range(len(labels))
    bars = ax.barh(positions, counts[::-1], color=color, height=0.6)

    # Axis cosmetics
    ax.set_xlim(0, max_x)
    ax.set_title(title, fontsize=14, fontweight="bold", pad=12)
    ax.set_xlabel("Frequency", fontsize=12, labelpad=8)
    ax.set_yticks([])
    ax.tick_params(axis="y", left=False)

    # Write every label slightly to the right of the end of its bar
    label_gap = max_x * 0.02
    reversed_labels = labels[::-1]
    for row, (bar, name) in enumerate(zip(bars, reversed_labels)):
        text_x = bar.get_width() + label_gap
        ax.text(text_x, row, name, va='center', ha='left', fontsize=10, color="black")

# Modify labels before plotting: show "he looks like" as the gender-neutral "(s)he looks like".
# Both label lists are rewritten from their OWN contents and under their correct names.
label_overrides = {"he looks like": "(s)he looks like"}
negative_labels = [label_overrides.get(label, label) for label in negative_labels]
appearance_labels = [label_overrides.get(label, label) for label in appearance_labels]

# Shared x-axis limit for both panels: 20% beyond the largest count in either dataset.
# default=1 keeps the limit valid when neither category has any matches.
max_x = max([*appearance_counts, *negative_counts], default=1) * 1.2

# Plot both bar charts with the same x-axis limits
plot_bars(axes[0], appearance_labels, appearance_counts, "General Appearance-Related Words", color1, max_x)
plot_bars(axes[1], negative_labels, negative_counts, "Objectification-Related Words", color2, max_x)

plt.tight_layout()
plt.show()

**Statistical Analysis**

Prepare Data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Build 0/1 indicator columns: does a comment have any (negative) appearance match?
def _has_matches(value):
    """Return 1 if ``value`` holds at least one match, otherwise 0.

    Missing values count as "no match" whether they are ``None``, ``pd.NA`` or
    a float NaN (a bare ``is not None`` test treats NaN as a match). Empty
    collections and the strings "" / "[]" also count as "no match".
    """
    if value is None or value is pd.NA:
        return 0
    if isinstance(value, float) and value != value:  # NaN is the only float unequal to itself
        return 0
    if isinstance(value, str):
        return int(value.strip() not in ("", "[]"))
    try:
        return int(len(value) > 0)
    except TypeError:
        # Unsized but present value: keep treating it as a match
        return 1


# assign() returns a new frame, so this is safe even if df is a filtered view of another frame
df = df.assign(
    has_appearance_matches=df["appearance_matches"].apply(_has_matches),
    has_neg_appearance_matches=df["negative_appearance_matches"].apply(_has_matches),
)

# Get the frequency table
frequency_table_app = pd.crosstab(index=df["has_appearance_matches"], columns="count")

# Print the table
print(frequency_table_app)

frequency_table_app_neg = pd.crosstab(index=df["has_neg_appearance_matches"], columns="count")
print(frequency_table_app_neg)

Group by Video

In [None]:
# One grouping object serves both per-video totals
per_video = df.groupby('videoid')

# Number of comments with an appearance match, per video
appearance_summary = (
    per_video['has_appearance_matches']
    .sum()
    .rename('appearance_match_count')
    .reset_index()
)

# Number of comments with a negative-appearance match, per video
neg_appearance_summary = (
    per_video['has_neg_appearance_matches']
    .sum()
    .rename('neg_appearance_match_count')
    .reset_index()
)

# Combine both totals into one row per video; any gap left by the left join becomes 0
summary_df = pd.merge(appearance_summary, neg_appearance_summary, on='videoid', how='left').fillna(0)

# Display the result
print(summary_df.head())



In [None]:
# Summarise the distribution of both per-video match counts
summary_stats = summary_df.loc[:, ['appearance_match_count', 'neg_appearance_match_count']].describe()
print(summary_stats)


Add Metadata

In [None]:
# Collapse the row-level frame to one metadata row per video.
# 'sex' keeps its first value within each video (assumed constant per video);
# the three engagement columns are summed over all rows of the video.
# NOTE(review): if n_downl / n_likes / n_comms are video-level figures repeated on
# every row, summing multiplies them by the number of rows per video — confirm
# against the data whether "first" or "max" is the intended aggregation.
video_metadata = (
    df.groupby("videoid")
    .agg(
        sex=("sex", "first"),
        n_downl=("n_downl", "sum"),
        n_likes=("n_likes", "sum"),
        n_comms=("n_comms", "sum"),
    )
    .reset_index()
)


In [None]:
# Merge the aggregated metadata with summary_df on 'videoid'.
# Metadata columns left over from an earlier run of this cell are dropped first;
# otherwise a re-run would produce suffixed duplicates (sex_x / sex_y, ...) and
# later lookups of summary_df["sex"] would fail.
metadata_columns = [col for col in video_metadata.columns if col != "videoid"]
summary_df = (
    summary_df
    .drop(columns=metadata_columns, errors="ignore")
    # Both frames hold one row per video, so any duplicated key is a data error
    .merge(video_metadata, on="videoid", how="left", validate="one_to_one")
)

# Check the updated dataframe
print(summary_df.head())


In [None]:
# Keep only videos whose 'sex' value is not "b"
# (presumably the code for "both" — TODO confirm against the codebook)
summary_df = summary_df.loc[summary_df["sex"].ne("b")]

# Verify changes
print(summary_df["sex"].value_counts())

# Write the per-video table to an Excel workbook (via openpyxl), without the index column
output_path = "/content/drive/MyDrive/TikTok_Harm_Paper/summarized_df_for_stats.xlsx"
summary_df.to_excel(output_path, index=False, engine="openpyxl")

**Negative Binomial Regression with Interaction Terms**

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Dispersion parameter of the negative binomial family.
# NOTE: a GLM does not estimate alpha; it is held fixed at the value given here.
# 1.0 is the statsmodels default, stated explicitly so the assumption is visible
# (recent statsmodels versions warn when alpha is left implicit — verify for the
# installed version). To estimate alpha from the data, consider smf.negativebinomial.
NB_ALPHA = 1.0

# Right-hand side shared by both models: sex interacted with downloads and with comment counts
INTERACTION_TERMS = "sex * n_downl + sex * n_comms"


def fit_nb_interaction(outcome, data, alpha=NB_ALPHA):
    """Fit a negative binomial GLM of ``outcome`` on the shared interaction terms.

    Parameters
    ----------
    outcome : str
        Name of the count column used as the dependent variable.
    data : DataFrame
        Frame containing ``outcome``, ``sex``, ``n_downl`` and ``n_comms``.
    alpha : float
        Fixed dispersion parameter passed to the negative binomial family.

    Returns
    -------
    The fitted statsmodels results object.
    """
    return smf.glm(
        formula=f"{outcome} ~ {INTERACTION_TERMS}",
        data=data,
        family=sm.families.NegativeBinomial(alpha=alpha),
    ).fit()


# Interaction model for appearance-related comments
model_appearance_interaction = fit_nb_interaction("appearance_match_count", summary_df)

# Interaction model for objectification-related comments
model_objectification_interaction = fit_nb_interaction("neg_appearance_match_count", summary_df)

# Print summaries
print("Negative Binomial Regression for Appearance-Related Comments:")
print(model_appearance_interaction.summary())

print("\nNegative Binomial Regression for Objectification-Related Comments:")
print(model_objectification_interaction.summary())
