# Import libraries

In [32]:
import pandas as pd
from imgix import UrlBuilder
from google.cloud import vision
from google.oauth2 import service_account
import os
import lzma
import json
from sklearn.decomposition import LatentDirichletAllocation
from gensim.matutils import Sparse2Corpus
from gensim.models import CoherenceModel
from sklearn.feature_extraction.text import CountVectorizer


# Import Google Cloud credentials

In [43]:
# Path to your service account key file.
# GOOGLE_APPLICATION_CREDENTIALS takes precedence when it is set, so the
# notebook can run on machines other than the author's; otherwise the original
# local path is used unchanged.
key_path = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/qianlou/Documents/GitHub/Social-Media-Analysis/forward-cacao-420716-6d4b68655e6d.json",
)

# Build explicit credentials from the key file rather than relying on ambient defaults.
credentials = service_account.Credentials.from_service_account_file(key_path)
# Initialize the Vision API client used by detect_labels().
client = vision.ImageAnnotatorClient(credentials=credentials)

def detect_labels(image_path):
    """Return the label descriptions Google Vision reports for one image file.

    The file at ``image_path`` is read as raw bytes and sent to the
    module-level ``client`` for label detection. The result is a list with the
    ``description`` of every returned label annotation, in response order.
    """
    with open(image_path, 'rb') as handle:
        image_bytes = handle.read()

    annotation_response = client.label_detection(
        image=vision.Image(content=image_bytes)
    )

    descriptions = []
    for annotation in annotation_response.label_annotations:
        descriptions.append(annotation.description)
    return descriptions

def find_key(data, target_key):
    """Recursively search nested JSON data for a key and return its value.

    Dicts are searched depth-first in insertion order and lists in element
    order; the first value stored under ``target_key`` is returned.

    Args:
        data: Parsed JSON (any nesting of dicts, lists and scalars).
        target_key: The dictionary key to look for.

    Returns:
        The value stored under the first matching key, or ``None`` when the
        key does not occur anywhere in ``data``.
    """
    if isinstance(data, dict):
        for key, value in data.items():
            if key == target_key:
                return value
            result = find_key(value, target_key)
            # Compare against None rather than truthiness so that a found
            # value of 0, '', [] or {} is not mistaken for "not found".
            if result is not None:
                return result
    elif isinstance(data, list):
        for item in data:
            result = find_key(item, target_key)
            if result is not None:
                return result
    return None

# Import Data

Dataset source: https://www.kaggle.com/datasets/thecoderenroute/instagram-posts-dataset?resource=download

In [44]:
data_folder_path = '/Users/qianlou/Documents/GitHub/Social-Media-Analysis/Data'

# One record per post folder that yielded at least one image label.
data = []

# Visit every folder below the data root. The code reads each folder name as
# five underscore-separated fields: profile, followers, post id, likes, comments.
for root, dirs, files in os.walk(data_folder_path):
    # Split from the right so underscores inside the profile name stay intact.
    name_fields = os.path.basename(root).rsplit('_', 4)
    if len(name_fields) < 5:
        continue  # Not a post folder (too few fields in the name)

    profile_name, follower_count, post_id, likes, comments = name_fields

    post_metadata = {}
    post_labels = []

    for filename in files:
        full_path = os.path.join(root, filename)
        if filename.endswith('.json.xz'):
            # xz-compressed JSON metadata; a later file overwrites an earlier one.
            with lzma.open(full_path, 'rt') as compressed_json:
                post_metadata = json.load(compressed_json)
        elif filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            # Labels from every image in the folder are pooled together.
            post_labels += detect_labels(full_path)

    # Pull the comment-edge object out of the metadata and read its 'count', if any.
    comment_edge = find_key(post_metadata, 'edge_media_to_comment')
    if isinstance(comment_edge, dict) and 'count' in comment_edge:
        engagement_count = comment_edge['count']
    else:
        engagement_count = None

    if not post_labels:
        continue  # Folders without any detected labels are left out

    data.append({
        'profile_name': profile_name,
        'follower_count': follower_count,
        'post_id': post_id,
        'likes': likes,
        'comments': comments,
        'labels': ' '.join(post_labels),
        'engagement_score': engagement_count
    })

# Turn the collected records into a DataFrame
df = pd.DataFrame(data)

In [64]:
# Persist the assembled post table to data.csv, omitting the index column.
df.to_csv('data.csv', index=False)

In [66]:
# Print a summary of the DataFrame: columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1968 entries, 1489 to 201
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   profile_name      1968 non-null   object
 1   follower_count    1968 non-null   object
 2   post_id           1968 non-null   object
 3   likes             1968 non-null   object
 4   comments          1968 non-null   object
 5   labels            1968 non-null   object
 6   engagement_score  1968 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 123.0+ KB


# Topic Modeling

In [None]:
# Convert image labels to a matrix of token counts
# NOTE(review): `data_df` is not assigned in the cells shown here (the loading
# cell builds `df`) - confirm it is bound before running this cell.
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vectorizer.fit_transform(data_df['labels'])

# Fit LDA model; `topics` holds one row of topic weights per input document.
lda = LatentDirichletAllocation(n_components=10, random_state=0)  # Adjust n_components as needed
topics = lda.fit_transform(X)

# Display top 25 words for each topic
features = vectorizer.get_feature_names_out()


def top_words(t):
    """Return the 25 highest-weighted vocabulary terms for one topic row."""
    # Use the array's own argsort so the cell does not depend on the name
    # `np`, which the visible import cell never binds. The [:-26:-1] slice
    # walks the ascending order backwards, giving the 25 largest weights first.
    return [features[i] for i in t.argsort()[:-26:-1]]


topic_words = [top_words(t) for t in lda.components_]
print([' '.join(t) for t in topic_words])


# Engagement Analysis

In [None]:
# Add topic distributions as features to the DataFrame.
# `topics` has one row per row of data_df, in the order the labels were
# vectorized, so the columns are assigned by position. Label-based
# `.loc[i, ...]` with an enumerate counter would misalign (or append) rows
# whenever data_df's index is not exactly 0..n-1.
topic_columns = [f'topic_{i}' for i in range(lda.components_.shape[0])]
for position, column in enumerate(topic_columns):
    data_df[column] = topics[:, position]

# Sort data based on engagement and find quartiles.
# The comment counts come from folder names and may be stored as strings;
# sorting on a numeric key avoids lexicographic order ('9' > '10').
# Values that cannot be parsed become NaN and are placed last.
data_df.sort_values(
    'comments',
    ascending=False,
    inplace=True,
    key=lambda counts: pd.to_numeric(counts, errors='coerce'),
)
quartile_size = len(data_df) // 4
top_quartile = data_df.head(quartile_size)
bottom_quartile = data_df.tail(quartile_size)

# Compare average topic weights between the top and bottom quartiles
average_top = top_quartile[topic_columns].mean()
average_bottom = bottom_quartile[topic_columns].mean()
comparison_df = pd.DataFrame({'Top Quartile': average_top, 'Bottom Quartile': average_bottom})
print(comparison_df)
