# Import libraries

In [32]:
import pandas as pd
from imgix import UrlBuilder
from google.cloud import vision
from google.oauth2 import service_account
import os
import lzma
import json
from sklearn.decomposition import LatentDirichletAllocation
from gensim.matutils import Sparse2Corpus
from gensim.models import CoherenceModel
from sklearn.feature_extraction.text import CountVectorizer


# Import Google Cloud credentials

In [43]:
# Path to the Google Cloud service-account key file (JSON).
# NOTE(review): this is an absolute, machine-specific path — adjust it before
# running the notebook on another machine.
key_path = "/Users/qianlou/Documents/GitHub/Social-Media-Analysis/forward-cacao-420716-6d4b68655e6d.json"

# Build service-account credentials from the key file
credentials = service_account.Credentials.from_service_account_file(key_path)
# Create the Vision API client; detect_labels() reads this module-level `client`
client = vision.ImageAnnotatorClient(credentials=credentials)

def detect_labels(image_path):
    """Return the label descriptions Google Vision reports for an image file.

    The file at ``image_path`` is read as raw bytes and sent to the
    module-level Vision ``client`` for label detection. The result is a
    list with one description string per returned label annotation.
    """
    with open(image_path, 'rb') as handle:
        raw_bytes = handle.read()

    annotation = client.label_detection(image=vision.Image(content=raw_bytes))

    descriptions = []
    for entry in annotation.label_annotations:
        descriptions.append(entry.description)
    return descriptions

def find_key(data, target_key):
    """Recursively search nested JSON-like data for ``target_key``.

    Dicts are scanned in insertion order and lists in element order,
    depth-first. The value of the first match is returned.

    Args:
        data: Any combination of nested dicts and lists; other types are
            treated as leaves and never match.
        target_key: The dictionary key to look for.

    Returns:
        The value stored under the first occurrence of ``target_key``, or
        ``None`` if the key is not present. A key whose value is ``None``
        cannot be told apart from a missing key.
    """
    if isinstance(data, dict):
        for key, value in data.items():
            if key == target_key:
                return value
            result = find_key(value, target_key)
            # Compare against None rather than truthiness so that legitimate
            # falsy values (0, '', False, empty containers) are not discarded.
            if result is not None:
                return result
    elif isinstance(data, list):
        for item in data:
            result = find_key(item, target_key)
            if result is not None:
                return result
    return None

# Import Data

Dataset source (Kaggle): https://www.kaggle.com/datasets/thecoderenroute/instagram-posts-dataset?resource=download

In [44]:
data_folder_path = '/Users/qianlou/Documents/GitHub/Social-Media-Analysis/Data'

data = []  # One dict per post folder that yielded at least one image label

# Walk the dataset tree. A post folder is recognised by its name, which is
# split into <profile_name>_<follower_count>_<post_id>_<likes>_<comments>.
for root, dirs, files in os.walk(data_folder_path):
    folder_name = os.path.basename(root)
    # Split from the right so underscores inside the profile name stay intact.
    parts = folder_name.rsplit('_', 4)

    if len(parts) < 5:
        continue  # Not a post folder (too few underscore-separated fields)

    # rsplit(..., 4) yields at most five parts, so this unpacking is exact.
    # The numeric-looking fields are kept as strings, as parsed from the name.
    profile_name, follower_count, post_id, likes, comments = parts

    all_labels = []
    json_data = {}

    for file in files:
        file_path = os.path.join(root, file)
        if file.endswith('.json.xz'):
            # Decode explicitly as UTF-8 instead of relying on the locale's
            # default text encoding, which differs between platforms.
            # If a folder holds several metadata files, the last one read wins.
            with lzma.open(file_path, 'rt', encoding='utf-8') as json_file:
                json_data = json.load(json_file)
        elif file.lower().endswith(('.png', '.jpg', '.jpeg')):
            all_labels.extend(detect_labels(file_path))

    # Folders without any detected label are not recorded.
    if not all_labels:
        continue

    # Comment count from the post metadata, when present; None otherwise.
    engagement_score = find_key(json_data, 'edge_media_to_comment')
    engagement_count = (
        engagement_score.get('count')
        if isinstance(engagement_score, dict)
        else None
    )

    data.append({
        'profile_name': profile_name,
        'follower_count': follower_count,
        'post_id': post_id,
        'likes': likes,
        'comments': comments,
        'labels': ' '.join(all_labels),
        'engagement_score': engagement_count,
    })

# Convert the list of per-post records into a DataFrame
df = pd.DataFrame(data)

In [64]:
# Persist the assembled DataFrame to data.csv; index=False leaves out the row index column
df.to_csv('data.csv', index=False)

In [65]:
# Preview the first 10 rows
df.head(10)

Unnamed: 0,profile_name,follower_count,post_id,likes,comments,labels,engagement_score
1489,zomato,809822,3048876535670217534,22392,177,Food Ingredient Recipe Natural foods Cuisine D...,177
1828,zomato,809822,3030021979666495337,17536,256,Rectangle Font Magenta Screenshot Parallel Num...,256
1235,zomato,809822,3017722431480959941,6778,134,Font Screenshot Terrestrial plant Rectangle Nu...,134
1819,zomato,809822,3058297827754329035,346067,1603,Bird Organism Beak Adaptation Terrestrial anim...,1603
1205,zomato,809822,3053234851745688757,30434,142,Smile Product Happy Kitchen appliance Tablewar...,142
674,zomato,809822,3062737605779199560,6904,629,Font Magenta Brand Graphics Event,629
84,zomato,809822,3055384952337369278,33433,467,Watch Automotive lighting Nickel Font Clock Je...,467
1770,zomato,809822,3068460342803822988,7996,80,Font Event Poster Advertising Brand Graphics L...,80
82,zomato,809822,3037776198347562779,18995,234,Font Magenta Rectangle Brand Event Graphics Ca...,234
1154,zomato,809822,3009013430304069934,49085,658,Plant Property Light Building Architecture Yel...,658


# Topic Modeling

In [None]:
# NOTE(review): `data_df` was not defined anywhere in the visible notebook.
# Reloading the data.csv file written by the df.to_csv(...) cell is assumed to
# be what was intended — confirm. Reading from CSV also gives a fresh
# 0..n-1 index and lets pandas infer numeric dtypes for the count columns.
data_df = pd.read_csv('data.csv')

# Convert image labels to a matrix of token counts.
# Tokens appearing in more than 95% of posts or in fewer than 2 posts are dropped.
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vectorizer.fit_transform(data_df['labels'])

# Fit LDA model; `topics` holds one row of topic weights per post
lda = LatentDirichletAllocation(n_components=10, random_state=0)  # Adjust n_components as needed
topics = lda.fit_transform(X)

# Display top 25 words for each topic
features = vectorizer.get_feature_names_out()
n_top_words = 25


def top_words(t):
    """Return the ``n_top_words`` highest-weighted vocabulary terms of topic row ``t``."""
    # Use the array's own argsort (ascending), reversed, so this cell does not
    # depend on a separate `np` import, which the notebook never performs.
    return [features[i] for i in t.argsort()[::-1][:n_top_words]]


topic_words = [top_words(t) for t in lda.components_]
print([' '.join(t) for t in topic_words])


# Engagement Analysis

In [None]:
# Add topic distributions as features to the DataFrame.
# Each topic column is assigned as a whole array, i.e. by row position, so the
# result does not depend on data_df's index labels (the previous row-by-row
# `.loc[index, ...]` assignment matched on labels and was slow).
# Row i of `topics` corresponds to row i of data_df because the count matrix
# was built from data_df['labels'].
topic_columns = [f'topic_{i}' for i in range(lda.components_.shape[0])]
for position, column in enumerate(topic_columns):
    data_df[column] = topics[:, position]

# Make sure the sort below is numeric: comment counts parsed from folder names
# are strings, and strings would sort lexicographically ('9' > '100').
# This is a no-op when the column is already numeric.
data_df['comments'] = pd.to_numeric(data_df['comments'])

# Sort data based on engagement and find quartiles
data_df.sort_values('comments', ascending=False, inplace=True)
# NOTE: with fewer than 4 rows quartile_size is 0 and both means below are NaN.
quartile_size = len(data_df) // 4
top_quartile = data_df.head(quartile_size)
bottom_quartile = data_df.tail(quartile_size)

# Compare average topic weights between the top and bottom quartiles
average_top = top_quartile[topic_columns].mean()
average_bottom = bottom_quartile[topic_columns].mean()
comparison_df = pd.DataFrame({'Top Quartile': average_top, 'Bottom Quartile': average_bottom})
print(comparison_df)
