In [9]:
# Instagram Influencers Analysis
# Authors: Musa Misto & Neda Mohamed

# Purpose:
# This notebook serves as the main workspace for the CS412 course project on Instagram influencer analysis.
# The project aims to develop machine learning models for two primary tasks:
# 1. Multi-class classification to predict influencer categories based on profile meta-data and recent posts.
# 2. Regression analysis to estimate content popularity (e.g., like_count) using relevant features.
# The goal is to explore the dataset, preprocess the data, build and evaluate models, and document findings.

# Imports

In [43]:
import os
import re
import json
import gzip
import nltk
import numpy as np
import pandas as pd
from pprint import pprint

# For text preprocessing and modeling
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Ensure necessary downloads for nltk stopwords
nltk.download('stopwords')
turkish_stopwords = stopwords.words('turkish')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\itsmm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Reading Data


In [44]:
# We assume this code is running from within `repo/notebooks` or similar. 
# We'll go up one directory and then into `data/training/` to find the training files.

current_dir = os.getcwd()               # e.g., .../repo/notebooks
training_dir = os.path.join(current_dir, "..", "data", "training")

# Define file paths
csv_path = os.path.join(training_dir, "train-classification.csv")
json_path = os.path.join(training_dir, "training-dataset.jsonl.gz")

# If you have additional test files in `repo/data/testing`, you can similarly build their paths:
# testing_dir = os.path.join(current_dir, "..", "data", "testing")
# e.g. test_classification_file = os.path.join(testing_dir, "test-classification-round1.dat")
# e.g. test_regression_file      = os.path.join(testing_dir, "test-regression-round1.jsonl")

# For demonstration, we’ll just name them (change as needed):
test_classification_file = os.path.join(current_dir, "..", "data", "testing", "test-classification-round1.dat")
test_regression_file      = os.path.join(current_dir, "..", "data", "testing", "test-regression-round1.jsonl")

# Read classification CSV => user_id -> category
train_classification_df = pd.read_csv(csv_path)
train_classification_df.rename(columns={'Unnamed: 0': 'user_id', 'label': 'category'}, inplace=True)
train_classification_df["category"] = train_classification_df["category"].apply(str.lower)  # unify label casing

# Build dictionary: username -> category
username2category = train_classification_df.set_index("user_id")["category"].to_dict()

# Read gzipped JSON lines => separate train vs. test by presence in username2category
username2posts_train = {}
username2profile_train = {}
username2posts_test = {}
username2profile_test = {}

with gzip.open(json_path, "rt", encoding="utf-8") as fh:
    for line in fh:
        sample = json.loads(line.strip())
        profile = sample["profile"]
        username = profile.get("username", "")

        # If user is in the labeled set => train data
        if username in username2category:
            username2posts_train[username] = sample["posts"]
            username2profile_train[username] = profile
        else:
            # Otherwise => test or unlabeled data
            username2posts_test[username] = sample["posts"]
            username2profile_test[username] = profile

# Convert profile dicts to DataFrames
train_profile_df = pd.DataFrame(username2profile_train).T.reset_index(drop=True)
test_profile_df = pd.DataFrame(username2profile_test).T.reset_index(drop=True)

# PREPROCESSING & FEATURE ENGINEERING

In [45]:
# A) Basic text cleaning
def preprocess_text(text: str):
    text = text.casefold()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Keep only alphanumeric + Turkish chars
    text = re.sub(r'[^a-zçğıöşü0-9\s#@]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Normalize spacing
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# B) Build a train corpus for TF-IDF (aggregating post captions per user)
train_usernames = []
train_corpus = []
y_train_raw = []  # raw category labels

for uname, posts in username2posts_train.items():
    train_usernames.append(uname)
    cat_label = username2category[uname]
    y_train_raw.append(cat_label)

    # combine captions
    cleaned_captions = []
    for post in posts:
        pc = post.get("caption", "")
        if pc:
            pc_clean = preprocess_text(str(pc))
            if pc_clean:
                cleaned_captions.append(pc_clean)
    user_post_captions = "\n".join(cleaned_captions)
    train_corpus.append(user_post_captions)

# Fit TF-IDF on training corpus
vectorizer = TfidfVectorizer(stop_words=turkish_stopwords, max_features=5000)
vectorizer.fit(train_corpus)

X_train_tfidf = vectorizer.transform(train_corpus)
y_train_final = np.array(y_train_raw)

# C) Additional numeric features from train_profile_df
# Set index to "username" for direct lookups
train_profile_df.set_index("username", inplace=True, drop=False)

follower_counts = []
business_flags = []
private_flags = []
verified_flags = []

for uname in train_usernames:
    if uname in train_profile_df.index:
        rowp = train_profile_df.loc[uname]
        fc = rowp.get("follower_count", 0) or 0
        follower_counts.append(fc)
        business_flags.append(1 if rowp.get("is_business_account", False) else 0)
        private_flags.append(1 if rowp.get("is_private", False) else 0)
        verified_flags.append(1 if rowp.get("is_verified", False) else 0)
    else:
        # fallback if missing
        follower_counts.append(0)
        business_flags.append(0)
        private_flags.append(0)
        verified_flags.append(0)

extra_train_features = np.vstack([
    follower_counts,
    business_flags,
    private_flags,
    verified_flags
]).T

# Merge TF-IDF + numeric
X_train_full = np.hstack([X_train_tfidf.toarray(), extra_train_features])

# TRAIN / VALIDATION SPLIT, MODEL SELECTION, & TRAINING (CLASSIFICATION)

In [47]:
# Necessary Imports
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from scipy.stats import randint

# Train/Validation Split
X_tr, X_val, y_tr, y_val = train_test_split(X_train_full, y_train_final, test_size=0.2, random_state=42)

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_tr_smote, y_tr_smote = smote.fit_resample(X_tr, y_tr)

print(f"Original Training Shape: {X_tr.shape}, {y_tr.shape}")
print(f"After SMOTE: {X_tr_smote.shape}, {y_tr_smote.shape}")

# Define the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define the hyperparameter grid
param_dist = {
    "n_estimators": randint(100, 1000),
    "max_depth": randint(10, 50),
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 10),
    "criterion": ["gini", "entropy"]
}

# Randomized Search for Hyperparameter Optimization
random_search = RandomizedSearchCV(
    estimator=rf_classifier,
    param_distributions=param_dist,
    n_iter=50,
    scoring="accuracy",
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Train the model with SMOTE-augmented data
random_search.fit(X_tr_smote, y_tr_smote)

# Output the best parameters and accuracy
print("Best Parameters:", random_search.best_params_)
print("Best Cross-Validation Accuracy:", random_search.best_score_)

# Best model from RandomizedSearchCV
best_rf_model = random_search.best_estimator_

# Predict on Validation Set
y_val_pred = best_rf_model.predict(X_val)

# Evaluate Performance
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Classification Report:\n", classification_report(y_val, y_val_pred))


Original Training Class Distribution: Counter({np.str_('health and lifestyle'): 406, np.str_('food'): 397, np.str_('tech'): 287, np.str_('entertainment'): 264, np.str_('fashion'): 244, np.str_('travel'): 232, np.str_('art'): 153, np.str_('mom and children'): 115, np.str_('sports'): 86, np.str_('gaming'): 8})
After SMOTEENN Class Distribution: Counter({np.str_('gaming'): 28, np.str_('mom and children'): 26, np.str_('art'): 21, np.str_('tech'): 21, np.str_('sports'): 14, np.str_('entertainment'): 12, np.str_('fashion'): 12, np.str_('food'): 6, np.str_('travel'): 6, np.str_('health and lifestyle'): 3})
Fitting 5 folds for each of 50 candidates, totalling 250 fits




Best Parameters: {'criterion': 'gini', 'max_depth': 46, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 502}
Best Cross-Validation Accuracy: 0.8397701149425287
Validation Accuracy: 0.2568306010928962
Classification Report:
                       precision    recall  f1-score   support

                 art       0.18      0.16      0.17        38
       entertainment       0.25      0.07      0.11        59
             fashion       0.19      0.69      0.30        55
                food       0.89      0.22      0.35       114
              gaming       0.33      0.20      0.25         5
health and lifestyle       0.00      0.00      0.00        96
    mom and children       0.28      0.50      0.36        34
              sports       0.08      0.04      0.05        27
                tech       0.25      0.83      0.38        59
              travel       0.00      0.00      0.00        62

            accuracy                           0.26       549
           macr

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# PREDICT ON TEST DATA FOR CLASSIFICATION & SAVE

In [15]:
test_unames_from_file = []
if os.path.exists(test_classification_file):
    with open(test_classification_file, "rt", encoding="utf-8") as fh:
        for line in fh:
            line_stripped = line.strip()
            # skip header if any
            if line_stripped.lower() == "screenname":
                continue
            test_unames_from_file.append(line_stripped)
else:
    print("\nWarning: Test classification file not found. Using placeholder list.")
    # If file not found, we’ll just use all test user names
    test_unames_from_file = list(username2posts_test.keys())

# Build feature vectors for test
test_captions_corpus = []
for uname in test_unames_from_file:
    # gather their posts from the test dictionary
    posts = []
    if uname in username2posts_test:
        posts = username2posts_test[uname]
    elif uname in username2posts_train:
        # fallback if user unexpectedly in train
        posts = username2posts_train[uname]

    cleaned_captions = []
    for post in posts:
        pc = preprocess_text(str(post.get("caption", "")))
        if pc:
            cleaned_captions.append(pc)
    joined_captions = "\n".join(cleaned_captions)
    test_captions_corpus.append(joined_captions)

X_test_tfidf = vectorizer.transform(test_captions_corpus)

# Extra numeric for test
test_profile_df.set_index("username", inplace=True, drop=False)

follower_counts_test = []
business_flags_test = []
private_flags_test = []
verified_flags_test = []

for uname in test_unames_from_file:
    if uname in test_profile_df.index:
        rowp = test_profile_df.loc[uname]
    elif uname in train_profile_df.index:
        # fallback if found in train profile
        rowp = train_profile_df.loc[uname]
    else:
        rowp = {}
    fc = rowp.get("follower_count", 0) or 0
    follower_counts_test.append(fc)
    business_flags_test.append(1 if rowp.get("is_business_account", False) else 0)
    private_flags_test.append(1 if rowp.get("is_private", False) else 0)
    verified_flags_test.append(1 if rowp.get("is_verified", False) else 0)

extra_test_features = np.vstack([
    follower_counts_test,
    business_flags_test,
    private_flags_test,
    verified_flags_test
]).T

X_test_full = np.hstack([X_test_tfidf.toarray(), extra_test_features])
test_predictions = best_clf.predict(X_test_full)

# Save classification results
classification_output = {uname: pred for uname, pred in zip(test_unames_from_file, test_predictions)}

with open("classification_output.json", "w", encoding="utf-8") as f:
    json.dump(classification_output, f, indent=4, ensure_ascii=False)

print("\nSaved classification_output.json with predicted categories.")


Saved classification_output.json with predicted categories.


# REGRESSION FOR LIKE_COUNT

In [16]:
# We'll train on *per-post* data from the training set. 
# Evaluate in log space: MSE of log(like_count + 1).

def log_mse_like_counts(y_true, y_pred):
    # Evaluate MSE in log scale => MSE of log(1 + y)
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    log_y_true = np.log1p(y_true)
    log_y_pred = np.log1p(y_pred)
    return np.mean((log_y_true - log_y_pred) ** 2)

# Build a per-post training DataFrame
train_posts_data = []
for uname, posts in username2posts_train.items():
    # possible user aggregates
    user_followers = 0
    if uname in train_profile_df.index:
        rowp = train_profile_df.loc[uname]
        user_followers = rowp.get("follower_count", 0) or 0
    
    # average likes of user
    user_likes_arr = [p.get("like_count", 0) or 0 for p in posts]
    avg_user_like = float(np.mean(user_likes_arr)) if len(user_likes_arr) > 0 else 0.0

    for post in posts:
        post_likes = post.get("like_count", 0) or 0
        post_caption = preprocess_text(str(post.get("caption", "")))

        train_posts_data.append({
            "username": uname,
            "follower_count": user_followers,
            "avg_user_like": avg_user_like,
            "current_like": post_likes,
            "caption": post_caption
        })

df_reg_train = pd.DataFrame(train_posts_data)

# TF-IDF for post-level caption
vectorizer_reg = TfidfVectorizer(stop_words=turkish_stopwords, max_features=300)
vectorizer_reg.fit(df_reg_train["caption"])

X_cap_tfidf = vectorizer_reg.transform(df_reg_train["caption"]).toarray()
X_num = df_reg_train[["follower_count", "avg_user_like"]].to_numpy()
X_reg_full = np.hstack([X_cap_tfidf, X_num])

y_reg_full = df_reg_train["current_like"].to_numpy()

# Train/Val split for regression
Xr_tr, Xr_val, yr_tr, yr_val = train_test_split(X_reg_full, y_reg_full, test_size=0.2, random_state=42)

# Let's do a RandomForestRegressor with some basic hyperparameter tuning
rf_reg = RandomForestRegressor(random_state=42)
rf_reg_params = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5]
}
grid_reg = GridSearchCV(rf_reg, rf_reg_params, cv=3, n_jobs=-1)
grid_reg.fit(Xr_tr, yr_tr)
best_reg = grid_reg.best_estimator_

yr_val_pred = best_reg.predict(Xr_val)
mse_log_val = log_mse_like_counts(yr_val, yr_val_pred)
print("\n=== Regression Model ===")
print("Best Params:", grid_reg.best_params_)
print(f"Validation MSE (log scale): {mse_log_val:.4f}")

KeyboardInterrupt: 

# TEST REGRESSION PREDICTIONS & SAVE

In [None]:
output_list = []
if os.path.exists(test_regression_file):
    with open(test_regression_file, "r", encoding="utf-8") as fh:
        for line in fh:
            sample = json.loads(line.strip())
            uname_test = sample.get("username", "")

            # get user-level aggregates
            user_followers = 0
            if uname_test in train_profile_df.index:
                rowp = train_profile_df.loc[uname_test]
                user_followers = rowp.get("follower_count", 0) or 0
            elif uname_test in test_profile_df.index:
                rowp = test_profile_df.loc[uname_test]
                user_followers = rowp.get("follower_count", 0) or 0

            # find average user likes if we have them in train or test
            # fallback is 0 if unknown
            user_posts_for_avg = []
            if uname_test in username2posts_train:
                user_posts_for_avg = username2posts_train[uname_test]
            elif uname_test in username2posts_test:
                user_posts_for_avg = username2posts_test[uname_test]
            likes_list = [p.get("like_count", 0) or 0 for p in user_posts_for_avg]
            avg_user_like = float(np.mean(likes_list)) if len(likes_list) > 0 else 0.0

            # preprocess the caption if present
            test_caption = preprocess_text(str(sample.get("caption", "")))
            Xc_tfidf = vectorizer_reg.transform([test_caption]).toarray()
            Xn = np.array([[user_followers, avg_user_like]])
            X_test_instance = np.hstack([Xc_tfidf, Xn])

            predicted_like = best_reg.predict(X_test_instance)[0]
            sample["like_count"] = int(predicted_like)
            output_list.append(sample)
else:
    print("\nNo test-regression file found; skipping regression prediction.")

# Save regression output
with open("test_regression_output.jsonl", "w", encoding="utf-8") as out_f:
    for item in output_list:
        out_f.write(json.dumps(item) + "\n")

print("\nSaved test_regression_output.jsonl with predicted like counts.")
print("Done. Classification and Regression outputs are ready.")