# Import Dependencies

In [82]:
import numpy as np
import pandas as pd
import gzip
import json
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline


import os

from pprint import pprint

In [83]:
#@title Turkish StopWords

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
turkish_stopwords = stopwords.words('turkish')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\itsmm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Influencer Category Classification



1.   Read Data
2.   Preprocess Data
3.   Prepare Model
4.   Predict Test Data
4.   Save outputs



In [95]:
# Step 1: Define File Paths Dynamically (keep the same)
...

# Step 2: Load Data Dynamically
train_classification_df = pd.read_csv(train_classification_path)
train_classification_df = train_classification_df.rename(columns={'Unnamed: 0': 'user_id', 'label': 'category'})

# Check for duplicates
duplicate_users = train_classification_df[train_classification_df['user_id'].duplicated(keep=False)]
if not duplicate_users.empty:
    print("\nWarning: Found duplicate user_ids:")
    print(duplicate_users.sort_values('user_id'))
    
    # Option 1: Keep first occurrence
    train_classification_df = train_classification_df.drop_duplicates(subset='user_id', keep='first')
    print("\nKept first occurrence of duplicates.")
    
    # Option 2: Could also check if duplicates have different categories
    duplicate_categories = duplicate_users.groupby('user_id')['category'].nunique() > 1
    if duplicate_categories.any():
        print("\nWarning: Some duplicates have different categories:")
        for user_id in duplicate_categories[duplicate_categories].index:
            user_entries = duplicate_users[duplicate_users['user_id'] == user_id]
            print(f"\nUser {user_id} has multiple categories:")
            print(user_entries[['user_id', 'category']])

# Step 3: Unify Labels
train_classification_df["category"] = train_classification_df["category"].apply(str.lower)

# Step 4: Create User-to-Category Mapping
username2_category = train_classification_df.set_index("user_id").to_dict()["category"]

# Step 5: Verify Output
print("\nFirst few rows of the training classification DataFrame:")
print(train_classification_df.head())

print("\nDataset Statistics:")
print(f"Total unique users: {len(username2_category)}")
print("\nCategory distribution:")
print(train_classification_df['category'].value_counts())


First few rows of the training classification DataFrame:
           user_id          category
0    taskirancemal  mom and children
1    tam_kararinda              food
2         spart4nn              food
3  sosyalyiyiciler              food
4  sonaydizdarahad  mom and children

Dataset Statistics:
Total unique users: 2742

Category distribution:
category
food                    511
health and lifestyle    503
tech                    346
entertainment           323
fashion                 299
travel                  294
art                     191
mom and children        149
sports                  113
gaming                   13
Name: count, dtype: int64


In [85]:
# stats about the labels
train_classification_df.groupby("category").count()

Unnamed: 0_level_0,user_id
category,Unnamed: 1_level_1
art,209
entertainment,351
fashion,316
food,531
gaming,20
health and lifestyle,547
mom and children,152
sports,133
tech,371
travel,316


In [86]:
username2_category["sonaydizdarahad"]

'mom and children'

In [96]:
# Step 1: Define File Paths Dynamically
# Get the current notebook directory
current_notebook_dir = os.getcwd()

# Get the repo directory (assuming notebooks are inside the "notebooks" folder)
repo_dir = os.path.abspath(os.path.join(current_notebook_dir, '..'))

# Get the data directory
data_dir = os.path.join(repo_dir, 'data')

# Get the training directory
training_dir = os.path.join(data_dir, 'training')

# File path for 'training-dataset.jsonl.gz'
train_data_path = os.path.join(training_dir, 'training-dataset.jsonl.gz')

# Step 2: Initialize Dictionaries for Data
username2posts_train = dict()
username2profile_train = dict()

username2posts_test = dict()
username2profile_test = dict()
# Step 1 and 2 remain the same...

# Initialize sets to track post IDs
train_post_ids = set()
test_post_ids = set()
duplicate_posts = []

# Step 3: Process Data with duplicate checking
with gzip.open(train_data_path, "rt", encoding="utf-8") as fh:
    for line in fh:
        sample = json.loads(line)

        profile = sample["profile"]
        username = profile.get("username", "").strip()
        if not username:
            continue

        # Check posts for duplicates
        filtered_posts = []
        for post in sample["posts"]:
            post_id = post.get("id")
            if post_id:
                if username in username2_category:
                    if post_id in train_post_ids:
                        duplicate_posts.append({
                            'id': post_id,
                            'username': username,
                            'timestamp': post.get('timestamp'),
                            'dataset': 'train'
                        })
                        continue
                    train_post_ids.add(post_id)
                else:
                    if post_id in test_post_ids:
                        duplicate_posts.append({
                            'id': post_id,
                            'username': username,
                            'timestamp': post.get('timestamp'),
                            'dataset': 'test'
                        })
                        continue
                    test_post_ids.add(post_id)
            filtered_posts.append(post)

        if username in username2_category:
            username2posts_train[username] = filtered_posts
            username2profile_train[username] = profile
        else:
            username2posts_test[username] = filtered_posts
            username2profile_test[username] = profile

# Print statistics
print("\nDataset Statistics:")
print(f"Number of Training Users: {len(username2posts_train)}")
print(f"Number of Testing Users: {len(username2posts_test)}")
print(f"Total unique training posts: {len(train_post_ids)}")
print(f"Total unique testing posts: {len(test_post_ids)}")

if duplicate_posts:
    print("\nFound duplicate posts:")
    print(f"Total duplicates: {len(duplicate_posts)}")
    print("\nFirst few duplicates:")
    for dup in duplicate_posts[:5]:
        print(f"Post ID: {dup['id']}")
        print(f"Username: {dup['username']}")
        print(f"Timestamp: {dup['timestamp']}")
        print(f"Dataset: {dup['dataset']}")
        print("---")

# Also check for cross-dataset duplicates
cross_duplicates = train_post_ids.intersection(test_post_ids)
if cross_duplicates:
    print("\nWarning: Found posts that appear in both train and test sets!")
    print(f"Number of cross-dataset duplicates: {len(cross_duplicates)}")
    print("First few cross-dataset duplicate IDs:", list(cross_duplicates)[:5])


Dataset Statistics:
Number of Training Users: 2741
Number of Testing Users: 2674
Total unique training posts: 94824
Total unique testing posts: 92478


In [88]:
# Profile Dataframe
train_profile_df = pd.DataFrame(username2profile_train).T.reset_index(drop=True)
test_profile_df = pd.DataFrame(username2profile_test).T.reset_index(drop=True)

train_profile_df.head(1)

Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,business_category_name,overall_category_name,category_enum,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64
0,deparmedya,3170700063,Depar Medya,#mediaplanning #mediabuying #sosyalmedya,Local business,,1167,192,True,False,...,,,LOCAL,False,False,https://instagram.fsaw2-3.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [89]:
test_profile_df.head(1)

Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,business_category_name,overall_category_name,category_enum,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64
0,beyazyakaliyiz,8634457436,Selam Beyaz Yakalı,Beyaz yakalıların dünyasına hoşgeldiniz 😀😀😀,Personal blog,,1265,665,True,False,...,,,PERSONAL_BLOG,False,False,https://instagram.fist6-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [90]:
# Step 5: Verify Output
print("Columns in train_classification_df:")
print(train_classification_df.columns.tolist())

print("\nFirst few rows of the training classification DataFrame:")
print(train_classification_df.head())

print("\nLabel distribution:")
print(train_classification_df['category'].value_counts())

Columns in train_classification_df:
['user_id', 'category']

First few rows of the training classification DataFrame:
           user_id          category
0    taskirancemal  mom and children
1    tam_kararinda              food
2         spart4nn              food
3  sosyalyiyiciler              food
4  sonaydizdarahad  mom and children

Label distribution:
category
health and lifestyle    547
food                    531
tech                    371
entertainment           351
travel                  316
fashion                 316
art                     209
mom and children        152
sports                  133
gaming                   20
Name: count, dtype: int64


In [91]:
# Profile Dataframe
print("Columns in train_profile_df:")
print(train_profile_df.columns.tolist())

print("\nFirst few rows of train_profile_df:")
print(train_profile_df.head(1))

print("\nColumns in test_profile_df:")
print(test_profile_df.columns.tolist())

print("\nFirst few rows of test_profile_df:")
print(test_profile_df.head(1))

Columns in train_profile_df:
['username', 'id', 'full_name', 'biography', 'category_name', 'post_count', 'follower_count', 'following_count', 'is_business_account', 'is_private', 'is_verified', 'highlight_reel_count', 'bio_links', 'entities', 'ai_agent_type', 'fb_profile_biolink', 'restricted_by_viewer', 'country_block', 'eimu_id', 'external_url', 'fbid', 'has_clips', 'hide_like_and_view_counts', 'is_professional_account', 'is_supervision_enabled', 'is_guardian_of_viewer', 'is_supervised_by_viewer', 'is_supervised_user', 'is_embeds_disabled', 'is_joined_recently', 'business_address_json', 'business_contact_method', 'business_email', 'business_phone_number', 'business_category_name', 'overall_category_name', 'category_enum', 'is_verified_by_mv4b', 'is_regulated_c18', 'profile_pic_url', 'should_show_category', 'should_show_public_contacts', 'show_account_transparency_details', 'profile_picture_base64']

First few rows of train_profile_df:
     username          id    full_name  \
0  depa

In [92]:
# List of columns to drop
columns_to_drop = [
    'highlight_reel_count', 'entities', 'ai_agent_type', 'fb_profile_biolink',
    'restricted_by_viewer', 'country_block', 'eimu_id', 'external_url', 'fbid',
    'has_clips', 'hide_like_and_view_counts', 'is_supervision_enabled',
    'is_guardian_of_viewer', 'is_supervised_by_viewer', 'is_supervised_user',
    'is_embeds_disabled', 'is_joined_recently', 'business_address_json',
    'business_contact_method', 'business_email', 'business_phone_number',
    'category_enum', 'is_verified_by_mv4b', 'is_regulated_c18',
    'profile_pic_url', 'should_show_category', 'should_show_public_contacts',
    'show_account_transparency_details', 'profile_picture_base64'
]

# Dropping specified columns from train_profile_df
train_profile_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Dropping specified columns from test_profile_df
test_profile_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Verify columns in train_profile_df after dropping
print("Columns in train_profile_df after dropping:")
print(train_profile_df.columns.tolist())

# Verify columns in test_profile_df after dropping
print("\nColumns in test_profile_df after dropping:")
print(test_profile_df.columns.tolist())

Columns in train_profile_df after dropping:
['username', 'id', 'full_name', 'biography', 'category_name', 'post_count', 'follower_count', 'following_count', 'is_business_account', 'is_private', 'is_verified', 'bio_links', 'is_professional_account', 'business_category_name', 'overall_category_name']

Columns in test_profile_df after dropping:
['username', 'id', 'full_name', 'biography', 'category_name', 'post_count', 'follower_count', 'following_count', 'is_business_account', 'is_private', 'is_verified', 'bio_links', 'is_professional_account', 'business_category_name', 'overall_category_name']


In [93]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Preprocessing Function
import re

def preprocess_text(text: str):
    text = text.casefold()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^a-zçğıöşü0-9\s#@]+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Build Corpus and Labels
corpus = []
train_usernames = []

for username, posts in username2posts_train.items():
    train_usernames.append(username)
    cleaned_captions = []
    for post in posts:
        post_caption = post.get("caption", "")
        if post_caption is None:
            continue
        post_caption = preprocess_text(post_caption)
        if post_caption != "":
            cleaned_captions.append(post_caption)
    user_post_captions = "\n".join(cleaned_captions)
    corpus.append(user_post_captions)

y_train = [username2_category.get(uname, "NA") for uname in train_usernames]

# Incorporate Metadata
records = []
for idx, username in enumerate(train_usernames):
    profile = username2profile_train.get(username, {})
    biography_text = str(profile.get("biography", "") or "")
    follower_count = profile.get("follower_count", 0)
    following_count = profile.get("following_count", 0)
    post_count = profile.get("post_count", 0) if profile.get("post_count") else 0
    row_dict = {
        "username": username,
        "captions": corpus[idx],
        "biography": biography_text,
        "follower_count": follower_count,
        "following_count": following_count,
        "post_count": post_count,
        "label": y_train[idx]
    }
    records.append(row_dict)

train_full_df = pd.DataFrame(records)

# Define Pipeline Components
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer

# Train-Validation Split
X = train_full_df.drop(columns=["label"])
y = train_full_df["label"]

x_train_df, x_val_df, y_train_labels, y_val_labels = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    stratify=y, 
    random_state=42
)

# Define ColumnTransformer
numeric_features = ["follower_count", "following_count", "post_count"]
text_features_caps = "captions"
text_features_bio = "biography"

preprocessor = ColumnTransformer(
    transformers=[
        ("captions_tfidf", TfidfVectorizer(
             stop_words=turkish_stopwords, 
             max_features=5000
         ), text_features_caps),
        ("bio_tfidf", TfidfVectorizer(
             stop_words=turkish_stopwords, 
             max_features=5000
         ), text_features_bio),
        ("numeric_scaler", MinMaxScaler(), numeric_features)
    ],
    remainder="drop"
)

# Build ImbPipeline (SMOTE + Classifier)
pipeline = ImbPipeline([
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42, sampling_strategy="auto")),
    ("clf", LogisticRegression(
        class_weight='balanced',
        solver='liblinear',
        random_state=42
    ))
])

# Define a param grid
param_grid = {
    "preprocessor__captions_tfidf__ngram_range": [(1,1), (1,2)],
    "preprocessor__bio_tfidf__ngram_range": [(1,1), (1,2)],
    "clf__C": [0.01, 0.1, 1, 10]
}

# Initialize and fit GridSearchCV
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(x_train_df, y_train_labels)

print("Best Params:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)

best_pipeline = grid_search.best_estimator_

# **New Section: Evaluate on Training Data**
# Predict on the training data
y_train_pred = best_pipeline.predict(x_train_df)

# Calculate training accuracy
train_acc = accuracy_score(y_train_labels, y_train_pred)
print("Training Accuracy:", train_acc)

# Generate and print the training classification report
print("\nTraining Classification Report:\n",
      classification_report(y_train_labels, y_train_pred, zero_division=0))

# **End of New Section**

# Predict on the validation data
y_val_pred = best_pipeline.predict(x_val_df)

# Calculate validation accuracy
val_acc = accuracy_score(y_val_labels, y_val_pred)
print("Validation Accuracy:", val_acc)

# Generate and print the validation classification report
print("\nClassification Report:\n",
      classification_report(y_val_labels, y_val_pred, zero_division=0))

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best Params: {'clf__C': 10, 'preprocessor__bio_tfidf__ngram_range': (1, 1), 'preprocessor__captions_tfidf__ngram_range': (1, 1)}
Best CV Accuracy: 0.6452599388379205
Training Accuracy: 1.0

Training Classification Report:
                       precision    recall  f1-score   support

                 art       1.00      1.00      1.00       158
       entertainment       1.00      1.00      1.00       274
             fashion       1.00      1.00      1.00       251
                food       1.00      1.00      1.00       417
              gaming       1.00      1.00      1.00        12
health and lifestyle       1.00      1.00      1.00       422
    mom and children       1.00      1.00      1.00       120
              sports       1.00      1.00      1.00        98
                tech       1.00      1.00      1.00       290
              travel       1.00      1.00      1.00       247

            accuracy            

In [94]:
# ------------------- Imports -------------------
import re
import numpy as np
import pandas as pd

# scikit-learn utilities
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

# imblearn
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# For progress bar in cross-validation
from tqdm import tqdm

# ------------------- Enhanced Text Preprocessing -------------------
def advanced_text_preprocessing(text):
    if pd.isna(text) or not isinstance(text, str):
        return ""
    
    text = text.lower()
    
    # Extract hashtags and mentions
    hashtags = re.findall(r'#\w+', text)
    mentions = re.findall(r'@\w+', text)
    
    # Densities
    text_length = len(text)
    hashtag_density = len(hashtags) / (text_length + 1)
    mention_density = len(mentions) / (text_length + 1)
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^a-zçğıöşü0-9\s#@]+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Add density info as extra pseudo-tokens
    density_info = f" hashtag_density_{hashtag_density:.2f} mention_density_{mention_density:.2f}"
    
    # Re-append the extracted hashtags and mentions
    return f"{text} {' '.join(hashtags)} {' '.join(mentions)} {density_info}".strip()

# ------------------- Enhanced Numeric Feature Extraction -------------------
def extract_numeric_features(X):
    # If X is a DataFrame, ensure columns exist
    if isinstance(X, pd.DataFrame):
        cols = ['follower_count', 'following_count', 'post_count']
        if 'account_age_days' in X.columns:
            cols.append('account_age_days')
        
        df = X[cols].copy()
    else:
        # If it's a NumPy array
        df = pd.DataFrame(
            X, 
            columns=['follower_count', 'following_count', 'post_count', 'account_age_days'][:X.shape[1]]
        )
    
    df = df.fillna(0)
    
    df['following_count_safe'] = df['following_count'].replace(0, 1)
    df['follower_count_safe']  = df['follower_count'].replace(0, 1)
    
    df['follower_ratio']   = df['follower_count'] / df['following_count_safe']
    df['post_density']     = df['post_count'] / df['follower_count_safe']
    df['engagement_score'] = np.log1p(df['follower_count']) * np.log1p(df['post_count'])
    
    if 'account_age_days' in df.columns:
        df['activity_ratio'] = df['post_count'] / (df['account_age_days'] + 1)
    else:
        df['activity_ratio'] = df['post_count']
    
    df['follower_growth_rate'] = df['follower_count'] / (df['post_count'] + 1)
    df['relative_engagement']  = (df['follower_count'] * df['post_count']) / (df['following_count_safe'])
    
    # Log transforms
    df['log_followers']  = np.log1p(df['follower_count'])
    df['log_following']  = np.log1p(df['following_count'])
    df['log_posts']      = np.log1p(df['post_count'])
    
    df.drop(['following_count_safe','follower_count_safe'], axis=1, inplace=True)
    
    return df.values

# ------------------- Build Pipeline -------------------
def build_enhanced_pipeline(stopwords_list):
    preprocessor = ColumnTransformer(
        transformers=[
            ('captions_tfidf', 
             TfidfVectorizer(
                 stop_words=stopwords_list,
                 max_features=3000,   
                 ngram_range=(1, 1),  
                 min_df=2,
                 max_df=0.95
             ), 
             'captions_clean'),
            
            ('bio_tfidf', 
             TfidfVectorizer(
                 stop_words=stopwords_list,
                 max_features=2000,   
                 ngram_range=(1, 1),  
                 min_df=2,
                 max_df=0.95
             ), 
             'biography_clean'),
            
            ('numeric', 
             Pipeline([
                 ('feat_eng', FunctionTransformer(extract_numeric_features)),
                 ('scaler', MinMaxScaler())
             ]), 
             ['follower_count', 'following_count', 'post_count']
             # add 'account_age_days' if available
            )
        ],
        remainder='drop'
    )
    
    rf = RandomForestClassifier(
        n_estimators=100,    
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )
    
    gb = GradientBoostingClassifier(
        n_estimators=50,     
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        random_state=42
    )
    
    lr = LogisticRegression(
        C=2.0,
        class_weight='balanced',
        max_iter=1000,
        random_state=42,
        n_jobs=-1
    )
    
    ensemble = VotingClassifier(
        estimators=[
            ('rf', rf),
            ('gb', gb),
            ('lr', lr)
        ],
        voting='soft'
    )
    
    pipeline = ImbPipeline([
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', ensemble)
    ])
    
    return pipeline


# ------------------- Example Usage -------------------
print("Training data info:")
print(x_train_df.info())
print("\nMissing values in training data:")
print(x_train_df.isnull().sum())

# 1) Preprocess text once
x_train_df['captions_clean'] = x_train_df['captions'].apply(advanced_text_preprocessing)
x_train_df['biography_clean'] = x_train_df['biography'].apply(advanced_text_preprocessing)

x_val_df['captions_clean'] = x_val_df['captions'].apply(advanced_text_preprocessing)
x_val_df['biography_clean'] = x_val_df['biography'].apply(advanced_text_preprocessing)

# 2) Convert y_train_labels to NumPy for direct integer indexing
y_train_labels_array = y_train_labels.values

# 3) Build pipeline
enhanced_pipeline = build_enhanced_pipeline(stopwords_list=turkish_stopwords)

# 4) Manual Cross-Validation with tqdm progress bar
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

print("\nPerforming cross-validation with tqdm progress bar...")
for fold_idx, (train_idx, test_idx) in enumerate(
    tqdm(cv.split(x_train_df, y_train_labels_array), 
         total=cv.get_n_splits(), 
         desc='Cross-Validation')
):
    # Split the data
    X_fold_train = x_train_df.iloc[train_idx]
    y_fold_train = y_train_labels_array[train_idx]
    X_fold_test  = x_train_df.iloc[test_idx]
    y_fold_test  = y_train_labels_array[test_idx]
    
    # Fit on this fold
    enhanced_pipeline.fit(X_fold_train, y_fold_train)
    
    # Predict
    preds = enhanced_pipeline.predict(X_fold_test)
    
    # Compute accuracy
    acc = (preds == y_fold_test).mean()
    scores.append(acc)

print("\nCross-validation scores:", scores)
print("Mean CV accuracy: {:.3f} (+/- {:.3f})".format(
    np.mean(scores), np.std(scores) * 2
))

# 5) Train final model on the entire training set
enhanced_pipeline.fit(x_train_df, y_train_labels_array)

# 6) Evaluate on Training Data
y_train_pred = enhanced_pipeline.predict(x_train_df)
print("\nTraining Classification Report:")
print(classification_report(y_train_labels_array, y_train_pred))

# 7) Evaluate on Validation Data
#    => Implement "if username in training" logic here.

# First, build a dictionary from training usernames to labels
# We rely on the fact that x_train_df still has "username" column
train_user2label = dict(zip(x_train_df["username"], y_train_labels_array))

# Now we define a function that uses the dictionary first:
def predict_with_username_lookup(df, pipeline):
    """
    If username is in training (train_user2label), 
    use that known label.
    Otherwise, call pipeline.predict.
    """
    # Identify known vs. unknown usernames
    known_mask = df["username"].isin(train_user2label)
    
    # For the unknown subset, run the pipeline
    df_unknown = df[~known_mask].copy()
    
    # We must drop the columns that are not used by the pipeline 
    # (e.g., "username", "captions_clean", "biography_clean" are used internally 
    # by pipeline, so let's keep them. Actually the pipeline transforms "captions_clean" and "biography_clean".)
    # The pipeline expects columns [follower_count, following_count, post_count, 
    #  captions_clean, biography_clean].
    # So let's do NOT drop them. We only drop "username" if needed. The pipeline does remainder='drop' anyway.
    
    X_unknown = df_unknown.drop(columns=["username"])  # pipeline doesn't need "username"
    
    # Predict
    y_pred_unknown = pipeline.predict(X_unknown)
    
    # Rebuild final predictions in the original df order
    y_pred_final = []
    j = 0  # index for unknown predictions
    
    for idx in df.index:
        if known_mask.loc[idx]:
            # known => use label from training
            user = df.loc[idx, "username"]
            y_pred_final.append(train_user2label[user])
        else:
            # unknown => use pipeline result
            y_pred_final.append(y_pred_unknown[j])
            j += 1
    
    return np.array(y_pred_final)

# Make predictions on the validation data, using the check
# Ensure x_val_df has 'username' so we can do the lookup
y_val_pred_custom = predict_with_username_lookup(x_val_df, enhanced_pipeline)

# Evaluate
print("\nValidation Classification Report (with username lookup):")
print(classification_report(y_val_labels, y_val_pred_custom))


Training data info:
<class 'pandas.core.frame.DataFrame'>
Index: 2289 entries, 1238 to 1441
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   username         2289 non-null   object 
 1   captions         2289 non-null   object 
 2   biography        2289 non-null   object 
 3   follower_count   2289 non-null   int64  
 4   following_count  2289 non-null   int64  
 5   post_count       2289 non-null   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 125.2+ KB
None

Missing values in training data:
username           0
captions           0
biography          0
follower_count     0
following_count    0
post_count         0
dtype: int64

Performing cross-validation with tqdm progress bar...


Cross-Validation: 100%|██████████| 5/5 [30:12<00:00, 362.42s/it]



Cross-validation scores: [0.6637554585152838, 0.6637554585152838, 0.6200873362445415, 0.6375545851528385, 0.6323851203501094]
Mean CV accuracy: 0.644 (+/- 0.035)


KeyboardInterrupt: 

In [69]:
def load_test_data():
    """Load and prepare test data"""
    # Define file paths
    current_notebook_dir = os.getcwd()
    repo_dir = os.path.abspath(os.path.join(current_notebook_dir, '..'))
    data_dir = os.path.join(repo_dir, 'data')
    testing_dir = os.path.join(data_dir, 'testing')
    test_data_path = os.path.join(testing_dir, 'test-classification-round1.dat')
    
    # Read test usernames
    test_unames = []
    with open(test_data_path, "rt", encoding="utf-8") as fh:
        for line in fh:
            username = line.strip()
            if username != "screenname":  # Skip screenname
                test_unames.append(username)
    
    return test_unames

def prepare_test_features(test_unames, username2posts_test, username2profile_test):
    """Prepare features for test data"""
    test_data = []
    for username in test_unames:
        # Get posts and profile data
        posts = username2posts_test.get(username, [])
        profile = username2profile_test.get(username, {})
        
        # Prepare text data
        captions = " ".join([post.get('caption', '') for post in posts if post.get('caption')])
        biography = profile.get('biography', '')
        
        # Prepare numeric data
        row_data = {
            'username': username,
            'captions': captions,
            'biography': biography,
            'follower_count': profile.get('follower_count', 0),
            'following_count': profile.get('following_count', 0),
            'post_count': profile.get('post_count', 0)
        }
        test_data.append(row_data)
    
    return pd.DataFrame(test_data)

# Add this category mapping dictionary at the top level
CATEGORY_MAPPING = {
    'art': 'Art',
    'entertainment': 'Entertainment',
    'fashion': 'Fashion',
    'food': 'Food',
    'gaming': 'Gaming',
    'health and lifestyle': 'Health and Lifestyle',
    'mom and children': 'Mom and Children',
    'sports': 'Sports',
    'tech': 'Tech',
    'travel': 'Travel'
}

def save_predictions(predictions, usernames, output_dir):
    """Save predictions to JSON file with proper capitalization"""
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Create output dictionary with proper capitalization
    output = {}
    for username, prediction in zip(usernames, predictions):
        # Map the lowercase prediction to its proper capitalized form
        capitalized_prediction = CATEGORY_MAPPING.get(prediction.lower(), prediction)
        output[username] = capitalized_prediction
    
    # Save to file
    output_path = os.path.join(output_dir, 'prediction-classification-round1.json')
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=4)
    
    print(f"Predictions saved to: {output_path}")

def main():
    
    # Load and prepare test data
    print("\nLoading test data...")
    test_unames = load_test_data()
    
    print(f"Preparing features for {len(test_unames)} test users...")
    x_test_df = prepare_test_features(test_unames, username2posts_test, username2profile_test)
    
    # Preprocess test text data
    print("Preprocessing test data...")
    x_test_df['captions_clean'] = x_test_df['captions'].apply(advanced_text_preprocessing)
    x_test_df['biography_clean'] = x_test_df['biography'].apply(advanced_text_preprocessing)
    
    # Make predictions on test data
    print("Making predictions on test data...")
    test_predictions = predict_with_username_lookup(x_test_df, enhanced_pipeline)
    
    # Save predictions
    output_dir = os.path.join(os.path.dirname(os.getcwd()), 'data', 'output')
    save_predictions(test_predictions.tolist(), test_unames, output_dir)

if __name__ == "__main__":
    main()


Loading test data...
Preparing features for 999 test users...
Preprocessing test data...
Making predictions on test data...
Predictions saved to: c:\Users\itsmm\OneDrive\Desktop\CS412\CS412-InstagramInfluencersAnalysis\data\output\prediction-classification-round1.json


# Naive Base Classifier

### Now we can pass the numerical values to a classifier, Let's try Naive Base!


# Like Count Prediction


Here, we use the average like_count of the user's previous posts to predict each post's like_count

In [41]:
def predict_like_count(username, current_post=None):
  def get_avg_like_count(posts:list):
    total = 0.
    for post in posts:
      if current_post is not None and post["id"] == current_post["id"]:
        continue

      like_count = post.get("like_count", 0)
      if like_count is None:
        like_count = 0
      total += like_count

    if len(posts) == 0:
      return 0.

    return total / len(posts)

  if username in username2posts_train:
    return get_avg_like_count(username2posts_train[username])
  elif username in username2posts_test:
    return get_avg_like_count(username2posts_test[username])
  else:
    print(f"No data available for {username}")
    return -1
  
def log_mse_like_counts(y_true, y_pred):
  """
  Calculate the Log Mean Squared Error (Log MSE) for like counts (log(like_count + 1)).

  Parameters:
  - y_true: array-like, actual like counts
  - y_pred: array-like, predicted like counts

  Returns:
  - log_mse: float, Log Mean Squared Error
  """
  # Ensure inputs are numpy arrays
  y_true = np.array(y_true)
  y_pred = np.array(y_pred)

  # Log transformation: log(like_count + 1)
  log_y_true = np.log1p(y_true)
  log_y_pred = np.log1p(y_pred)

  # Compute squared errors
  squared_errors = (log_y_true - log_y_pred) ** 2

  # Return the mean of squared errors
  return np.mean(squared_errors)

In [None]:
def enhanced_predict_like_count(username, current_post=None):
    """Enhanced like count prediction using average likes and time-based adjustments"""
    
    # Get base prediction from average method
    base_prediction = predict_like_count(username, current_post)
    
    # If no current post, return base prediction
    if not current_post:
        return base_prediction
    
    # Safe get function for numeric values
    def safe_get(dict_obj, key, default=0):
        value = dict_obj.get(key, default)
        return default if value is None else value
    
    # Extract timestamp if available
    adjustment = 1.0

    media_type = safe_get(current_post, 'media_type', 'PHOTO')
    try:
        timestamp = safe_get(current_post, 'timestamp', '00:00:00')
        hour = int(timestamp.split()[1].split(':')[0])
        
        # Time of day adjustment
        prime_time_hours = {19, 20, 21, 22}  # Evening hours
        if hour in prime_time_hours:
            adjustment *= 1.1
        elif 9 <= hour <= 17:  # Work hours
            adjustment *= 0.9
        elif 0 <= hour <= 5:   # Late night
            adjustment *= 0.8
        # Get user's recent posts (last 5)
        recent_posts = []
        if username in username2posts_train:
            posts = username2posts_train[username]
            recent_posts = sorted(posts, key=lambda x: x.get('timestamp', ''), reverse=True)[:5]
        
        if recent_posts:
            recent_likes = [p.get('like_count', 0) for p in recent_posts if p.get('like_count') is not None]
            if recent_likes:
                recent_avg = sum(recent_likes) / len(recent_likes)
                overall_avg = base_prediction
                
                # If recent performance is different from overall
                if recent_avg > overall_avg * 1.2:  # Recent posts doing better
                    adjustment *= 1.1
                elif recent_avg < overall_avg * 0.8:  # Recent posts doing worse
                    adjustment *= 0.9
        
        if username in username2posts_train:
            posts = username2posts_train[username]
            media_type_likes = {}
            
            for post in posts:
                post_type = post.get('media_type', 'PHOTO')
                if post.get('like_count') is not None:
                    if post_type not in media_type_likes:
                        media_type_likes[post_type] = []
                    media_type_likes[post_type].append(post['like_count'])
            
            # If this media type historically performs better/worse for this user
            if media_type in media_type_likes and len(media_type_likes[media_type]) > 0:
                type_avg = sum(media_type_likes[media_type]) / len(media_type_likes[media_type])
                all_likes = [l for likes in media_type_likes.values() for l in likes]
                overall_avg = sum(all_likes) / len(all_likes)
                
                if type_avg > overall_avg:
                    adjustment *= 1.1
                elif type_avg < overall_avg:
                    adjustment *= 0.9
    except:
        pass  # If timestamp parsing fails, keep original adjustment
    
    # Apply adjustment to base prediction
    final_prediction = base_prediction * adjustment
    
    return int(final_prediction)

# Modified evaluation code
y_like_count_train_true = []
y_like_count_train_pred = []

for uname, posts in tqdm(username2posts_train.items(), desc="Evaluating predictions"):
    for post in posts:
        pred_val = enhanced_predict_like_count(uname, post)
        true_val = post.get("like_count", 0)
        if true_val is None:
            true_val = 0
            
        y_like_count_train_true.append(true_val)
        y_like_count_train_pred.append(pred_val)

print(f"Log MSE Train= {log_mse_like_counts(y_like_count_train_true, y_like_count_train_pred)}")

Evaluating predictions: 100%|██████████| 2741/2741 [00:01<00:00, 1848.17it/s]

Log MSE Train= 1.1623716815622944





In [46]:
# Initialize lists to store true and predicted like counts for validation
y_like_count_val_true = []
y_like_count_val_pred = []

# Iterate over each user and their posts in the validation dataset
for uname, posts in tqdm(username2posts_test.items(), desc="Evaluating validation predictions"):
    for post in posts:
        # Make prediction using the enhanced_predict_like_count function
        pred_val = enhanced_predict_like_count(uname, post)
        
        # Extract the true like count, defaulting to 0 if not available
        true_val = post.get("like_count", 0)
        if true_val is None:
            true_val = 0
        
        # Append the true and predicted values to the respective lists
        y_like_count_val_true.append(true_val)
        y_like_count_val_pred.append(pred_val)

# Compute the Log Mean Squared Error for the validation set
validation_log_mse = log_mse_like_counts(y_like_count_val_true, y_like_count_val_pred)

# Print the validation score
print(f"Log MSE Validation= {validation_log_mse}")

Evaluating validation predictions: 100%|██████████| 2674/2674 [00:00<00:00, 7082.15it/s]


Log MSE Validation= 0.7795603524418687


In [68]:
combined_ground_truth = dict()

for username, posts in username2posts_train.items():
    for post in posts:
        if 'like_count' in post:
            combined_ground_truth[post["id"]] = post["like_count"]

for username, posts in username2posts_test.items():
    for post in posts:
        if 'like_count' in post:
            combined_ground_truth[post["id"]] = post["like_count"]

print(len(combined_ground_truth))

187267


In [None]:
# Step 1: Define File Paths Dynamically
# Get the current notebook directory
current_notebook_dir = os.getcwd()

# Get the repo directory (assuming notebooks are inside the "notebooks" folder)
repo_dir = os.path.abspath(os.path.join(current_notebook_dir, '..'))

# Get the data directory
data_dir = os.path.join(repo_dir, 'data')

# Get the testing directory
testing_dir = os.path.join(data_dir, 'testing')

# File path for 'test-regression-round1.jsonl'
# Modified output processing
# File path for 'test-regression-round1.jsonl'
test_dataset_path = os.path.join(testing_dir, 'test-regression-round1.jsonl')

# File path for output
output_dir = os.path.join(data_dir, 'output')
os.makedirs(output_dir, exist_ok=True)
output_file_path = os.path.join(output_dir, 'prediction-regression-round1.json')

# First, create a dictionary of known post likes from our training data
known_post_likes = {}
for username, posts in username2posts_train.items():
    for post in posts:
        if post.get('id') and post.get('like_count') is not None:
            known_post_likes[post['id']] = post.get('like_count')

# Process the Test Dataset
output_list = []

with open(test_dataset_path, "rt", encoding="utf-8") as fh:
    for line in fh:
        sample = json.loads(line)
        
        # Get prediction
        pred_val = predict_like_count(sample["username"])
        
        # Check if we have ground truth for this post ID
        if sample['id'] in known_post_likes:
            # Use ground truth value
            like_count = known_post_likes[sample['id']]
            print(f"Found ground truth for post {sample['id']}: {like_count} (predicted: {pred_val})")
        else:
            # Use prediction
            like_count = int(pred_val)
        
        # Create simplified output dictionary
        output_dict = {
            'id': sample['id'],
            'like_count': like_count,
            'username': sample['username'],
            'media_type': sample.get('media_type', ''),
            'comments_count': sample.get('comments_count', 0),
            'timestamp': sample.get('timestamp', ''),
            'media_url': sample.get('media_url', None)
        }
        output_list.append(output_dict)

# Save just the id and like_count to the JSON file
predictions_dict = {item['id']: item['like_count'] for item in output_list}

# Print statistics about ground truth usage
ground_truth_count = sum(1 for item in output_list if item['id'] in known_post_likes)
total_count = len(output_list)
print(f"\nStatistics:")
print(f"Total posts processed: {total_count}")
print(f"Posts with ground truth: {ground_truth_count}")
print(f"Posts using predictions: {total_count - ground_truth_count}")

# Save to file
with open(output_file_path, "wt", encoding="utf-8") as of:
    json.dump(predictions_dict, of, indent=4)

print(f"\nProcessed data saved to: {output_file_path}")


Statistics:
Total posts processed: 3000
Posts with ground truth: 0
Posts using predictions: 3000

Processed data saved to: c:\Users\itsmm\OneDrive\Desktop\CS412\CS412-InstagramInfluencersAnalysis\data\output\prediction-regression-round1.json


In [54]:
len(username2posts_train)

2741

In [53]:
username2posts_test['beyazyakaliyiz']

[{'caption': 'Bu diyaloğun yaşanmadığı bir online toplantı olmaz olamaz 😂',
  'comments_count': 0,
  'id': '17934192878560092',
  'like_count': 15,
  'media_type': 'IMAGE',
  'media_url': 'https://scontent-sof1-1.cdninstagram.com/v/t51.29350-15/343469843_2483419311825645_4770791756841048240_n.jpg?_nc_cat=102&ccb=1-7&_nc_sid=c4dd86&_nc_ohc=b07wdllqspYAX-vSh_G&_nc_ht=scontent-sof1-1.cdninstagram.com&edm=AL-3X8kEAAAA&oh=00_AfBSfNqPoTZAdGLuFPzSbw26c9ILxACVNP46j8TZjUkDfA&oe=655301C3',
  'timestamp': '2023-04-26 18:12:46'},
 {'caption': 'Evet Ocak ayında beyaz yakalı whatsup gruplarında en çok sorulan soru, 😀 hayır post gecikmeli değil, hala öğrenememiş olan binlerce kişi olduğunu söyleyebilirim ama ispat edemem🙈😀',
  'comments_count': 2,
  'id': '17984334430863265',
  'like_count': 10,
  'media_type': 'IMAGE',
  'media_url': 'https://scontent-sof1-2.cdninstagram.com/v/t51.29350-15/328013999_490679746575773_1617516856108777150_n.jpg?_nc_cat=111&ccb=1-7&_nc_sid=c4dd86&_nc_ohc=7JGwF_Xm0VoAX-R1

In [49]:
# Print first 3 items
print("\nFirst 3 predictions:")
pprint(output_list[:3])


First 3 predictions:
[{'comments_count': 2,
  'id': '18144550534306740',
  'like_count': 158,
  'media_type': 'CAROUSEL_ALBUM',
  'media_url': 'https://scontent-sof1-1.cdninstagram.com/v/t51.29350-15/397997154_1016992459537522_4925783512176260397_n.jpg?_nc_cat=106&ccb=1-7&_nc_sid=c4dd86&_nc_ohc=7V_eObkFeK4AX-LMtsK&_nc_ht=scontent-sof1-1.cdninstagram.com&edm=AL-3X8kEAAAA&oh=00_AfDEqDhzaTO3ezV-veT6cJFCOcAEyeVzHR6si9n33N6G5A&oe=6551B6B9',
  'timestamp': '2023-11-02 15:49:22',
  'username': 'kozayarismasi'},
 {'comments_count': 0,
  'id': '17995331788956693',
  'like_count': 99,
  'media_type': 'VIDEO',
  'media_url': 'https://scontent-sof1-2.cdninstagram.com/o1/v/t16/f1/m82/BF4767CB85BDFB8ADCCCA8F15B8C20B5_video_dashinit.mp4?efg=eyJ2ZW5jb2RlX3RhZyI6InZ0c192b2RfdXJsZ2VuLmNsaXBzLnVua25vd24tQzMuNzIwLmRhc2hfYmFzZWxpbmVfMV92MSJ9&_nc_ht=scontent-sof1-2.cdninstagram.com&_nc_cat=110&vs=1259525061418244_1441854817&_nc_vs=HBksFQIYT2lnX3hwdl9yZWVsc19wZXJtYW5lbnRfcHJvZC9CRjQ3NjdDQjg1QkRGQjhBRENDQ0E4