# Import Dependencies

In [1]:
import numpy as np
import pandas as pd
import gzip
import json
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline


import os

from pprint import pprint

In [2]:
#@title Turkish StopWords

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
turkish_stopwords = stopwords.words('turkish')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\itsmm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Influencer Category Classification



1.   Read Data
2.   Preprocess Data
3.   Prepare Model
4.   Predict Test Data
4.   Save outputs



In [3]:
# Step 1: Define File Paths Dynamically
# Get the current notebook directory
current_notebook_dir = os.getcwd()

# Get the repo directory (assuming notebooks are inside the "notebooks" folder)
repo_dir = os.path.abspath(os.path.join(current_notebook_dir, '..'))

# Get the data directory
data_dir = os.path.join(repo_dir, 'data')

# Get the training directory
training_dir = os.path.join(data_dir, 'training')

# File path for 'train-classification.csv'
train_classification_path = os.path.join(training_dir, 'train-classification.csv')

# Step 2: Load Data Dynamically
train_classification_df = pd.read_csv(train_classification_path)
train_classification_df = train_classification_df.rename(columns={'Unnamed: 0': 'user_id', 'label': 'category'})

# Step 3: Unify Labels
train_classification_df["category"] = train_classification_df["category"].apply(str.lower)

# Step 4: Create User-to-Category Mapping
username2_category = train_classification_df.set_index("user_id").to_dict()["category"]

# Step 5: Verify Output
print("First few rows of the training classification DataFrame:")
train_classification_df.head()

First few rows of the training classification DataFrame:


Unnamed: 0,user_id,category
0,taskirancemal,mom and children
1,tam_kararinda,food
2,spart4nn,food
3,sosyalyiyiciler,food
4,sonaydizdarahad,mom and children


In [4]:
# stats about the labels
train_classification_df.groupby("category").count()

Unnamed: 0_level_0,user_id
category,Unnamed: 1_level_1
art,191
entertainment,323
fashion,299
food,511
gaming,13
health and lifestyle,503
mom and children,149
sports,113
tech,346
travel,294


In [5]:
username2_category["sonaydizdarahad"]

'mom and children'

In [6]:
# Step 1: Define File Paths Dynamically
# Get the current notebook directory
current_notebook_dir = os.getcwd()

# Get the repo directory (assuming notebooks are inside the "notebooks" folder)
repo_dir = os.path.abspath(os.path.join(current_notebook_dir, '..'))

# Get the data directory
data_dir = os.path.join(repo_dir, 'data')

# Get the training directory
training_dir = os.path.join(data_dir, 'training')

# File path for 'training-dataset.jsonl.gz'
train_data_path = os.path.join(training_dir, 'training-dataset.jsonl.gz')

# Step 2: Initialize Dictionaries for Data
username2posts_train = dict()
username2profile_train = dict()

username2posts_test = dict()
username2profile_test = dict()

# Step 3: Process Data from 'training-dataset.jsonl.gz'
with gzip.open(train_data_path, "rt", encoding="utf-8") as fh:
    for line in fh:
        sample = json.loads(line)

        profile = sample["profile"]
        username = profile.get("username", "").strip()  # Handle missing or empty usernames
        if not username:
            continue  # Skip if username is missing or empty

        if username in username2_category:
            # Train data info
            username2posts_train[username] = sample["posts"]
            username2profile_train[username] = profile
        else:
            # Test data info
            username2posts_test[username] = sample["posts"]
            username2profile_test[username] = profile

# Step 4: Verify Output
print(f"Number of Training Users: {len(username2posts_train)}")
print(f"Number of Testing Users: {len(username2posts_test)}")

Number of Training Users: 2741
Number of Testing Users: 2674


In [7]:
# Profile Dataframe
train_profile_df = pd.DataFrame(username2profile_train).T.reset_index(drop=True)
test_profile_df = pd.DataFrame(username2profile_test).T.reset_index(drop=True)

train_profile_df.head(1)

Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,business_category_name,overall_category_name,category_enum,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64
0,deparmedya,3170700063,Depar Medya,#mediaplanning #mediabuying #sosyalmedya,Local business,,1167,192,True,False,...,,,LOCAL,False,False,https://instagram.fsaw2-3.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [8]:
test_profile_df.head(1)

Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,business_category_name,overall_category_name,category_enum,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64
0,beyazyakaliyiz,8634457436,Selam Beyaz Yakalı,Beyaz yakalıların dünyasına hoşgeldiniz 😀😀😀,Personal blog,,1265,665,True,False,...,,,PERSONAL_BLOG,False,False,https://instagram.fist6-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [9]:
# Step 5: Verify Output
print("Columns in train_classification_df:")
print(train_classification_df.columns.tolist())

print("\nFirst few rows of the training classification DataFrame:")
print(train_classification_df.head())

print("\nLabel distribution:")
print(train_classification_df['category'].value_counts())

Columns in train_classification_df:
['user_id', 'category']

First few rows of the training classification DataFrame:
           user_id          category
0    taskirancemal  mom and children
1    tam_kararinda              food
2         spart4nn              food
3  sosyalyiyiciler              food
4  sonaydizdarahad  mom and children

Label distribution:
category
food                    511
health and lifestyle    503
tech                    346
entertainment           323
fashion                 299
travel                  294
art                     191
mom and children        149
sports                  113
gaming                   13
Name: count, dtype: int64


In [10]:
# Profile Dataframe
print("Columns in train_profile_df:")
print(train_profile_df.columns.tolist())

print("\nFirst few rows of train_profile_df:")
print(train_profile_df.head(1))

print("\nColumns in test_profile_df:")
print(test_profile_df.columns.tolist())

print("\nFirst few rows of test_profile_df:")
print(test_profile_df.head(1))

Columns in train_profile_df:
['username', 'id', 'full_name', 'biography', 'category_name', 'post_count', 'follower_count', 'following_count', 'is_business_account', 'is_private', 'is_verified', 'highlight_reel_count', 'bio_links', 'entities', 'ai_agent_type', 'fb_profile_biolink', 'restricted_by_viewer', 'country_block', 'eimu_id', 'external_url', 'fbid', 'has_clips', 'hide_like_and_view_counts', 'is_professional_account', 'is_supervision_enabled', 'is_guardian_of_viewer', 'is_supervised_by_viewer', 'is_supervised_user', 'is_embeds_disabled', 'is_joined_recently', 'business_address_json', 'business_contact_method', 'business_email', 'business_phone_number', 'business_category_name', 'overall_category_name', 'category_enum', 'is_verified_by_mv4b', 'is_regulated_c18', 'profile_pic_url', 'should_show_category', 'should_show_public_contacts', 'show_account_transparency_details', 'profile_picture_base64']

First few rows of train_profile_df:
     username          id    full_name  \
0  depa

In [11]:
# List of columns to drop
columns_to_drop = [
    'highlight_reel_count', 'entities', 'ai_agent_type', 'fb_profile_biolink',
    'restricted_by_viewer', 'country_block', 'eimu_id', 'external_url', 'fbid',
    'has_clips', 'hide_like_and_view_counts', 'is_supervision_enabled',
    'is_guardian_of_viewer', 'is_supervised_by_viewer', 'is_supervised_user',
    'is_embeds_disabled', 'is_joined_recently', 'business_address_json',
    'business_contact_method', 'business_email', 'business_phone_number',
    'category_enum', 'is_verified_by_mv4b', 'is_regulated_c18',
    'profile_pic_url', 'should_show_category', 'should_show_public_contacts',
    'show_account_transparency_details', 'profile_picture_base64'
]

# Dropping specified columns from train_profile_df
train_profile_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Dropping specified columns from test_profile_df
test_profile_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Verify columns in train_profile_df after dropping
print("Columns in train_profile_df after dropping:")
print(train_profile_df.columns.tolist())

# Verify columns in test_profile_df after dropping
print("\nColumns in test_profile_df after dropping:")
print(test_profile_df.columns.tolist())

Columns in train_profile_df after dropping:
['username', 'id', 'full_name', 'biography', 'category_name', 'post_count', 'follower_count', 'following_count', 'is_business_account', 'is_private', 'is_verified', 'bio_links', 'is_professional_account', 'business_category_name', 'overall_category_name']

Columns in test_profile_df after dropping:
['username', 'id', 'full_name', 'biography', 'category_name', 'post_count', 'follower_count', 'following_count', 'is_business_account', 'is_private', 'is_verified', 'bio_links', 'is_professional_account', 'business_category_name', 'overall_category_name']


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Preprocessing Function
import re

def preprocess_text(text: str):
    text = text.casefold()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^a-zçğıöşü0-9\s#@]+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Build Corpus and Labels
corpus = []
train_usernames = []

for username, posts in username2posts_train.items():
    train_usernames.append(username)
    cleaned_captions = []
    for post in posts:
        post_caption = post.get("caption", "")
        if post_caption is None:
            continue
        post_caption = preprocess_text(post_caption)
        if post_caption != "":
            cleaned_captions.append(post_caption)
    user_post_captions = "\n".join(cleaned_captions)
    corpus.append(user_post_captions)

y_train = [username2_category.get(uname, "NA") for uname in train_usernames]

# Incorporate Metadata
records = []
for idx, username in enumerate(train_usernames):
    profile = username2profile_train.get(username, {})
    biography_text = str(profile.get("biography", "") or "")
    follower_count = profile.get("follower_count", 0)
    following_count = profile.get("following_count", 0)
    post_count = profile.get("post_count", 0) if profile.get("post_count") else 0
    row_dict = {
        "username": username,
        "captions": corpus[idx],
        "biography": biography_text,
        "follower_count": follower_count,
        "following_count": following_count,
        "post_count": post_count,
        "label": y_train[idx]
    }
    records.append(row_dict)

train_full_df = pd.DataFrame(records)

# Define Pipeline Components
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer

# Train-Validation Split
X = train_full_df.drop(columns=["label"])
y = train_full_df["label"]

x_train_df, x_val_df, y_train_labels, y_val_labels = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    stratify=y, 
    random_state=42
)

# Define ColumnTransformer
numeric_features = ["follower_count", "following_count", "post_count"]
text_features_caps = "captions"
text_features_bio = "biography"

preprocessor = ColumnTransformer(
    transformers=[
        ("captions_tfidf", TfidfVectorizer(
             stop_words=turkish_stopwords, 
             max_features=5000
         ), text_features_caps),
        ("bio_tfidf", TfidfVectorizer(
             stop_words=turkish_stopwords, 
             max_features=5000
         ), text_features_bio),
        ("numeric_scaler", MinMaxScaler(), numeric_features)
    ],
    remainder="drop"
)

# Build ImbPipeline (SMOTE + Classifier)
pipeline = ImbPipeline([
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42, sampling_strategy="auto")),
    ("clf", LogisticRegression(
        class_weight='balanced',
        solver='liblinear',
        random_state=42
    ))
])

# Define a param grid
param_grid = {
    "preprocessor__captions_tfidf__ngram_range": [(1,1), (1,2)],
    "preprocessor__bio_tfidf__ngram_range": [(1,1), (1,2)],
    "clf__C": [0.01, 0.1, 1, 10]
}

# Initialize and fit GridSearchCV
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(x_train_df, y_train_labels)

print("Best Params:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)

best_pipeline = grid_search.best_estimator_

# **New Section: Evaluate on Training Data**
# Predict on the training data
y_train_pred = best_pipeline.predict(x_train_df)

# Calculate training accuracy
train_acc = accuracy_score(y_train_labels, y_train_pred)
print("Training Accuracy:", train_acc)

# Generate and print the training classification report
print("\nTraining Classification Report:\n",
      classification_report(y_train_labels, y_train_pred, zero_division=0))

# **End of New Section**

# Predict on the validation data
y_val_pred = best_pipeline.predict(x_val_df)

# Calculate validation accuracy
val_acc = accuracy_score(y_val_labels, y_val_pred)
print("Validation Accuracy:", val_acc)

# Generate and print the validation classification report
print("\nClassification Report:\n",
      classification_report(y_val_labels, y_val_pred, zero_division=0))

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best Params: {'clf__C': 1, 'preprocessor__bio_tfidf__ngram_range': (1, 1), 'preprocessor__captions_tfidf__ngram_range': (1, 2)}
Best CV Accuracy: 0.6455259262035492
Training Accuracy: 0.9516423357664233

Training Classification Report:
                       precision    recall  f1-score   support

                 art       0.94      0.95      0.94       153
       entertainment       0.97      0.91      0.94       258
             fashion       0.92      0.97      0.95       239
                food       0.98      0.97      0.97       409
              gaming       1.00      1.00      1.00        10
health and lifestyle       0.97      0.91      0.94       402
    mom and children       0.92      0.97      0.94       119
              sports       0.98      0.99      0.98        90
                tech       0.92      0.97      0.95       277
              travel       0.94      0.96      0.95       235

            accura

In [30]:
# ------------------- Imports -------------------
import re
import numpy as np
import pandas as pd

# scikit-learn utilities
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

# imblearn
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# For progress bar in cross-validation
from tqdm import tqdm

# ------------------- Enhanced Text Preprocessing -------------------
def advanced_text_preprocessing(text):
    if pd.isna(text) or not isinstance(text, str):
        return ""
    
    text = text.lower()
    
    # Extract hashtags and mentions
    hashtags = re.findall(r'#\w+', text)
    mentions = re.findall(r'@\w+', text)
    
    # Densities
    text_length = len(text)
    hashtag_density = len(hashtags) / (text_length + 1)
    mention_density = len(mentions) / (text_length + 1)
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^a-zçğıöşü0-9\s#@]+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Add density info as extra pseudo-tokens
    density_info = f" hashtag_density_{hashtag_density:.2f} mention_density_{mention_density:.2f}"
    
    # Re-append the extracted hashtags and mentions
    return f"{text} {' '.join(hashtags)} {' '.join(mentions)} {density_info}".strip()

# ------------------- Enhanced Numeric Feature Extraction -------------------
def extract_numeric_features(X):
    # If X is a DataFrame, ensure columns exist
    if isinstance(X, pd.DataFrame):
        cols = ['follower_count', 'following_count', 'post_count']
        if 'account_age_days' in X.columns:
            cols.append('account_age_days')
        
        df = X[cols].copy()
    else:
        # If it's a NumPy array
        df = pd.DataFrame(
            X, 
            columns=['follower_count', 'following_count', 'post_count', 'account_age_days'][:X.shape[1]]
        )
    
    df = df.fillna(0)
    
    df['following_count_safe'] = df['following_count'].replace(0, 1)
    df['follower_count_safe']  = df['follower_count'].replace(0, 1)
    
    df['follower_ratio']   = df['follower_count'] / df['following_count_safe']
    df['post_density']     = df['post_count'] / df['follower_count_safe']
    df['engagement_score'] = np.log1p(df['follower_count']) * np.log1p(df['post_count'])
    
    if 'account_age_days' in df.columns:
        df['activity_ratio'] = df['post_count'] / (df['account_age_days'] + 1)
    else:
        df['activity_ratio'] = df['post_count']
    
    df['follower_growth_rate'] = df['follower_count'] / (df['post_count'] + 1)
    df['relative_engagement']  = (df['follower_count'] * df['post_count']) / (df['following_count_safe'])
    
    # Log transforms
    df['log_followers']  = np.log1p(df['follower_count'])
    df['log_following']  = np.log1p(df['following_count'])
    df['log_posts']      = np.log1p(df['post_count'])
    
    df.drop(['following_count_safe','follower_count_safe'], axis=1, inplace=True)
    
    return df.values

# ------------------- Build Pipeline -------------------
def build_enhanced_pipeline(stopwords_list):
    preprocessor = ColumnTransformer(
        transformers=[
            ('captions_tfidf', 
             TfidfVectorizer(
                 stop_words=stopwords_list,
                 max_features=3000,   
                 ngram_range=(1, 1),  
                 min_df=2,
                 max_df=0.95
             ), 
             'captions_clean'),
            
            ('bio_tfidf', 
             TfidfVectorizer(
                 stop_words=stopwords_list,
                 max_features=2000,   
                 ngram_range=(1, 1),  
                 min_df=2,
                 max_df=0.95
             ), 
             'biography_clean'),
            
            ('numeric', 
             Pipeline([
                 ('feat_eng', FunctionTransformer(extract_numeric_features)),
                 ('scaler', MinMaxScaler())
             ]), 
             ['follower_count', 'following_count', 'post_count']
             # add 'account_age_days' if available
            )
        ],
        remainder='drop'
    )
    
    rf = RandomForestClassifier(
        n_estimators=100,    
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )
    
    gb = GradientBoostingClassifier(
        n_estimators=50,     
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        random_state=42
    )
    
    lr = LogisticRegression(
        C=2.0,
        class_weight='balanced',
        max_iter=1000,
        random_state=42,
        n_jobs=-1
    )
    
    ensemble = VotingClassifier(
        estimators=[
            ('rf', rf),
            ('gb', gb),
            ('lr', lr)
        ],
        voting='soft'
    )
    
    pipeline = ImbPipeline([
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', ensemble)
    ])
    
    return pipeline


# ------------------- Example Usage -------------------
print("Training data info:")
print(x_train_df.info())
print("\nMissing values in training data:")
print(x_train_df.isnull().sum())

# 1) Preprocess text once
x_train_df['captions_clean'] = x_train_df['captions'].apply(advanced_text_preprocessing)
x_train_df['biography_clean'] = x_train_df['biography'].apply(advanced_text_preprocessing)

x_val_df['captions_clean'] = x_val_df['captions'].apply(advanced_text_preprocessing)
x_val_df['biography_clean'] = x_val_df['biography'].apply(advanced_text_preprocessing)

# 2) Convert y_train_labels to NumPy for direct integer indexing
y_train_labels_array = y_train_labels.values

# 3) Build pipeline
enhanced_pipeline = build_enhanced_pipeline(stopwords_list=turkish_stopwords)

# 4) Manual Cross-Validation with tqdm progress bar
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

print("\nPerforming cross-validation with tqdm progress bar...")
for fold_idx, (train_idx, test_idx) in enumerate(
    tqdm(cv.split(x_train_df, y_train_labels_array), 
         total=cv.get_n_splits(), 
         desc='Cross-Validation')
):
    # Split the data
    X_fold_train = x_train_df.iloc[train_idx]
    y_fold_train = y_train_labels_array[train_idx]   # now works because it's a NumPy array
    X_fold_test  = x_train_df.iloc[test_idx]
    y_fold_test  = y_train_labels_array[test_idx]
    
    # Fit on this fold
    enhanced_pipeline.fit(X_fold_train, y_fold_train)
    # Predict
    preds = enhanced_pipeline.predict(X_fold_test)
    # Compute accuracy
    acc = (preds == y_fold_test).mean()
    scores.append(acc)

print("\nCross-validation scores:", scores)
print("Mean CV accuracy: {:.3f} (+/- {:.3f})".format(
    np.mean(scores), np.std(scores) * 2
))

# 5) Train final model on the entire training set
enhanced_pipeline.fit(x_train_df, y_train_labels_array)

# 6) Evaluate on Training Data
y_train_pred = enhanced_pipeline.predict(x_train_df)
print("\nTraining Classification Report:")
print(classification_report(y_train_labels_array, y_train_pred))

# 7) Evaluate on Validation Data
y_val_pred = enhanced_pipeline.predict(x_val_df)
print("\nValidation Classification Report:")
print(classification_report(y_val_labels, y_val_pred))


Training data info:
<class 'pandas.core.frame.DataFrame'>
Index: 2192 entries, 2638 to 2667
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   captions         2192 non-null   object 
 1   biography        2192 non-null   object 
 2   follower_count   2192 non-null   int64  
 3   following_count  2192 non-null   int64  
 4   post_count       2192 non-null   float64
 5   captions_clean   2192 non-null   object 
 6   biography_clean  2192 non-null   object 
dtypes: float64(1), int64(2), object(4)
memory usage: 137.0+ KB
None

Missing values in training data:
captions           0
biography          0
follower_count     0
following_count    0
post_count         0
captions_clean     0
biography_clean    0
dtype: int64

Performing cross-validation with tqdm progress bar...


Cross-Validation: 100%|██████████| 5/5 [31:01<00:00, 372.27s/it]



Cross-validation scores: [0.6560364464692483, 0.662870159453303, 0.6552511415525114, 0.6643835616438356, 0.6484018264840182]
Mean CV accuracy: 0.657 (+/- 0.012)


KeyboardInterrupt: 

In [13]:
# After merging captions and profile data
print("Columns in train_full_df:")
print(train_full_df.columns.tolist())

print("\nFirst few rows of train_full_df:")
print(train_full_df.head())

Columns in train_full_df:
['username', 'captions', 'biography', 'follower_count', 'following_count', 'post_count', 'label']

First few rows of train_full_df:
               username                                           captions  \
0            deparmedya  cumhuriyetimizin 100yılı kutlu olsun\noriflame...   
1            kafesfirin  bugün bir fincan köpüklü türk kahvesiyle taçla...   
2              vimerang  saygı ve özlemle #atatürk #10kasım #10kasim193...   
3     mustafa_yalcinn38  altınoluk çevre şehircilik ve iklim değişikliğ...   
4  zorluenergysolutions  güne enerjik bir sohbet ile devam etmek ister ...   

                                           biography  follower_count  \
0           #mediaplanning #mediabuying #sosyalmedya            1167   
1  📍Söğütözü📍FTZ AVM\n🛒Ankara macro▲center v...           11997   
2       Dijital İletişim Yönetimi🎬info@vimerang.comq            2321   
3                            Talas Belediye Başkanı           13647   
4  Türkiye’ni

In [14]:
# Features and Labels
print("Features (X) columns:")
print(X.columns.tolist())

print("\nLabels (y) distribution:")
print(y.value_counts())

Features (X) columns:
['captions', 'biography', 'follower_count', 'following_count', 'post_count']

Labels (y) distribution:
label
food                    511
health and lifestyle    502
tech                    346
entertainment           323
fashion                 299
travel                  294
art                     191
mom and children        149
sports                  113
gaming                   13
Name: count, dtype: int64


In [15]:
print("Training set columns:")
print(x_train_df.columns.tolist())

print("\nValidation set columns:")
print(x_val_df.columns.tolist())

print("\nTraining set preview:")
print(x_train_df.head())

print("\nValidation set preview:")
print(x_val_df.head())


Training set columns:
['captions', 'biography', 'follower_count', 'following_count', 'post_count']

Validation set columns:
['captions', 'biography', 'follower_count', 'following_count', 'post_count']

Training set preview:
                                               captions  \
2638  bugün sofralarımızın vazgeçilmezi #düşükprotei...   
206   dün gece #kanald ekranlarında bizi 90lı yıllar...   
2073  ulu önderimiz gazi mustafa kemal atatürkün bed...   
354   formula 1 in teknoloji alt yapısı için lenovoy...   
1736  yaşasin cumhuriyet\n10102005\nsağlıklı ve mutl...   

                                              biography  follower_count  \
2638  Sosyal medya hesaplarımızın kullanım kuralları...            2562   
206   Her cumartesi saat 20.00’de @kanald ekranların...           66804   
2073  Cerrahpaşa Tıp Fakültesi, Nükleer Tıp Anabi...            3297   
354   Herkes için daha akıllı teknoloji, Dünyanın v...           59517   
1736                                          

In [16]:
# Step 1: Define File Paths Dynamically
# Get the current notebook directory
current_notebook_dir = os.getcwd()

# Get the repo directory (assuming notebooks are inside the "notebooks" folder)
repo_dir = os.path.abspath(os.path.join(current_notebook_dir, '..'))

# Get the data directory
data_dir = os.path.join(repo_dir, 'data')

# Get the testing directory
testing_dir = os.path.join(data_dir, 'testing')

# File path for 'test-classification-round1.dat'
test_data_path = os.path.join(testing_dir, 'test-classification-round1.dat')

# Step 2: Preview First 5 Lines of the Test File
with open(test_data_path, "rt", encoding="utf-8") as fh:
    for i, line in enumerate(fh):
        print(line.strip())
        if i == 4:  # Print only the first 5 lines
            break

print("*****")

# Step 3: Extract Usernames from Test Data
test_unames = []
with open(test_data_path, "rt", encoding="utf-8") as fh:
    for line in fh:
        test_unames.append(line.strip())

# Step 4: Verify Output
print(test_unames[:5])  # Display the first 5 usernames

ozhotelstr
elleturkiye
sozerinsaatorhangazi
sanliurfapiazzaavym
rusanozden
*****
['ozhotelstr', 'elleturkiye', 'sozerinsaatorhangazi', 'sanliurfapiazzaavym', 'rusanozden']


# Naive Base Classifier

### Now we can pass the numerical values to a classifier, Let's try Naive Base!


# Like Count Prediction


Here, we use the average like_count of the user's previous posts to predict each post's like_count

In [17]:
def predict_like_count(username, current_post=None):
  def get_avg_like_count(posts:list):
    total = 0.
    for post in posts:
      if current_post is not None and post["id"] == current_post["id"]:
        continue

      like_count = post.get("like_count", 0)
      if like_count is None:
        like_count = 0
      total += like_count

    if len(posts) == 0:
      return 0.

    return total / len(posts)

  if username in username2posts_train:
    return get_avg_like_count(username2posts_train[username])
  elif username in username2posts_test:
    return get_avg_like_count(username2posts_test[username])
  else:
    print(f"No data available for {username}")
    return -1

In [18]:
def log_mse_like_counts(y_true, y_pred):
  """
  Calculate the Log Mean Squared Error (Log MSE) for like counts (log(like_count + 1)).

  Parameters:
  - y_true: array-like, actual like counts
  - y_pred: array-like, predicted like counts

  Returns:
  - log_mse: float, Log Mean Squared Error
  """
  # Ensure inputs are numpy arrays
  y_true = np.array(y_true)
  y_pred = np.array(y_pred)

  # Log transformation: log(like_count + 1)
  log_y_true = np.log1p(y_true)
  log_y_pred = np.log1p(y_pred)

  # Compute squared errors
  squared_errors = (log_y_true - log_y_pred) ** 2

  # Return the mean of squared errors
  return np.mean(squared_errors)

In [None]:
#@title Train Dataset evaluation

y_like_count_train_true = []
y_like_count_train_pred = []
for uname, posts in username2posts_train.items():
  for post in posts:
    pred_val = predict_like_count(uname, post)
    true_val = post.get("like_count", 0)
    if true_val is None:
      true_val = 0

    y_like_count_train_true.append(true_val)
    y_like_count_train_pred.append(pred_val)

print(f"Log MSE Train= {log_mse_like_counts(y_like_count_train_true, y_like_count_train_pred)}")

Log MSE Train= 1.2271047744059362


In [None]:
# Step 1: Define File Paths Dynamically
# Get the current notebook directory
current_notebook_dir = os.getcwd()

# Get the repo directory (assuming notebooks are inside the "notebooks" folder)
repo_dir = os.path.abspath(os.path.join(current_notebook_dir, '..'))

# Get the data directory
data_dir = os.path.join(repo_dir, 'data')

# Get the testing directory
testing_dir = os.path.join(data_dir, 'testing')

# File path for 'test-regression-round1.jsonl'
test_dataset_path = os.path.join(testing_dir, 'test-regression-round1.jsonl')

# File path for output
output_dir = os.path.join(data_dir, 'output')
os.makedirs(output_dir, exist_ok=True)  # Ensure the output directory exists
output_file_path = os.path.join(output_dir, 'test-regression-round1.jsonl')

# Step 2: Process the Test Dataset
to_predict_like_counts_usernames = []
output_list = []

with open(test_dataset_path, "rt", encoding="utf-8") as fh:
    for line in fh:
        sample = json.loads(line)

        # Perform prediction
        pred_val = predict_like_count(sample["username"])  # Ensure `predict_like_count` is defined
        sample["like_count"] = int(pred_val)
        output_list.append(sample)

# Step 3: Save the Output to a File
with open(output_file_path, "wt", encoding="utf-8") as of:
    json.dump(output_list, of)

# Step 4: Output Verification
print(f"Processed data saved to: {output_file_path}")

Processed data saved to: c:\Users\itsmm\OneDrive\Desktop\CS412\CS412-InstagramInfluencersAnalysis\data\output\test-regression-round1.jsonl


In [21]:
# output_list first 3 items
pprint(output_list[:3])

[{'caption': 'KOZA 2023 2.si Damla’nın koleksiyonu, Latincede ‘Memento Mori’ '
             'olarak bilinen ‘ölümlü olduğunu hatırla’ anlamındaki ifadeden '
             'esinleniyor. Koleksiyon, hayatın ve ölümün, para, işçi, kral ve '
             'kraliçe kavramları üzerinden yaratıcı görünümlerle bir araya '
             'getirilmesini amaçlıyor. Ölüm sembollerinden esinlenen desenler '
             'kullanan Damla, “kağıt parçasından ibaret olmak” kavramını '
             'vurguluyor. Koleksiyon, yaşamın ve ölümün aynı anda ifade '
             'edilmesini hedefliyor; kırmızı ve mavi ışıklarla veya '
             'gözlüklerle görülen hologram efekti kullanılarak bu konsept '
             'sahneye taşınıyor. Kırmızı renk ölümü, mavi ise yaşamı '
             'simgeliyor. Koleksiyon, ofis giyimlerinden esinlenerek '
             'kravatlar, gömlekler ve evrak çantaları içeriyor. Klasik sivri '
             'burun çizmelerin üzerine spor ayakkabıların üst yüzeyi '
             'yerle