# Import Dependencies

In [180]:
import numpy as np
import pandas as pd
import gzip
import json
import os

from pprint import pprint

In [181]:
#@title Turkish StopWords

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
turkish_stopwords = stopwords.words('turkish')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\itsmm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Influencer Category Classification



1.   Read Data
2.   Preprocess Data
3.   Prepare Model
4.   Predict Test Data
4.   Save outputs



In [183]:
# Step 1: Define File Paths Dynamically
# Get the current notebook directory
current_notebook_dir = os.getcwd()

# Get the repo directory (assuming notebooks are inside the "notebooks" folder)
repo_dir = os.path.abspath(os.path.join(current_notebook_dir, '..'))

# Get the data directory
data_dir = os.path.join(repo_dir, 'data')

# Get the training directory
training_dir = os.path.join(data_dir, 'training')

# File path for 'train-classification.csv'
train_classification_path = os.path.join(training_dir, 'train-classification.csv')

# Step 2: Load Data Dynamically
train_classification_df = pd.read_csv(train_classification_path)
train_classification_df = train_classification_df.rename(columns={'Unnamed: 0': 'user_id', 'label': 'category'})

# Step 3: Unify Labels
train_classification_df["category"] = train_classification_df["category"].apply(str.lower)

# Step 4: Create User-to-Category Mapping
username2_category = train_classification_df.set_index("user_id").to_dict()["category"]

# Step 5: Verify Output
print("First few rows of the training classification DataFrame:")
train_classification_df.head()

First few rows of the training classification DataFrame:


Unnamed: 0,user_id,category
0,taskirancemal,mom and children
1,tam_kararinda,food
2,spart4nn,food
3,sosyalyiyiciler,food
4,sonaydizdarahad,mom and children


In [184]:
# stats about the labels
train_classification_df.groupby("category").count()

Unnamed: 0_level_0,user_id
category,Unnamed: 1_level_1
art,191
entertainment,323
fashion,299
food,511
gaming,13
health and lifestyle,503
mom and children,149
sports,113
tech,346
travel,294


In [185]:
username2_category["kod8net"]

'tech'

In [207]:
# Step 1: Define File Paths Dynamically
# Get the current notebook directory
current_notebook_dir = os.getcwd()

# Get the repo directory (assuming notebooks are inside the "notebooks" folder)
repo_dir = os.path.abspath(os.path.join(current_notebook_dir, '..'))

# Get the data directory
data_dir = os.path.join(repo_dir, 'data')

# Get the training directory
training_dir = os.path.join(data_dir, 'training')

# File path for 'training-dataset.jsonl.gz'
train_data_path = os.path.join(training_dir, 'training-dataset.jsonl.gz')

# Step 2: Initialize Dictionaries for Data
username2posts_train = dict()
username2profile_train = dict()

username2posts_test = dict()
username2profile_test = dict()

# Step 3: Process Data from 'training-dataset.jsonl.gz'
with gzip.open(train_data_path, "rt", encoding="utf-8") as fh:
    for line in fh:
        sample = json.loads(line)

        profile = sample["profile"]
        username = profile.get("username", "").strip()  # Handle missing or empty usernames
        if not username:
            continue  # Skip if username is missing or empty

        if username in username2_category:
            # Train data info
            username2posts_train[username] = sample["posts"]
            username2profile_train[username] = profile
        else:
            # Test data info
            username2posts_test[username] = sample["posts"]
            username2profile_test[username] = profile

# Step 4: Verify Output
print(f"Number of Training Users: {len(username2posts_train)}")
print(f"Number of Testing Users: {len(username2posts_test)}")

Number of Training Users: 2741
Number of Testing Users: 2674


In [211]:
# Step 4.1: Track Initial Row Counts
print(f"Number of Training Users (before enrichment): {len(username2profile_train)}")
print(f"Number of Testing Users (before enrichment): {len(username2profile_test)}")

# Step 4.2: Load additional annotated data
data_dir = os.path.dirname(train_data_path)  # Get the directory containing the JSONL file
extra_file_1 = os.path.join(data_dir, "annotated_users_CS412-9e7e94622d40Musa.csv")
extra_file_2 = os.path.join(data_dir, "annotated_users_CS412-2bedd0f2b2f6Neda.csv")

# Load the additional data
extra_data_1 = pd.read_csv(extra_file_1)
extra_data_2 = pd.read_csv(extra_file_2)

# Combine the two files
extra_data = pd.concat([extra_data_1, extra_data_2], ignore_index=True)

# Ensure consistent column names
extra_data.rename(
    columns={
        "username": "username",
        "influencerCategory": "category",
        "influencerMention": "mention",
        "accountType": "account_type",
    },
    inplace=True,
)

# Step 4.3: Add users from extra data to training profiles if they don't already exist
new_users_added_to_train = 0
for _, row in extra_data.iterrows():
    username = row['username']
    if username not in username2profile_train:
        # Add new user to training data
        username2profile_train[username] = {
            "username": username,
            "category": row["category"],
            "mention": row["mention"],
            "account_type": row["account_type"],
        }
        new_users_added_to_train += 1

new_users_added_to_test = 0
for _, row in extra_data.iterrows():
    username = row['username']
    if username not in username2profile_test:
        # Add new user to testing data
        username2profile_test[username] = {
            "username": username,
            "category": row["category"],
            "mention": row["mention"],
            "account_type": row["account_type"],
        }
        new_users_added_to_test += 1

# Step 4.4: Convert enriched dictionaries back to DataFrames
train_profile_df = pd.DataFrame.from_dict(username2profile_train, orient='index').reset_index()
test_profile_df = pd.DataFrame.from_dict(username2profile_test, orient='index').reset_index()

# Rename 'index' column to 'username'
train_profile_df.rename(columns={"index": "username"}, inplace=True)
test_profile_df.rename(columns={"index": "username"}, inplace=True)

# Step 4.5: Print Row Counts After Enrichment
print(f"Number of Training Users (after enrichment): {len(train_profile_df)} (Added {new_users_added_to_train} new users)")
print(f"Number of Testing Users (after enrichment): {len(test_profile_df)} (Added {new_users_added_to_test} new users)")


Number of Training Users (before enrichment): 2741
Number of Testing Users (before enrichment): 2674
Number of Training Users (after enrichment): 2945 (Added 204 new users)
Number of Testing Users (after enrichment): 2803 (Added 129 new users)


In [212]:
# Profile Dataframe
train_profile_df = pd.DataFrame(username2profile_train).T.reset_index(drop=True)
test_profile_df = pd.DataFrame(username2profile_test).T.reset_index(drop=True)

train_profile_df.head(2)

Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64,category,mention,account_type
0,deparmedya,3170700063,Depar Medya,#mediaplanning #mediabuying #sosyalmedya,Local business,,1167,192,True,False,...,False,False,https://instagram.fsaw2-3.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,,,
1,kafesfirin,266439571,KAFES FIRIN,📍Söğütözü📍FTZ AVM\n🛒Ankara macro▲center v...,Brand,,11997,17,True,False,...,False,False,https://instagram.fada1-13.fna.fbcdn.net/v/t51...,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,,,


In [213]:
test_profile_df.head(2)

Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64,category,mention,account_type
0,beyazyakaliyiz,8634457436,Selam Beyaz Yakalı,Beyaz yakalıların dünyasına hoşgeldiniz 😀😀😀,Personal blog,,1265,665,True,False,...,False,False,https://instagram.fist6-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,,,
1,totalenergies_istasyonlari,7066643793,TotalEnergies İstasyonları,TotalEnergies İstasyonları resmi Instagram hes...,Energy Company,,28025,4,True,False,...,False,False,https://instagram.fsaw2-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,Tech,No,Company


In [195]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

def preprocess_text(text: str):
    # lower casing Turkish Text, Don't use str.lower :)
    text = text.casefold()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove special characters and punctuation
    # HERE THE EMOJIS stuff are being removed, you may want to keep them :D
    text = re.sub(r'[^a-zçğıöşü0-9\s#@]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text


corpus = []

# to keep the label order
train_usernames = []

for username, posts in username2posts_train.items():
  train_usernames.append(username)

  # aggregating the posts per user
  cleaned_captions = []
  for post in posts:
    post_caption = post.get("caption", "")
    if post_caption is None:
      continue

    post_caption = preprocess_text(post_caption)

    if post_caption != "":
      cleaned_captions.append(post_caption)


  # joining the posts of each user with a \n
  user_post_captions = "\n".join(cleaned_captions)
  corpus.append(user_post_captions)


vectorizer = TfidfVectorizer(stop_words=turkish_stopwords, max_features=5000)

# fit the vectorizer
vectorizer.fit(corpus)


# transform the data into vectors
x_post_train = vectorizer.transform(corpus)
y_train = [username2_category.get(uname, "NA") for uname in train_usernames]


test_usernames = []
test_corpus = []
for username, posts in username2posts_test.items():
  test_usernames.append(username)
  # aggregating the posts per user
  cleaned_captions = []
  for post in posts:
    post_caption = post.get("caption", "")
    if post_caption is None:
      continue

    post_caption = preprocess_text(post_caption)

    if post_caption != "":
      cleaned_captions.append(post_caption)

  user_post_captions = "\n".join(cleaned_captions)
  test_corpus.append(user_post_captions)


# Just transforming! No Fitting!!!!!
x_post_test = vectorizer.transform(test_corpus)

In [196]:
# Making sure everything is fine
assert y_train.count("NA") == 0

In [197]:
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['abdullah', 'abone', 'about', ..., 'şık', 'şıklık', 'şıklığı'],
      dtype=object)

In [198]:
df_tfidf = pd.DataFrame(x_post_train.toarray(), columns=feature_names)
df_tfidf.head(2)

Unnamed: 0,abdullah,abone,about,acele,acil,activities,acı,ad,ada,adam,...,şubemiz,şubesi,şölen,şöleni,şöyle,şükranla,şükür,şık,şıklık,şıklığı
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050596,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [199]:
df_tfidf.shape

(2741, 5000)

In [200]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(df_tfidf, y_train, test_size=0.2, stratify=y_train)

In [201]:
x_train.shape

(2192, 5000)

In [202]:
x_val.shape

(549, 5000)

# Naive Base Classifier

### Now we can pass the numerical values to a classifier, Let's try Naive Base!


In [203]:

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


model = MultinomialNB()
model.fit(x_train, y_train)

In [204]:
#@title Train Data
y_train_pred = model.predict(x_train)

print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("\nClassification Report:")
print(classification_report(y_train, y_train_pred, zero_division=0))

Accuracy: 0.6656021897810219

Classification Report:
                      precision    recall  f1-score   support

                 art       0.89      0.10      0.19       153
       entertainment       0.62      0.58      0.60       258
             fashion       0.76      0.75      0.76       239
                food       0.81      0.91      0.86       409
              gaming       0.00      0.00      0.00        10
health and lifestyle       0.50      0.85      0.63       402
    mom and children       0.91      0.08      0.15       119
              sports       1.00      0.08      0.14        90
                tech       0.77      0.83      0.80       277
              travel       0.62      0.65      0.64       235

            accuracy                           0.67      2192
           macro avg       0.69      0.48      0.48      2192
        weighted avg       0.72      0.67      0.63      2192



In [205]:
#@title Validation Data
y_val_pred = model.predict(x_val)

print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred, zero_division=0))

Accuracy: 0.5901639344262295

Classification Report:
                      precision    recall  f1-score   support

                 art       1.00      0.05      0.10        38
       entertainment       0.46      0.45      0.45        65
             fashion       0.64      0.62      0.63        60
                food       0.72      0.82      0.77       102
              gaming       0.00      0.00      0.00         3
health and lifestyle       0.44      0.81      0.57       100
    mom and children       0.00      0.00      0.00        30
              sports       1.00      0.04      0.08        23
                tech       0.75      0.71      0.73        69
              travel       0.69      0.69      0.69        59

            accuracy                           0.59       549
           macro avg       0.57      0.42      0.40       549
        weighted avg       0.62      0.59      0.55       549



In [170]:
# Step 1: Define File Paths Dynamically
# Get the current notebook directory
current_notebook_dir = os.getcwd()

# Get the repo directory (assuming notebooks are inside the "notebooks" folder)
repo_dir = os.path.abspath(os.path.join(current_notebook_dir, '..'))

# Get the data directory
data_dir = os.path.join(repo_dir, 'data')

# Get the testing directory
testing_dir = os.path.join(data_dir, 'testing')

# File path for 'test-classification-round1.dat'
test_data_path = os.path.join(testing_dir, 'test-classification-round1.dat')

# Step 2: Preview First 5 Lines of the Test File
with open(test_data_path, "rt", encoding="utf-8") as fh:
    for i, line in enumerate(fh):
        print(line.strip())
        if i == 4:  # Print only the first 5 lines
            break

print("*****")

# Step 3: Extract Usernames from Test Data
test_unames = []
with open(test_data_path, "rt", encoding="utf-8") as fh:
    for line in fh:
        test_unames.append(line.strip())

# Step 4: Verify Output
print(test_unames[:5])  # Display the first 5 usernames

ozhotelstr
elleturkiye
sozerinsaatorhangazi
sanliurfapiazzaavym
rusanozden
*****
['ozhotelstr', 'elleturkiye', 'sozerinsaatorhangazi', 'sanliurfapiazzaavym', 'rusanozden']


In [171]:
x_test = []

for uname in test_unames:
  try:
    index = test_usernames.index(uname)
    x_test.append(x_post_test[index].toarray()[0])
  except Exception as e:
    try:
      index = train_usernames.index(uname)
      x_test.append(x_post_train[index].toarray()[0])
    except Exception as e:
      print(uname)


test_unames.remove("screenname")

screenname


In [172]:
df_test = pd.DataFrame(np.array(x_test), columns=feature_names)
df_test.head(2)

Unnamed: 0,abdullah,abone,about,acele,acil,activities,acı,ad,ada,adam,...,şubemiz,şubesi,şölen,şöleni,şöyle,şükranla,şükür,şık,şıklık,şıklığı
0,0.0,0.0,0.0,0.0,0.0,0.013628,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.025994,0.0,0.0,0.011087,0.0,0.0


In [173]:
test_pred = model.predict(df_test)

output = dict()
for index, uname in enumerate(test_unames):
  output[uname] = test_pred[index]

In [174]:
with open("output.json", "w") as of:
  json.dump(output, of, indent=4)

# Like Count Prediction


Here, we use the average like_count of the user's previous posts to predict each post's like_count

In [175]:
def predict_like_count(username, current_post=None):
  def get_avg_like_count(posts:list):
    total = 0.
    for post in posts:
      if current_post is not None and post["id"] == current_post["id"]:
        continue

      like_count = post.get("like_count", 0)
      if like_count is None:
        like_count = 0
      total += like_count

    if len(posts) == 0:
      return 0.

    return total / len(posts)

  if username in username2posts_train:
    return get_avg_like_count(username2posts_train[username])
  elif username in username2posts_test:
    return get_avg_like_count(username2posts_test[username])
  else:
    print(f"No data available for {username}")
    return -1

In [176]:
def log_mse_like_counts(y_true, y_pred):
  """
  Calculate the Log Mean Squared Error (Log MSE) for like counts (log(like_count + 1)).

  Parameters:
  - y_true: array-like, actual like counts
  - y_pred: array-like, predicted like counts

  Returns:
  - log_mse: float, Log Mean Squared Error
  """
  # Ensure inputs are numpy arrays
  y_true = np.array(y_true)
  y_pred = np.array(y_pred)

  # Log transformation: log(like_count + 1)
  log_y_true = np.log1p(y_true)
  log_y_pred = np.log1p(y_pred)

  # Compute squared errors
  squared_errors = (log_y_true - log_y_pred) ** 2

  # Return the mean of squared errors
  return np.mean(squared_errors)

In [177]:
#@title Train Dataset evaluation

y_like_count_train_true = []
y_like_count_train_pred = []
for uname, posts in username2posts_train.items():
  for post in posts:
    pred_val = predict_like_count(uname, post)
    true_val = post.get("like_count", 0)
    if true_val is None:
      true_val = 0

    y_like_count_train_true.append(true_val)
    y_like_count_train_pred.append(pred_val)

print(f"Log MSE Train= {log_mse_like_counts(y_like_count_train_true, y_like_count_train_pred)}")

Log MSE Train= 1.2271047744059362


In [178]:
# Step 1: Define File Paths Dynamically
# Get the current notebook directory
current_notebook_dir = os.getcwd()

# Get the repo directory (assuming notebooks are inside the "notebooks" folder)
repo_dir = os.path.abspath(os.path.join(current_notebook_dir, '..'))

# Get the data directory
data_dir = os.path.join(repo_dir, 'data')

# Get the testing directory
testing_dir = os.path.join(data_dir, 'testing')

# File path for 'test-regression-round1.jsonl'
test_dataset_path = os.path.join(testing_dir, 'test-regression-round1.jsonl')

# File path for output
output_dir = os.path.join(data_dir, 'output')
os.makedirs(output_dir, exist_ok=True)  # Ensure the output directory exists
output_file_path = os.path.join(output_dir, 'test-regression-round1.jsonl')

# Step 2: Process the Test Dataset
to_predict_like_counts_usernames = []
output_list = []

with open(test_dataset_path, "rt", encoding="utf-8") as fh:
    for line in fh:
        sample = json.loads(line)

        # Perform prediction
        pred_val = predict_like_count(sample["username"])  # Ensure `predict_like_count` is defined
        sample["like_count"] = int(pred_val)
        output_list.append(sample)

# Step 3: Save the Output to a File
with open(output_file_path, "wt", encoding="utf-8") as of:
    json.dump(output_list, of)

# Step 4: Output Verification
print(f"Processed data saved to: {output_file_path}")

Processed data saved to: c:\Users\itsmm\OneDrive\Desktop\CS412\CS412-InstagramInfluencersAnalysis\data\output\test-regression-round1.jsonl


In [179]:
# output_list first 3 items
pprint(output_list[:3])

[{'caption': 'KOZA 2023 2.si Damla’nın koleksiyonu, Latincede ‘Memento Mori’ '
             'olarak bilinen ‘ölümlü olduğunu hatırla’ anlamındaki ifadeden '
             'esinleniyor. Koleksiyon, hayatın ve ölümün, para, işçi, kral ve '
             'kraliçe kavramları üzerinden yaratıcı görünümlerle bir araya '
             'getirilmesini amaçlıyor. Ölüm sembollerinden esinlenen desenler '
             'kullanan Damla, “kağıt parçasından ibaret olmak” kavramını '
             'vurguluyor. Koleksiyon, yaşamın ve ölümün aynı anda ifade '
             'edilmesini hedefliyor; kırmızı ve mavi ışıklarla veya '
             'gözlüklerle görülen hologram efekti kullanılarak bu konsept '
             'sahneye taşınıyor. Kırmızı renk ölümü, mavi ise yaşamı '
             'simgeliyor. Koleksiyon, ofis giyimlerinden esinlenerek '
             'kravatlar, gömlekler ve evrak çantaları içeriyor. Klasik sivri '
             'burun çizmelerin üzerine spor ayakkabıların üst yüzeyi '
             'yerle