In [186]:
# Instagram Influencers Analysis
# Authors: Musa Misto & Neda Mohamed

# Purpose:
# This notebook serves as the main workspace for the CS412 course project on Instagram influencer analysis.
# The project aims to develop machine learning models for two primary tasks:
# 1. Multi-class classification to predict influencer categories based on profile meta-data and recent posts.
# 2. Regression analysis to estimate content popularity (e.g., like_count) using relevant features.
# The goal is to explore the dataset, preprocess the data, build and evaluate models, and document findings.

# Imports

In [187]:
import os
import gzip
import json
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Reading Data


In [219]:
# 1) Download stopwords if they aren't already downloaded
nltk.download('stopwords')

# 2) Load Turkish stopwords
turkish_stopwords = stopwords.words('turkish')

# 3) Dynamically locate training data paths (no prints)
current_notebook_dir = os.getcwd()  # e.g. "repo/notebooks"
repo_dir = os.path.abspath(os.path.join(current_notebook_dir, '..'))  # "repo"
data_dir = os.path.join(repo_dir, 'data')                             # "repo/data"
training_dir = os.path.join(data_dir, 'training')                     # "repo/data/training"

train_csv_path = os.path.join(training_dir, 'train-classification.csv')
train_jsonl_path = os.path.join(training_dir, 'training-dataset.jsonl.gz')

# 4) Read the CSV into a DataFrame
df_classification = pd.read_csv(train_csv_path)

# 5) Read the JSONL.GZ lines into Python objects
records = []
with gzip.open(train_jsonl_path, 'rt', encoding='utf-8') as f:
    for line in f:
        record = json.loads(line)
        records.append(record)

# 6) Optionally, create two DataFrames if you want to split profile/post data
df_profiles_list = []
df_posts_list = []

for rec in records:
    profile = rec.get('profile', {})
    posts = rec.get('posts', [])
    
    df_profiles_list.append(profile)
    
    # For each post, attach the username to keep track
    for p in posts:
        post_entry = p.copy()  # shallow copy
        post_entry['username'] = profile.get('username')
        df_posts_list.append(post_entry)

df_profiles = pd.DataFrame(df_profiles_list)
df_posts = pd.DataFrame(df_posts_list)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\itsmm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# PREPROCESSING & FEATURE ENGINEERING

In [189]:
# Check the number of rows and columns in each DataFrame
print(f"Classification DataFrame: {df_classification.shape}")
print(f"Profiles DataFrame: {df_profiles.shape}")
print(f"Posts DataFrame: {df_posts.shape}")

Classification DataFrame: (2742, 2)
Profiles DataFrame: (5415, 44)
Posts DataFrame: (187302, 8)


In [190]:
# View column names and data types for each DataFrame
print("Classification DataFrame Info:")
print(df_classification.info())
print("\nProfiles DataFrame Info:")
print(df_profiles.info())
print("\nPosts DataFrame Info:")
print(df_posts.info())

Classification DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2742 entries, 0 to 2741
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  2742 non-null   object
 1   label       2742 non-null   object
dtypes: object(2)
memory usage: 43.0+ KB
None

Profiles DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5415 entries, 0 to 5414
Data columns (total 44 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   username                           5415 non-null   object 
 1   id                                 5415 non-null   object 
 2   full_name                          5359 non-null   object 
 3   biography                          5067 non-null   object 
 4   category_name                      4395 non-null   object 
 5   post_count                         579 non-null    float64
 6   followe

In [191]:
# Check for missing values in each DataFrame
print("Missing Values in Classification DataFrame:")
print(df_classification.isnull().sum())
print("\nMissing Values in Profiles DataFrame:")
print(df_profiles.isnull().sum())
print("\nMissing Values in Posts DataFrame:")
print(df_posts.isnull().sum())

Missing Values in Classification DataFrame:
Unnamed: 0    0
label         0
dtype: int64

Missing Values in Profiles DataFrame:
username                                0
id                                      0
full_name                              56
biography                             348
category_name                        1020
post_count                           4836
follower_count                          0
following_count                         0
is_business_account                     0
is_private                              0
is_verified                             0
highlight_reel_count                    0
bio_links                               0
entities                              348
ai_agent_type                        5415
fb_profile_biolink                   5415
restricted_by_viewer                 5415
country_block                           0
eimu_id                                 0
external_url                          925
fbid                            

In [192]:
# Display the first few rows of df_classification
print(df_classification.head())
print("\n")

# Rename 'Unnamed: 0' to 'username'
df_classification.rename(columns={'Unnamed: 0': 'username'}, inplace=True)

# Verify the renaming
print(df_classification.head())
print("\n")

# Check the updated columns
print(df_classification.columns)
print("\n")

# Check for duplicate usernames in Classification DataFrame
duplicate_classification = df_classification['username'].duplicated().sum()
print(f"Duplicate Usernames in Classification DataFrame: {duplicate_classification}")

        Unnamed: 0             label
0    taskirancemal  Mom and Children
1    tam_kararinda              Food
2         spart4nn              Food
3  sosyalyiyiciler              Food
4  sonaydizdarahad  Mom and Children


          username             label
0    taskirancemal  Mom and Children
1    tam_kararinda              Food
2         spart4nn              Food
3  sosyalyiyiciler              Food
4  sonaydizdarahad  Mom and Children


Index(['username', 'label'], dtype='object')


Duplicate Usernames in Classification DataFrame: 0


In [193]:
# List all column names in df_profiles
print(df_profiles.columns.tolist())

# Find duplicated column names
duplicated_columns = df_profiles.columns[df_profiles.columns.duplicated()]
print(f"Duplicated Columns in Profiles DataFrame: {duplicated_columns.tolist()}")

# Remove duplicated columns, keeping the first occurrence
df_profiles_clean = df_profiles.loc[:, ~df_profiles.columns.duplicated()]

# Verify the columns after removal
print("Columns after removing duplicates:", df_profiles_clean.columns.tolist())

# Check for any remaining duplicated columns
remaining_duplicates = df_profiles_clean.columns[df_profiles_clean.columns.duplicated()]
print(f"Remaining Duplicated Columns in Profiles DataFrame: {remaining_duplicates.tolist()}")

['username', 'id', 'full_name', 'biography', 'category_name', 'post_count', 'follower_count', 'following_count', 'is_business_account', 'is_private', 'is_verified', 'highlight_reel_count', 'bio_links', 'entities', 'ai_agent_type', 'fb_profile_biolink', 'restricted_by_viewer', 'country_block', 'eimu_id', 'external_url', 'fbid', 'has_clips', 'hide_like_and_view_counts', 'is_professional_account', 'is_supervision_enabled', 'is_guardian_of_viewer', 'is_supervised_by_viewer', 'is_supervised_user', 'is_embeds_disabled', 'is_joined_recently', 'business_address_json', 'business_contact_method', 'business_email', 'business_phone_number', 'business_category_name', 'overall_category_name', 'category_enum', 'is_verified_by_mv4b', 'is_regulated_c18', 'profile_pic_url', 'should_show_category', 'should_show_public_contacts', 'show_account_transparency_details', 'profile_picture_base64']
Duplicated Columns in Profiles DataFrame: []
Columns after removing duplicates: ['username', 'id', 'full_name', 'bi

In [194]:
# 1. Drop columns that are entirely missing or irrelevant
columns_to_drop = ['ai_agent_type', 'fb_profile_biolink', 'restricted_by_viewer']
df_profiles_clean = df_profiles_clean.drop(columns=columns_to_drop, errors='ignore')

# 2. Fill missing 'full_name' with empty string
df_profiles_clean['full_name'] = df_profiles_clean['full_name'].fillna('')

# 3. Fill missing 'biography' with empty string
df_profiles_clean['biography'] = df_profiles_clean['biography'].fillna('')

# 4. Fill missing 'category_name' with 'Unknown'
df_profiles_clean['category_name'] = df_profiles_clean['category_name'].fillna('Unknown')

# 5. Fill missing 'post_count' with 0 and convert to integer
df_profiles_clean['post_count'] = df_profiles_clean['post_count'].fillna(0).astype(int)

# 6. Handle 'entities' by filling missing with empty dictionary
df_profiles_clean['entities'] = df_profiles_clean['entities'].fillna({})

# 7. Fill missing 'external_url' with empty string
if 'external_url' in df_profiles_clean.columns:
    df_profiles_clean['external_url'] = df_profiles_clean['external_url'].fillna('')

# Check for remaining missing values in Profiles DataFrame
print("Missing Values after Cleaning in Profiles DataFrame:")
print(df_profiles_clean.isnull().sum())

# 1. Drop 'profile_picture_base64' column
df_profiles_clean = df_profiles_clean.drop(columns=['profile_picture_base64'], errors='ignore')

# 2. Drop 'entities' column (if deemed unnecessary)
df_profiles_clean = df_profiles_clean.drop(columns=['entities'], errors='ignore')

# If you decide to keep 'entities', use the following instead:
# df_profiles_clean['entities'] = df_profiles_clean['entities'].fillna({})

print("Dropped 'profile_picture_base64' and 'entities' columns from Profiles DataFrame.")


Missing Values after Cleaning in Profiles DataFrame:
username                                0
id                                      0
full_name                               0
biography                               0
category_name                           0
post_count                              0
follower_count                          0
following_count                         0
is_business_account                     0
is_private                              0
is_verified                             0
highlight_reel_count                    0
bio_links                               0
entities                              348
country_block                           0
eimu_id                                 0
external_url                            0
fbid                                    0
has_clips                               0
hide_like_and_view_counts               0
is_professional_account                 0
is_supervision_enabled                  0
is_guardian_of_viewer  

In [195]:
# Check for remaining missing values in Profiles DataFrame
print("Missing Values after Cleaning in Profiles DataFrame:")
print(df_profiles_clean.isnull().sum())

# Define columns to drop that have all entries missing
columns_to_drop_all_missing = ['business_email', 'business_phone_number', 'overall_category_name']

# Drop these columns from df_profiles_clean
df_profiles_clean = df_profiles_clean.drop(columns=columns_to_drop_all_missing, errors='ignore')

print("Dropped columns with all missing values:", columns_to_drop_all_missing)

# Define columns with high missingness
columns_to_drop_high_missing = ['business_category_name', 'category_enum']

# Drop these columns from df_profiles_clean
df_profiles_clean = df_profiles_clean.drop(columns=columns_to_drop_high_missing, errors='ignore')

print("Dropped columns with high missingness:", columns_to_drop_high_missing)

# Check for remaining missing values in Profiles DataFrame
print("Missing Values after Final Cleaning in Profiles DataFrame:")
print(df_profiles_clean.isnull().sum())


Missing Values after Cleaning in Profiles DataFrame:
username                                0
id                                      0
full_name                               0
biography                               0
category_name                           0
post_count                              0
follower_count                          0
following_count                         0
is_business_account                     0
is_private                              0
is_verified                             0
highlight_reel_count                    0
bio_links                               0
country_block                           0
eimu_id                                 0
external_url                            0
fbid                                    0
has_clips                               0
hide_like_and_view_counts               0
is_professional_account                 0
is_supervision_enabled                  0
is_guardian_of_viewer                   0
is_supervised_by_viewer

In [196]:
# 1. Define the columns to drop
columns_to_drop_remaining = ['entities', 'profile_picture_base64']

# 2. Drop the specified columns from df_profiles_clean
df_profiles_clean = df_profiles_clean.drop(columns=columns_to_drop_remaining, errors='ignore')

print(f"Dropped columns: {columns_to_drop_remaining}")

# Check for remaining missing values in Profiles DataFrame
print("Missing Values after Final Cleaning in Profiles DataFrame:")
print(df_profiles_clean.isnull().sum())


Dropped columns: ['entities', 'profile_picture_base64']
Missing Values after Final Cleaning in Profiles DataFrame:
username                             0
id                                   0
full_name                            0
biography                            0
category_name                        0
post_count                           0
follower_count                       0
following_count                      0
is_business_account                  0
is_private                           0
is_verified                          0
highlight_reel_count                 0
bio_links                            0
country_block                        0
eimu_id                              0
external_url                         0
fbid                                 0
has_clips                            0
hide_like_and_view_counts            0
is_professional_account              0
is_supervision_enabled               0
is_guardian_of_viewer                0
is_supervised_by_viewer    

In [197]:
# Standardize 'username' in Classification DataFrame
df_classification_clean = df_classification.copy()
df_classification_clean['username'] = df_classification_clean['username'].str.strip().str.lower()

# Standardize 'username' in Profiles DataFrame
df_profiles_clean = df_profiles_clean.copy()
df_profiles_clean['username'] = df_profiles_clean['username'].str.strip().str.lower()

# Inspect standardized usernames in Classification DataFrame
print("First 5 standardized usernames in df_classification_clean:")
print(df_classification_clean['username'].head())

# Inspect standardized usernames in Profiles DataFrame
print("\nFirst 5 standardized usernames in df_profiles_clean:")
print(df_profiles_clean['username'].head())


First 5 standardized usernames in df_classification_clean:
0      taskirancemal
1      tam_kararinda
2           spart4nn
3    sosyalyiyiciler
4    sonaydizdarahad
Name: username, dtype: object

First 5 standardized usernames in df_profiles_clean:
0                    deparmedya
1                beyazyakaliyiz
2                    kafesfirin
3                      vimerang
4    totalenergies_istasyonlari
Name: username, dtype: object


In [198]:
import pandas as pd

# Assuming df_posts is your posts DataFrame and has a 'username' column
# and df_classification_clean and df_profiles_clean have been standardized

# Create a set of overlapping usernames for efficient lookup
overlapping_users = set(df_classification_clean['username']).intersection(set(df_profiles_clean['username']))

print(f"Number of overlapping users: {len(overlapping_users)}")
print(f"Sample overlapping usernames: {list(overlapping_users)[:10]}")

Number of overlapping users: 2741
Sample overlapping usernames: ['sumeyyeboyacii', 'safiloturkey', 'sivilsayfalar', 'frozzafood', 'yalcinyasemin', 'hatemoglu1924', 'istanbuldentopia', 'a.y.s.store', 'profdrbariscaynak', 'tick.tock.boom']


In [199]:
# Filter df_posts to include only overlapping users
df_posts_overlap = df_posts[df_posts['username'].isin(overlapping_users)].copy()

print(f"Shape of df_posts_overlap: {df_posts_overlap.shape}")

Shape of df_posts_overlap: (94824, 8)


In [200]:
# Convert 'timestamp' to datetime
df_posts_overlap['timestamp'] = pd.to_datetime(df_posts_overlap['timestamp'], errors='coerce')

# Check for any missing or invalid timestamps
missing_timestamps = df_posts_overlap['timestamp'].isnull().sum()
print(f"Number of missing or invalid 'timestamp' entries: {missing_timestamps}")

# Optionally, drop rows with missing timestamps
df_posts_overlap = df_posts_overlap.dropna(subset=['timestamp'])
print(f"Shape after dropping missing timestamps: {df_posts_overlap.shape}")

Number of missing or invalid 'timestamp' entries: 0
Shape after dropping missing timestamps: (94824, 8)


In [201]:
# Aggregating post-level data
df_post_agg = df_posts_overlap.groupby('username').agg(
    avg_likes=('like_count', 'mean'),
    avg_comments=('comments_count', 'mean'),
    total_posts=('id', 'count'),       # Assuming 'id' is unique per post
    last_post=('timestamp', 'max')     # Most recent post date
).reset_index()

print("\nAggregated Post-Level Data (df_post_agg):")
print(df_post_agg.head())


Aggregated Post-Level Data (df_post_agg):
        username   avg_likes  avg_comments  total_posts           last_post
0      1001sanat   20.657143      0.485714           35 2023-10-30 08:38:25
1   1924istanbul  571.742857     13.457143           35 2023-06-28 11:15:27
2     1dil1insan   16.400000      0.657143           35 2023-10-29 12:51:31
3  1kitap.1mekan  842.100000     25.342857           35 2023-10-08 09:25:46
4  1kizinonerisi  302.571429      1.571429           35 2023-09-08 19:02:15


In [202]:
# Merge classification and profiles data on 'username'
df_user = pd.merge(
    df_classification_clean,
    df_profiles_clean,
    on='username',
    how='left',            # Keeps all users from classification
    suffixes=('_class', '_profile')
)

print("\nMerged df_user DataFrame:")
print(df_user.head())


Merged df_user DataFrame:
          username             label          id        full_name  \
0    taskirancemal  Mom and Children  1282703608   Cemal Taşkıran   
1    tam_kararinda              Food  2114951482      Kaan Yarman   
2         spart4nn              Food   645610128     Cemil Ceylan   
3  sosyalyiyiciler              Food  1671187359                    
4  sonaydizdarahad  Mom and Children  1635669992  Sonay Demiryeri   

                                           biography    category_name  \
0                                     📍Antalya / Kaş     Entrepreneur   
1     Milliyet Pazar\nKalori Alacaksan Buna Değecek📚  Kitchen/cooking   
2  Küçük ev\nKamp ve doğa hayatı🏕\nMutfağımız doğ...    Video creator   
3  Founder @bitte.izmir \nIZMIR yemek/seyahat foo...          Unknown   
4  DİZDAR 🧿 Ahad 🧿 Dağhan \n@d.a.d.kids \n@dilanp...    Personal blog   

   post_count  follower_count  following_count is_business_account  ...  \
0      1382.0         12145.0           1064

In [203]:
# Merge post-level aggregated data into df_user
df_user = pd.merge(
    df_user,
    df_post_agg,
    on='username',
    how='left'            # Keeps all users from df_user
)

print("\ndf_user after merging post-level aggregated data:")
print(df_user[['username', 'avg_likes', 'avg_comments', 'total_posts', 'last_post']].head())


df_user after merging post-level aggregated data:
          username     avg_likes  avg_comments  total_posts  \
0    taskirancemal    422.971429     12.657143         35.0   
1    tam_kararinda  13460.457143    260.685714         35.0   
2         spart4nn  93142.085714    586.257143         35.0   
3  sosyalyiyiciler    309.285714     17.971429         35.0   
4  sonaydizdarahad  45827.342857    625.685714         35.0   

            last_post  
0 2023-05-15 14:32:32  
1 2023-10-17 14:55:11  
2 2023-10-14 14:17:30  
3 2023-10-08 07:47:03  
4 2023-10-07 20:04:25  


In [204]:
# Number of users with post-level data
num_with_posts = df_user['avg_likes'].notnull().sum()
num_without_posts = df_user['avg_likes'].isnull().sum()

print(f"\nNumber of users with post-level data: {num_with_posts}")
print(f"Number of users without post-level data: {num_without_posts}")

# Display users without post-level data, if any
if num_without_posts > 0:
    print("\nSample users without post-level data:")
    print(df_user[df_user['avg_likes'].isnull()]['username'].head(10))
else:
    print("\nAll users have post-level data.")


Number of users with post-level data: 2719
Number of users without post-level data: 23

Sample users without post-level data:
682      belediyesikose
1037    khloekardashian
1044      karikaturcaps
1048         aslangozde
1068       sserelyereli
1148        meerveebass
1270            hakanuc
1313         halicizade
1407    kumbahcemobilya
1481         sebainsaat
Name: username, dtype: object


In [205]:
# Fill missing numerical aggregated data with 0
df_user[['avg_likes', 'avg_comments', 'total_posts']] = df_user[['avg_likes', 'avg_comments', 'total_posts']].fillna(0)

print("\nNumerical Aggregated Features After Filling Missing Values:")
print(df_user[['username', 'avg_likes', 'avg_comments', 'total_posts']].tail(25))


Numerical Aggregated Features After Filling Missing Values:
                  username     avg_likes  avg_comments  total_posts
2717             serefoguz    140.028571      2.771429         35.0
2718        tahincioglugmk     72.285714      1.800000         35.0
2719       thecraton.hotel     21.828571      0.085714         35.0
2720        kaplan_turkiye     20.000000      0.314286         35.0
2721            demiirhaan  50490.400000    235.000000         35.0
2722              rawsters     32.166667      0.400000         35.0
2723      trendy.shoppings    492.333333     32.285714         35.0
2724  pastirmasucukkayseri    117.314286      7.028571         35.0
2725       villeroyboch_tr     11.714286      0.085714         35.0
2726             erkimilac     27.771429      0.028571         35.0
2727           kisafilmder    224.428571      8.085714         35.0
2728            tiytroadam    106.342857      2.885714         35.0
2729            cem.boyner   3080.657143    118.742857 

In [206]:
import pandas as pd

# Define placeholder date
placeholder_date = pd.Timestamp('1900-01-01')

# Fill missing 'last_post' with the placeholder date
df_user['last_post'] = df_user['last_post'].fillna(placeholder_date)

print("\n'last_post' Feature After Filling Missing Values:")
print(df_user[['username', 'last_post']].tail(25))


'last_post' Feature After Filling Missing Values:
                  username           last_post
2717             serefoguz 2023-11-18 03:47:44
2718        tahincioglugmk 2023-11-21 08:12:29
2719       thecraton.hotel 2023-11-22 16:12:51
2720        kaplan_turkiye 2023-10-11 11:30:05
2721            demiirhaan 2023-10-24 17:04:31
2722              rawsters 2023-11-13 07:50:24
2723      trendy.shoppings 2023-11-03 19:26:14
2724  pastirmasucukkayseri 2023-11-21 16:58:15
2725       villeroyboch_tr 2023-11-29 07:56:54
2726             erkimilac 2023-10-27 13:13:05
2727           kisafilmder 2023-10-26 16:16:46
2728            tiytroadam 2023-11-23 16:12:28
2729            cem.boyner 2023-10-22 13:11:23
2730            visitsplit 2023-10-20 07:00:03
2731             thehunger 2023-11-22 11:26:25
2732      kadir_albayrak59 2023-11-04 20:09:05
2733           muzegazhane 2023-11-17 10:23:08
2734          seyhaneturla 2022-11-17 18:11:55
2735      sistem.aluminyum 2023-11-17 14:43:08
2736   ko

In [207]:
import pandas as pd

# Define placeholder date
placeholder_date = pd.Timestamp('1900-01-01')

# Fill missing 'last_post' with the placeholder date
df_user['last_post'] = df_user['last_post'].fillna(placeholder_date)

print("\n'last_post' Feature After Filling Missing Values:")
print(df_user[['username', 'last_post']].tail(25))


'last_post' Feature After Filling Missing Values:
                  username           last_post
2717             serefoguz 2023-11-18 03:47:44
2718        tahincioglugmk 2023-11-21 08:12:29
2719       thecraton.hotel 2023-11-22 16:12:51
2720        kaplan_turkiye 2023-10-11 11:30:05
2721            demiirhaan 2023-10-24 17:04:31
2722              rawsters 2023-11-13 07:50:24
2723      trendy.shoppings 2023-11-03 19:26:14
2724  pastirmasucukkayseri 2023-11-21 16:58:15
2725       villeroyboch_tr 2023-11-29 07:56:54
2726             erkimilac 2023-10-27 13:13:05
2727           kisafilmder 2023-10-26 16:16:46
2728            tiytroadam 2023-11-23 16:12:28
2729            cem.boyner 2023-10-22 13:11:23
2730            visitsplit 2023-10-20 07:00:03
2731             thehunger 2023-11-22 11:26:25
2732      kadir_albayrak59 2023-11-04 20:09:05
2733           muzegazhane 2023-11-17 10:23:08
2734          seyhaneturla 2022-11-17 18:11:55
2735      sistem.aluminyum 2023-11-17 14:43:08
2736   ko

In [208]:
from datetime import datetime

# Current date for reference
current_date = pd.Timestamp(datetime.now())

# Calculate Days Since Last Post
df_user['days_since_last_post'] = (current_date - df_user['last_post']).dt.days

# Handle anomalies: negative days if 'last_post' is in the future
df_user['days_since_last_post'] = df_user['days_since_last_post'].apply(lambda x: x if x >= 0 else 0)

# For users with placeholder 'last_post', set 'days_since_last_post' to -1 to indicate no posts
df_user.loc[df_user['last_post'] == placeholder_date, 'days_since_last_post'] = -1

print("\n'days_since_last_post' Feature After Imputation:")
print(df_user[['username', 'last_post', 'days_since_last_post']].tail(25))


'days_since_last_post' Feature After Imputation:
                  username           last_post  days_since_last_post
2717             serefoguz 2023-11-18 03:47:44                   400
2718        tahincioglugmk 2023-11-21 08:12:29                   397
2719       thecraton.hotel 2023-11-22 16:12:51                   396
2720        kaplan_turkiye 2023-10-11 11:30:05                   438
2721            demiirhaan 2023-10-24 17:04:31                   425
2722              rawsters 2023-11-13 07:50:24                   405
2723      trendy.shoppings 2023-11-03 19:26:14                   415
2724  pastirmasucukkayseri 2023-11-21 16:58:15                   397
2725       villeroyboch_tr 2023-11-29 07:56:54                   389
2726             erkimilac 2023-10-27 13:13:05                   422
2727           kisafilmder 2023-10-26 16:16:46                   423
2728            tiytroadam 2023-11-23 16:12:28                   395
2729            cem.boyner 2023-10-22 13:11:23       

In [209]:
# Check the number of users with 'days_since_last_post' as -1
num_no_posts = (df_user['days_since_last_post'] == -1).sum()
print(f"\nNumber of users with no posts (days_since_last_post = -1): {num_no_posts}")

# Display the users without post data
print("\nUsers without post-level data:")
print(df_user[df_user['days_since_last_post'] == -1]['username'].head(25))


Number of users with no posts (days_since_last_post = -1): 3

Users without post-level data:
682         belediyesikose
1720     touchdownistanbul
1973    orhanelibelediyesi
Name: username, dtype: object


# Feature  Engineering

In [210]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Ensure 'biography' is string and handle missing values
df_user['biography'] = df_user['biography'].fillna('').astype(str)

# Define a list of Turkish stopwords (ensure it's comprehensive)
turkish_stopwords = [
    've', 'bir', 'bu', 'da', 'de', 'ile', 'çok', 'çokça', 'da', 'ne', 'mi', 'bu', 
    'ben', 'siz', 'o', 'biz', 'sizler'
]

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=500,               # Adjust based on your dataset and computational resources
    stop_words=turkish_stopwords,   # Use Turkish stopwords
    lowercase=True,                 # Convert all text to lowercase
    ngram_range=(1, 2)              # Consider unigrams and bigrams
)

# Fit and transform the 'biography' column
tfidf_bio = tfidf_vectorizer.fit_transform(df_user['biography'])

# Convert the TF-IDF matrix to a DataFrame
tfidf_bio_df = pd.DataFrame(tfidf_bio.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Concatenate the TF-IDF features with the original DataFrame
df_user = pd.concat([df_user.reset_index(drop=True), tfidf_bio_df.reset_index(drop=True)], axis=1)

print("\nTF-IDF Features Added to df_user:")
print(df_user.head())


TF-IDF Features Added to df_user:
          username             label          id        full_name  \
0    taskirancemal  Mom and Children  1282703608   Cemal Taşkıran   
1    tam_kararinda              Food  2114951482      Kaan Yarman   
2         spart4nn              Food   645610128     Cemil Ceylan   
3  sosyalyiyiciler              Food  1671187359                    
4  sonaydizdarahad  Mom and Children  1635669992  Sonay Demiryeri   

                                           biography    category_name  \
0                                     📍Antalya / Kaş     Entrepreneur   
1     Milliyet Pazar\nKalori Alacaksan Buna Değecek📚  Kitchen/cooking   
2  Küçük ev\nKamp ve doğa hayatı🏕\nMutfağımız doğ...    Video creator   
3  Founder @bitte.izmir \nIZMIR yemek/seyahat foo...          Unknown   
4  DİZDAR 🧿 Ahad 🧿 Dağhan \n@d.a.d.kids \n@dilanp...    Personal blog   

   post_count  follower_count  following_count is_business_account  ...  \
0      1382.0         12145.0       

In [211]:
from sklearn.preprocessing import LabelEncoder

# Initialize Label Encoder
label_encoder = LabelEncoder()

# Encode 'category_name' if not already encoded
df_user['category_encoded'] = label_encoder.fit_transform(df_user['category_name'].astype(str))

print("\nCategorical Variables Encoded Using Label Encoding:")
print(df_user[['category_name', 'category_encoded']].head())


Categorical Variables Encoded Using Label Encoding:
     category_name  category_encoded
0     Entrepreneur               116
1  Kitchen/cooking               180
2    Video creator               323
3          Unknown               319
4    Personal blog               229


In [212]:
# Avoid division by zero by replacing zero total_posts with 1 temporarily
df_user['total_posts_nonzero'] = df_user['total_posts'].replace(0, 1)

# Calculate engagement_rate
df_user['engagement_rate'] = (df_user['avg_likes'] + df_user['avg_comments']) / df_user['total_posts_nonzero']

# Drop the temporary column
df_user = df_user.drop(columns=['total_posts_nonzero'])

# Verify the new column
print("\n'engagement_rate' Feature Added:")
print(df_user[['username', 'avg_likes', 'avg_comments', 'total_posts', 'engagement_rate']].head())


'engagement_rate' Feature Added:
          username     avg_likes  avg_comments  total_posts  engagement_rate
0    taskirancemal    422.971429     12.657143         35.0        12.446531
1    tam_kararinda  13460.457143    260.685714         35.0       392.032653
2         spart4nn  93142.085714    586.257143         35.0      2677.952653
3  sosyalyiyiciler    309.285714     17.971429         35.0         9.350204
4  sonaydizdarahad  45827.342857    625.685714         35.0      1327.229388


In [213]:
from sklearn.preprocessing import StandardScaler

# Define numerical features to scale
numerical_features = [
    'follower_count',
    'following_count',
    'highlight_reel_count',
    'avg_likes',
    'avg_comments',
    'total_posts',
    'days_since_last_post',
    'engagement_rate'
]

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform the numerical features
df_user[numerical_features] = scaler.fit_transform(df_user[numerical_features])

print("\nNumerical Features Scaled Using StandardScaler:")
print(df_user[numerical_features].head())


Numerical Features Scaled Using StandardScaler:
   follower_count  following_count  highlight_reel_count  avg_likes  \
0       -0.050865         0.616689             -0.064921  -0.160447   
1        0.075421         0.409158              2.652599   0.216413   
2        0.131779         0.018326              7.081150   2.519677   
3       -0.048431         4.003552              1.679660  -0.163733   
4        0.075116         1.473150              0.371225   1.152005   

   avg_comments  total_posts  days_since_last_post  engagement_rate  
0     -0.061870     0.134432              0.682042        -0.150072  
1      0.077402     0.134432             -0.177230         0.172871  
2      0.260215     0.134432             -0.160599         2.117676  
3     -0.058886     0.134432             -0.127337        -0.152706  
4      0.282355     0.134432             -0.121793         0.968514  


# TRAIN / VALIDATION SPLIT, MODEL SELECTION, & TRAINING (CLASSIFICATION)

In [218]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Ensure dataset is loaded into df_user (from previous steps)
# Define features for classification and regression
classification_features = ['follower_count', 'following_count', 'highlight_reel_count', 'avg_likes', 
                           'avg_comments', 'total_posts', 'days_since_last_post', 'engagement_rate']

# Train-Test Split for Classification
X_class = df_user[classification_features]
y_class = df_user['category_encoded']  # Assuming category_encoded exists
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42
)

# Train-Test Split for Regression
X_reg = df_user[classification_features]
y_reg = np.log1p(df_user['like_count'])  # Apply log1p transformation for regression target
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Initialize StandardScaler
scaler = StandardScaler()

# Scale Classification Features
X_train_class = scaler.fit_transform(X_train_class)
X_test_class = scaler.transform(X_test_class)

# Scale Regression Features
X_train_reg = scaler.fit_transform(X_train_reg)
X_test_reg = scaler.transform(X_test_reg)

# Classification Model: Random Forest
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_class, y_train_class)

# Predictions and Evaluation (Classification)
y_pred_class = clf.predict(X_test_class)
classification_accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"Classification Accuracy: {classification_accuracy:.4f}")

# Regression Model: Random Forest
reg = RandomForestRegressor(n_estimators=100, random_state=42)
reg.fit(X_train_reg, y_train_reg)

# Predictions and Evaluation (Regression)
y_pred_reg = reg.predict(X_test_reg)
mse = mean_squared_error(y_test_reg, y_pred_reg)
print(f"Regression Mean Squared Error: {mse:.4f}")

# Print Predictions
print("\nClassification Predictions (Actual vs Predicted):")
classification_results = pd.DataFrame({
    'Actual': y_test_class,
    'Predicted': y_pred_class
})
print(classification_results.head())

print("\nRegression Predictions (Actual vs Predicted):")
regression_results = pd.DataFrame({
    'Actual': np.expm1(y_test_reg),  # Reverse log transformation
    'Predicted': np.expm1(y_pred_reg)
})
print(regression_results.head())

KeyError: 'like_count'

# PREDICT ON TEST DATA FOR CLASSIFICATION & SAVE

# REGRESSION FOR LIKE_COUNT

# TEST REGRESSION PREDICTIONS & SAVE