# Import Dependencies

In [21]:
# Cell 1: Import Necessary Libraries

import os
import json
import gzip
import re
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Download Turkish stopwords
nltk.download('stopwords')
TURKISH_STOPWORDS = stopwords.words('turkish')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\itsmm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# Cell 2: Define Repository Structure and Data Paths

# Current Notebook Directory (e.g., "repo/notebooks")
current_notebook_dir = os.getcwd()

# Repository Root Directory (e.g., "repo")
repo_dir = os.path.abspath(os.path.join(current_notebook_dir, '..'))

# Data Directory (e.g., "repo/data")
data_dir = os.path.join(repo_dir, 'data')

# Training, Testing, and Output Directories
training_dir = os.path.join(data_dir, 'training')
testing_dir = os.path.join(data_dir, 'testing')
output_dir = os.path.join(data_dir, 'output')

# Ensure that the output directory exists
os.makedirs(output_dir, exist_ok=True)

# File Paths
train_csv_path = os.path.join(training_dir, 'train-classification.csv')
train_jsonl_path = os.path.join(training_dir, 'training-dataset.jsonl.gz')
test_classification_path = os.path.join(testing_dir, 'test-classification-round1.dat')
test_regression_path = os.path.join(testing_dir, 'test-regression-round1.jsonl')

In [23]:
# Cell 3: Load and Preprocess Train Classification Data

def load_train_classification(file_path: str) -> pd.DataFrame:
    """
    Loads and preprocesses the train-classification CSV file.

    Parameters:
    - file_path: Path to the train-classification.csv file.

    Returns:
    - DataFrame with renamed columns and standardized category labels.
    """
    try:
        df = pd.read_csv(file_path)
        df = df.rename(columns={'Unnamed: 0': 'user_id', 'label': 'category'})
        df["category"] = df["category"].str.lower()
        return df
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error loading train classification data: {e}")
        return pd.DataFrame()


# Load the train classification data
df_classification = load_train_classification(train_csv_path)

# Display the first few rows to verify
df_classification.head()

Unnamed: 0,user_id,category
0,taskirancemal,mom and children
1,tam_kararinda,food
2,spart4nn,food
3,sosyalyiyiciler,food
4,sonaydizdarahad,mom and children


In [24]:
# Cell 4: Create User to Category Mapping

def map_user_to_category(df: pd.DataFrame) -> Dict[str, str]:
    """
    Creates a mapping from user IDs to categories.

    Parameters:
    - df: DataFrame containing 'user_id' and 'category' columns.

    Returns:
    - Dictionary mapping user IDs to categories.
    """
    try:
        return df.set_index("user_id").to_dict()["category"]
    except KeyError:
        print("Columns 'user_id' and/or 'category' not found in DataFrame.")
        return {}
    except Exception as e:
        print(f"Error mapping user to category: {e}")
        return {}


# Create the mapping
username2_category = map_user_to_category(df_classification)

# Verify the mapping for a sample user
sample_user_id = df_classification['user_id'].iloc[0]
print(f"User ID: {sample_user_id}, Category: {username2_category.get(sample_user_id)}")

User ID: taskirancemal, Category: mom and children


In [25]:
# Cell 5: Segregate Training and Testing Data

def load_training_data(file_path: str, user_category_map: Dict[str, str]) -> Tuple[Dict[str, List[Dict]], Dict[str, Dict]]:
    """
    Loads and segregates training and testing data based on user categories.

    Parameters:
    - file_path: Path to the training-dataset.jsonl.gz file.
    - user_category_map: Dictionary mapping user IDs to categories.

    Returns:
    - Tuple containing:
        - username2posts_train: Dict mapping usernames to their posts for training.
        - username2profile_train: Dict mapping usernames to their profile data for training.
    """
    username2posts_train = {}
    username2profile_train = {}
    username2posts_test = {}
    username2profile_test = {}
    
    try:
        with gzip.open(file_path, "rt", encoding='utf-8') as fh:
            for line_number, line in enumerate(fh, start=1):
                try:
                    sample = json.loads(line)
                    profile = sample.get("profile", {})
                    username = profile.get("username", "").strip()
                    
                    if not username:
                        print(f"Line {line_number}: Missing username. Skipping.")
                        continue
    
                    if username in user_category_map:
                        # Training Data
                        username2posts_train[username] = sample.get("posts", [])
                        username2profile_train[username] = profile
                    else:
                        # Testing Data
                        username2posts_test[username] = sample.get("posts", [])
                        username2profile_test[username] = profile
                except json.JSONDecodeError:
                    print(f"Line {line_number}: JSON decode error. Skipping.")
                except Exception as e:
                    print(f"Line {line_number}: Unexpected error: {e}. Skipping.")
    
        return username2posts_train, username2profile_train, username2posts_test, username2profile_test
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return username2posts_train, username2profile_train, username2posts_test, username2profile_test
    except Exception as e:
        print(f"Error loading training data: {e}")
        return username2posts_train, username2profile_train, username2posts_test, username2profile_test


# Load and segregate the training data
username2posts_train, username2profile_train, username2posts_test, username2profile_test = load_training_data(train_jsonl_path, username2_category)

# Display counts
print(f"Number of Training Users: {len(username2posts_train)}")
print(f"Number of Testing Users: {len(username2posts_test)}")

Number of Training Users: 2741
Number of Testing Users: 2674


In [26]:
# Cell 6: Create DataFrames for User Profiles

def create_profile_dataframe(username2profile: Dict[str, Dict]) -> pd.DataFrame:
    """
    Creates a DataFrame from user profiles.

    Parameters:
    - username2profile: Dict mapping usernames to their profile data.

    Returns:
    - DataFrame containing user profiles.
    """
    try:
        df = pd.DataFrame(username2profile).T.reset_index(drop=True)
        return df
    except Exception as e:
        print(f"Error creating profile DataFrame: {e}")
        return pd.DataFrame()


# Create DataFrames for training and testing profiles
train_profile_df = create_profile_dataframe(username2profile_train)
test_profile_df = create_profile_dataframe(username2profile_test)

# Display the first few rows of the training profile DataFrame
print("Training Profile DataFrame:")
display(train_profile_df.head())

# Display the first few rows of the testing profile DataFrame
print("\nTesting Profile DataFrame:")
display(test_profile_df.head())

Training Profile DataFrame:


Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,business_category_name,overall_category_name,category_enum,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64
0,deparmedya,3170700063,Depar Medya,#mediaplanning #mediabuying #sosyalmedya,Local business,,1167,192,True,False,...,,,LOCAL,False,False,https://instagram.fsaw2-3.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,kafesfirin,266439571,KAFES FIRIN,📍Söğütözü📍FTZ AVM\n🛒Ankara macro▲center v...,Brand,,11997,17,True,False,...,,,BRAND,False,False,https://instagram.fada1-13.fna.fbcdn.net/v/t51...,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,vimerang,2367195567,Vimerang,Dijital İletişim Yönetimi🎬info@vimerang.comq,,,2321,454,True,False,...,Creators & Celebrities,,VIDEO_CREATOR,False,False,https://instagram.fist19-1.fna.fbcdn.net/v/t51...,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,mustafa_yalcinn38,9606564254,Mustafa Yalçın,Talas Belediye Başkanı,Politician,,13647,29,True,False,...,,,POLITICIAN,False,False,https://instagram.fist1-4.fna.fbcdn.net/v/t51....,True,False,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,zorluenergysolutions,8155780357,ZES (Zorlu Energy Solutions),Türkiye’nin 81 ilindeki en yaygın elektrikli ş...,,,7917,11,True,False,...,,,ENERGY_COMPANY,False,False,https://instagram.fayt2-2.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...



Testing Profile DataFrame:


Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,business_category_name,overall_category_name,category_enum,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64
0,beyazyakaliyiz,8634457436,Selam Beyaz Yakalı,Beyaz yakalıların dünyasına hoşgeldiniz 😀😀😀,Personal blog,,1265,665,True,False,...,,,PERSONAL_BLOG,False,False,https://instagram.fist6-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,totalenergies_istasyonlari,7066643793,TotalEnergies İstasyonları,TotalEnergies İstasyonları resmi Instagram hes...,Energy Company,,28025,4,True,False,...,,,ENERGY_COMPANY,False,False,https://instagram.fsaw2-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,konforyatak,8782109673,Konfor Yatak #KonforluUykular,"Konfor Yatak, birbirinden farklı özelliklere s...",Furniture,,40334,2,True,False,...,,,FURNITURE,False,False,https://instagram.fyei6-3.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,ht_kulup,1950140344,HT KULÜP,Bloomberght - Habertürk Magazin,,,158877,69,True,False,...,,,TV_SHOW,False,False,https://instagram.fada2-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,ajansspor,338611487,Ajansspor,"🏢 Saran Group \n🏟 Anında, tarafsız spor haberl...",News & media website,,93193,286,True,False,...,,,NEWS_SITE,False,False,https://instagram.fadb2-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [27]:
# Cell 7: Initial Data Exploration and Verification

def initial_data_exploration(train_class_df: pd.DataFrame, train_profile_df: pd.DataFrame, test_profile_df: pd.DataFrame):
    """
    Performs initial data exploration and visualization.

    Parameters:
    - train_class_df: DataFrame containing training classification data.
    - train_profile_df: DataFrame containing training profile data.
    - test_profile_df: DataFrame containing testing profile data.
    """
    try:
        print("\n=== Class Distribution ===")
        class_counts = train_class_df['category'].value_counts()
        print(class_counts)
        
        print("\n=== Training Profile DataFrame Statistics ===")
        print(train_profile_df.describe(include='all'))
        
        print("\n=== Testing Profile DataFrame Statistics ===")
        print(test_profile_df.describe(include='all'))
        
        print("\n=== Missing Values in Training Classification DataFrame ===")
        print(train_class_df.isnull().sum())
        
        print("\n=== Missing Values in Training Profile DataFrame ===")
        print(train_profile_df.isnull().sum())
        
        print("\n=== Missing Values in Testing Profile DataFrame ===")
        print(test_profile_df.isnull().sum())
        
        # Additional exploration can be added here (e.g., unique values, value distributions)
    except Exception as e:
        print(f"Error during initial data exploration: {e}")


# Perform initial data exploration
initial_data_exploration(df_classification, train_profile_df, test_profile_df)


=== Class Distribution ===
category
food                    511
health and lifestyle    503
tech                    346
entertainment           323
fashion                 299
travel                  294
art                     191
mom and children        149
sports                  113
gaming                   13
Name: count, dtype: int64

=== Training Profile DataFrame Statistics ===
          username          id    full_name     biography    category_name  \
count         2741        2741         2715          2571             2226   
unique        2741        2741         2715          2565              333   
top     deparmedya  3170700063  Depar Medya  @bigbosslayf  Product/service   
freq             1           1            1             2              178   

        post_count  follower_count  following_count is_business_account  \
count        275.0            2741             2741                2741   
unique       259.0            2654              986                  

In [28]:
# Cell 8: Handle Missing Values

def handle_missing_values(df_classification: pd.DataFrame, 
                         df_train_profile: pd.DataFrame, 
                         df_test_profile: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Identifies and handles missing values in the classification and profile DataFrames.
    
    Parameters:
    - df_classification: DataFrame containing user classifications.
    - df_train_profile: DataFrame containing training user profiles.
    - df_test_profile: DataFrame containing testing user profiles.
    
    Returns:
    - Tuple of DataFrames with missing values handled.
    """
    # Handling missing values in classification DataFrame
    print("\nHandling Missing Values in Classification DataFrame:")
    missing_class = df_classification.isnull().sum()
    print(missing_class)
    
    # If 'category' has missing values, decide whether to drop or impute
    if missing_class['category'] > 0:
        print(f"\nNumber of missing 'category' values: {missing_class['category']}")
        # Option 1: Drop rows with missing 'category'
        df_classification = df_classification.dropna(subset=['category'])
        print(f"After dropping, new shape: {df_classification.shape}")
    
    # Handling missing values in training profile DataFrame
    print("\nHandling Missing Values in Training Profile DataFrame:")
    missing_train_profile = df_train_profile.isnull().sum()
    print(missing_train_profile)
    
    # Example Strategy: Fill missing numerical values with median and categorical with mode
    numerical_cols_train = df_train_profile.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols_train = df_train_profile.select_dtypes(include=['object']).columns
    
    for col in numerical_cols_train:
        median = df_train_profile[col].median()
        df_train_profile[col].fillna(median, inplace=True)
    
    for col in categorical_cols_train:
        mode = df_train_profile[col].mode()[0] if not df_train_profile[col].mode().empty else 'unknown'
        df_train_profile[col].fillna(mode, inplace=True)
    
    # Handling missing values in testing profile DataFrame
    print("\nHandling Missing Values in Testing Profile DataFrame:")
    missing_test_profile = df_test_profile.isnull().sum()
    print(missing_test_profile)
    
    numerical_cols_test = df_test_profile.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols_test = df_test_profile.select_dtypes(include=['object']).columns
    
    for col in numerical_cols_test:
        median = df_test_profile[col].median()
        df_test_profile[col].fillna(median, inplace=True)
    
    for col in categorical_cols_test:
        mode = df_test_profile[col].mode()[0] if not df_test_profile[col].mode().empty else 'unknown'
        df_test_profile[col].fillna(mode, inplace=True)
    
    return df_classification, df_train_profile, df_test_profile


# Handle missing values
df_classification, train_profile_df, test_profile_df = handle_missing_values(df_classification, train_profile_df, test_profile_df)

# Verify that there are no missing values left
print("\nVerification of Missing Values After Handling:")
print("Classification DataFrame Missing Values:")
print(df_classification.isnull().sum())

print("\nTraining Profile DataFrame Missing Values:")
print(train_profile_df.isnull().sum())

print("\nTesting Profile DataFrame Missing Values:")
print(test_profile_df.isnull().sum())


Handling Missing Values in Classification DataFrame:
user_id     0
category    0
dtype: int64

Handling Missing Values in Training Profile DataFrame:
username                                0
id                                      0
full_name                              26
biography                             170
category_name                         515
post_count                           2466
follower_count                          0
following_count                         0
is_business_account                     0
is_private                              0
is_verified                             0
highlight_reel_count                    0
bio_links                               0
entities                              170
ai_agent_type                        2741
fb_profile_biolink                   2741
restricted_by_viewer                 2741
country_block                           0
eimu_id                                 0
external_url                          457
fbid     

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train_profile[col].fillna(mode, inplace=True)
  df_train_profile[col].fillna(mode, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test_profile[col].fillna(mode, inplace=True)
  df_test_profile[col].fillna(mode, inplace=True)


In [29]:
# Cell 9.1: Define Columns to Exclude from Encoding

# Columns that should NOT be encoded
COLUMNS_TO_EXCLUDE = [
    'username', 
    'id', 
    'full_name', 
    'profile_pic_url', 
    'profile_picture_base64',
    'biography',  # Assuming 'biography' is text and already processed or not useful for encoding
    'should_show_category',
    'should_show_public_contacts',
    'show_account_transparency_details',
    'is_business_account',
    'is_private',
    'is_verified_by_mv4b',
    'is_regulated_c18'
]

# Cell 9.2: Update the Encoding Function to Exclude Certain Columns

from sklearn.preprocessing import LabelEncoder

def encode_categorical_variables(
    df_train_profile: pd.DataFrame, 
    df_test_profile: pd.DataFrame, 
    exclude_cols: List[str] = []
) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, LabelEncoder]]:
    """
    Encodes categorical variables in the profile DataFrames using Label Encoding,
    excluding specified columns.
    
    Parameters:
    - df_train_profile: DataFrame containing training user profiles.
    - df_test_profile: DataFrame containing testing user profiles.
    - exclude_cols: List of column names to exclude from encoding.
    
    Returns:
    - Tuple containing:
        - Encoded training profile DataFrame
        - Encoded testing profile DataFrame
        - Dictionary of fitted LabelEncoders for each encoded column
    """
    label_encoders = {}
    
    # Identify categorical columns excluding specified columns
    categorical_cols_train = df_train_profile.select_dtypes(include=['object']).columns.tolist()
    categorical_cols_train = [col for col in categorical_cols_train if col not in exclude_cols]
    
    categorical_cols_test = df_test_profile.select_dtypes(include=['object']).columns.tolist()
    categorical_cols_test = [col for col in categorical_cols_test if col not in exclude_cols]
    
    # Process each categorical column
    for col in categorical_cols_train:
        try:
            # Combine train and test data to fit the encoder
            combined_data = pd.concat([df_train_profile[col], df_test_profile[col]], axis=0).astype(str)
            label_encoder = LabelEncoder()
            label_encoder.fit(combined_data)
            label_encoders[col] = label_encoder
            
            # Transform the data
            df_train_profile[col] = label_encoder.transform(df_train_profile[col].astype(str))
            
            # Handle unseen labels in test data by assigning a special value (-1)
            df_test_profile[col] = df_test_profile[col].map(
                lambda x: label_encoder.transform([x])[0] if x in label_encoder.classes_ else -1
            )
        except Exception as e:
            print(f"Error encoding column '{col}': {e}")
            # Assign a default value in case of error
            df_train_profile[col] = -1
            df_test_profile[col] = -1
    
    return df_train_profile, df_test_profile, label_encoders

# Cell 9.3: Apply the Updated Encoding Function

# Apply the encoding function, excluding specified columns
train_profile_df_encoded, test_profile_df_encoded, label_encoders = encode_categorical_variables(
    train_profile_df, 
    test_profile_df, 
    exclude_cols=COLUMNS_TO_EXCLUDE
)

# Display the first few rows to verify encoding
print("\nEncoded Training Profile DataFrame:")
display(train_profile_df_encoded.head())

print("\nEncoded Testing Profile DataFrame:")
display(test_profile_df_encoded.head())



Encoded Training Profile DataFrame:


Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,business_category_name,overall_category_name,category_enum,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64
0,deparmedya,3170700063,Depar Medya,#mediaplanning #mediabuying #sosyalmedya,230,79.0,379,435,True,False,...,16,0,203,False,False,https://instagram.fsaw2-3.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,kafesfirin,266439571,KAFES FIRIN,📍Söğütözü📍FTZ AVM\n🛒Ankara macro▲center v...,53,79.0,454,351,True,False,...,16,0,55,False,False,https://instagram.fada1-13.fna.fbcdn.net/v/t51...,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,vimerang,2367195567,Vimerang,Dijital İletişim Yönetimi🎬info@vimerang.comq,319,79.0,1919,955,True,False,...,3,0,388,False,False,https://instagram.fist19-1.fna.fbcdn.net/v/t51...,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,mustafa_yalcinn38,9606564254,Mustafa Yalçın,Talas Belediye Başkanı,313,79.0,744,690,True,False,...,16,0,268,False,False,https://instagram.fist1-4.fna.fbcdn.net/v/t51....,True,False,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,zorluenergysolutions,8155780357,ZES (Zorlu Energy Solutions),Türkiye’nin 81 ilindeki en yaygın elektrikli ş...,319,79.0,4569,80,True,False,...,16,0,133,False,False,https://instagram.fayt2-2.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...



Encoded Testing Profile DataFrame:


Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,business_category_name,overall_category_name,category_enum,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64
0,beyazyakaliyiz,8634457436,Selam Beyaz Yakalı,Beyaz yakalıların dünyasına hoşgeldiniz 😀😀😀,293,65.0,-1,-1,True,False,...,16,0,252,False,False,https://instagram.fist6-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,totalenergies_istasyonlari,7066643793,TotalEnergies İstasyonları,TotalEnergies İstasyonları resmi Instagram hes...,133,65.0,-1,-1,True,False,...,16,0,133,False,False,https://instagram.fsaw2-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,konforyatak,8782109673,Konfor Yatak #KonforluUykular,"Konfor Yatak, birbirinden farklı özelliklere s...",163,65.0,-1,-1,True,False,...,16,0,156,False,False,https://instagram.fyei6-3.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,ht_kulup,1950140344,HT KULÜP,Bloomberght - Habertürk Magazin,319,65.0,-1,-1,True,False,...,16,0,383,False,False,https://instagram.fada2-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,ajansspor,338611487,Ajansspor,"🏢 Saran Group \n🏟 Anında, tarafsız spor haberl...",272,65.0,-1,-1,True,False,...,16,0,235,False,False,https://instagram.fadb2-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [30]:
# Cell 10: Feature Scaling

from sklearn.preprocessing import StandardScaler

def scale_numerical_features(df_train_profile: pd.DataFrame, 
                             df_test_profile: pd.DataFrame, 
                             numerical_columns: List[str]) -> Tuple[pd.DataFrame, pd.DataFrame, StandardScaler]:
    """
    Scales numerical features using StandardScaler.
    
    Parameters:
    - df_train_profile: DataFrame containing training user profiles.
    - df_test_profile: DataFrame containing testing user profiles.
    - numerical_columns: List of numerical column names to scale.
    
    Returns:
    - Tuple containing:
        - Scaled training profile DataFrame
        - Scaled testing profile DataFrame
        - Fitted StandardScaler instance
    """
    scaler = StandardScaler()
    
    # Fit the scaler on training data
    df_train_profile[numerical_columns] = scaler.fit_transform(df_train_profile[numerical_columns])
    
    # Transform the testing data
    df_test_profile[numerical_columns] = scaler.transform(df_test_profile[numerical_columns])
    
    return df_train_profile, df_test_profile, scaler


# Identify numerical columns in the training profile DataFrame
numerical_cols_train = train_profile_df_encoded.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Assuming similar numerical columns exist in the testing profile DataFrame
numerical_cols_test = test_profile_df_encoded.select_dtypes(include=['int64', 'float64']).columns.tolist()

# To avoid scaling the target variable if present, ensure 'category_encoded_cleaned' is not included
# (In this step, profiles are separate from classification labels)
# Example: If 'engagement_rate' is a feature in profiles, it would be included in numerical_cols_train

# For demonstration, let's assume 'followers', 'following', 'posts', etc., are numerical features
# You should replace these with actual numerical column names from your profile DataFrames

# Example numerical columns (replace with actual column names)
# numerical_features = ['followers', 'following', 'posts', 'highlight_reel_count']
# For now, we'll use all numerical columns except any that shouldn't be scaled

# Extract numerical columns dynamically
numerical_features = [col for col in numerical_cols_train if col not in ['user_id']]  # Replace 'user_id' as needed

# Scale numerical features
train_profile_df_scaled, test_profile_df_scaled, scaler = scale_numerical_features(train_profile_df_encoded, test_profile_df_encoded, numerical_features)

# Display the first few rows to verify scaling
print("\nScaled Training Profile DataFrame:")
display(train_profile_df_scaled.head())

print("\nScaled Testing Profile DataFrame:")
display(test_profile_df_scaled.head())

# Cell 10.1: Text Preprocessing Function
import re

def preprocess_text(text: str) -> str:
    """
    Preprocesses a given text string.
    - Converts text to lowercase (casefold for Turkish compatibility).
    - Removes URLs.
    - Removes special characters, punctuation, and emojis (optional).
    - Removes digits.
    - Removes extra whitespaces.

    Parameters:
    - text: The text to preprocess.

    Returns:
    - Cleaned and preprocessed text string.
    """
    # Convert text to lowercase with Turkish compatibility
    text = text.casefold()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove special characters and punctuation
    text = re.sub(r'[^a-zçğıöşü0-9\s#@]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Cell 10.2: Build Corpus Function
from typing import List, Tuple, Dict

def build_corpus(username2posts: Dict[str, List[Dict]], preprocess_fn) -> Tuple[List[str], List[str]]:
    """
    Constructs a text corpus by aggregating and preprocessing posts for each username.
    
    Parameters:
    - username2posts: Dictionary mapping usernames to their list of posts.
    - preprocess_fn: Function to preprocess individual post captions.
    
    Returns:
    - Tuple containing:
        - corpus: List of aggregated and preprocessed captions per username.
        - usernames: List of usernames corresponding to the corpus.
    """
    corpus = []
    usernames = []
    
    for username, posts in username2posts.items():
        usernames.append(username)

        # Aggregate and preprocess posts for the user
        cleaned_captions = []
        for post in posts:
            post_caption = post.get("caption", "")
            if post_caption is None:
                continue
            # Apply the preprocessing function
            post_caption = preprocess_fn(post_caption)
            if post_caption:
                cleaned_captions.append(post_caption)
        
        # Join the cleaned captions into a single string
        aggregated_captions = "\n".join(cleaned_captions)
        corpus.append(aggregated_captions)
    
    return corpus, usernames


Scaled Training Profile DataFrame:


Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,business_category_name,overall_category_name,category_enum,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64
0,deparmedya,3170700063,Depar Medya,#mediaplanning #mediabuying #sosyalmedya,230,-0.170361,379,435,True,False,...,16,0,203,False,False,https://instagram.fsaw2-3.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,kafesfirin,266439571,KAFES FIRIN,📍Söğütözü📍FTZ AVM\n🛒Ankara macro▲center v...,53,-0.170361,454,351,True,False,...,16,0,55,False,False,https://instagram.fada1-13.fna.fbcdn.net/v/t51...,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,vimerang,2367195567,Vimerang,Dijital İletişim Yönetimi🎬info@vimerang.comq,319,-0.170361,1919,955,True,False,...,3,0,388,False,False,https://instagram.fist19-1.fna.fbcdn.net/v/t51...,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,mustafa_yalcinn38,9606564254,Mustafa Yalçın,Talas Belediye Başkanı,313,-0.170361,744,690,True,False,...,16,0,268,False,False,https://instagram.fist1-4.fna.fbcdn.net/v/t51....,True,False,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,zorluenergysolutions,8155780357,ZES (Zorlu Energy Solutions),Türkiye’nin 81 ilindeki en yaygın elektrikli ş...,319,-0.170361,4569,80,True,False,...,16,0,133,False,False,https://instagram.fayt2-2.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...



Scaled Testing Profile DataFrame:


Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,business_category_name,overall_category_name,category_enum,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64
0,beyazyakaliyiz,8634457436,Selam Beyaz Yakalı,Beyaz yakalıların dünyasına hoşgeldiniz 😀😀😀,293,-0.184514,-1,-1,True,False,...,16,0,252,False,False,https://instagram.fist6-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,totalenergies_istasyonlari,7066643793,TotalEnergies İstasyonları,TotalEnergies İstasyonları resmi Instagram hes...,133,-0.184514,-1,-1,True,False,...,16,0,133,False,False,https://instagram.fsaw2-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,konforyatak,8782109673,Konfor Yatak #KonforluUykular,"Konfor Yatak, birbirinden farklı özelliklere s...",163,-0.184514,-1,-1,True,False,...,16,0,156,False,False,https://instagram.fyei6-3.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,ht_kulup,1950140344,HT KULÜP,Bloomberght - Habertürk Magazin,319,-0.184514,-1,-1,True,False,...,16,0,383,False,False,https://instagram.fada2-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,ajansspor,338611487,Ajansspor,"🏢 Saran Group \n🏟 Anında, tarafsız spor haberl...",272,-0.184514,-1,-1,True,False,...,16,0,235,False,False,https://instagram.fadb2-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [31]:
# Cell 11: Enhance Text Preprocessing and Vectorization

from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator

class TextPreprocessor(TransformerMixin, BaseEstimator):
    """
    Custom transformer for text preprocessing.
    """
    def __init__(self, preprocess_fn):
        self.preprocess_fn = preprocess_fn
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # Convert to Pandas Series if input is a list
        if isinstance(X, list):
            X = pd.Series(X)
        return X.apply(self.preprocess_fn)


def build_vectorizer_pipeline(stop_words: List[str], max_features: int = 5000) -> Pipeline:
    """
    Builds a text preprocessing and TF-IDF vectorization pipeline.
    
    Parameters:
    - stop_words: List of stopwords to remove during vectorization.
    - max_features: Maximum number of features for the TF-IDF vectorizer.
    
    Returns:
    - A scikit-learn Pipeline object.
    """
    pipeline = Pipeline([
        ('preprocessor', TextPreprocessor(preprocess_text)),
        ('tfidf', TfidfVectorizer(stop_words=stop_words, max_features=max_features))
    ])
    return pipeline


def create_tfidf_features(corpus: List[str], pipeline: Pipeline):
    """
    Fits the vectorizer pipeline on the corpus and transforms the data into TF-IDF features.
    
    Parameters:
    - corpus: List of text documents.
    - pipeline: scikit-learn Pipeline for preprocessing and vectorization.
    
    Returns:
    - Transformed TF-IDF feature matrix.
    """
    try:
        tfidf_matrix = pipeline.fit_transform(corpus)
        return tfidf_matrix
    except Exception as e:
        print(f"Error during TF-IDF transformation: {e}")
        return None


# Build the vectorizer pipeline
vectorizer_pipeline = build_vectorizer_pipeline(TURKISH_STOPWORDS, max_features=5000)

# Build the corpus and get corresponding usernames for training
corpus_train, train_usernames = build_corpus(username2posts_train, preprocess_text)

# Create TF-IDF features for training data
x_post_train = create_tfidf_features(corpus_train, vectorizer_pipeline)

# Ensure the vectorizer has been fitted
if x_post_train is not None:
    print(f"TF-IDF Training Features Shape: {x_post_train.shape}")
else:
    print("TF-IDF Training Features could not be created.")

# Repeat the process for testing data
def transform_test_corpus(username2posts_test: Dict[str, List[Dict]], 
                         preprocess_fn, 
                         vectorizer_pipeline: Pipeline) -> Tuple[List[str], List[str], np.ndarray]:
    """
    Builds and transforms the test corpus using the fitted vectorizer pipeline.
    
    Parameters:
    - username2posts_test: Dict mapping usernames to their posts for testing.
    - preprocess_fn: Function to preprocess individual post captions.
    - vectorizer_pipeline: Fitted scikit-learn Pipeline for preprocessing and vectorization.
    
    Returns:
    - Tuple containing:
        - test_corpus: List of aggregated and cleaned captions per test user.
        - test_usernames: List of test usernames.
        - x_post_test: Transformed TF-IDF feature matrix for testing data.
    """
    test_corpus, test_usernames = build_corpus(username2posts_test, preprocess_fn)
    try:
        x_post_test = vectorizer_pipeline.transform(test_corpus)
        return test_corpus, test_usernames, x_post_test
    except Exception as e:
        print(f"Error during test TF-IDF transformation: {e}")
        return test_corpus, test_usernames, None


# Transform test corpus
test_corpus, test_usernames, x_post_test = transform_test_corpus(username2posts_test, preprocess_text, vectorizer_pipeline)

# Verify test TF-IDF features
if x_post_test is not None:
    print(f"TF-IDF Testing Features Shape: {x_post_test.shape}")
else:
    print("TF-IDF Testing Features could not be created.")

TF-IDF Training Features Shape: (2741, 5000)
TF-IDF Testing Features Shape: (2674, 5000)


In [32]:
# Cell 12: Train-Test Split Optimization

from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

def split_train_validation(x_features: csr_matrix, 
                          y_labels: List[str], 
                          test_size: float = 0.2, 
                          random_state: int = 42) -> Tuple[csr_matrix, csr_matrix, List[str], List[str]]:
    """
    Splits the data into training and validation sets using stratification.
    
    Parameters:
    - x_features: Sparse matrix of feature vectors.
    - y_labels: List of target labels corresponding to each feature vector.
    - test_size: Proportion of the dataset to include in the validation split.
    - random_state: Seed used by the random number generator.
    
    Returns:
    - Tuple containing:
        - x_train: Training feature matrix.
        - x_val: Validation feature matrix.
        - y_train: Training labels.
        - y_val: Validation labels.
    """
    try:
        x_train, x_val, y_train, y_val = train_test_split(
            x_features, y_labels, 
            test_size=test_size, 
            random_state=random_state, 
            stratify=y_labels
        )
        return x_train, x_val, y_train, y_val
    except ValueError as ve:
        print(f"ValueError during train-test split: {ve}")
        return None, None, None, None
    except Exception as e:
        print(f"Error during train-test split: {e}")
        return None, None, None, None


# Prepare target labels for training data
# Assuming 'category_encoded_cleaned' is already created and encoded
# If not, we'll need to encode it here

# For this step, we'll assume that 'y_train' is already a list of category labels corresponding to 'train_usernames'

# In previous steps, 'y_train' was defined as:
# y_train = [username2_category.get(uname, "NA") for uname in train_usernames]
# However, in the optimized code, we need to ensure that 'y_train' corresponds to the indices in 'x_post_train'

# Let's create 'y_train' based on 'train_usernames'
y_train_labels = [username2_category.get(uname, "unknown") for uname in train_usernames]

# Encode target labels
label_encoder_target = LabelEncoder()
y_train_encoded = label_encoder_target.fit_transform(y_train_labels)

# Perform train-validation split
x_train, x_val, y_train, y_val = split_train_validation(x_post_train, y_train_encoded)

if x_train is not None:
    print(f"Training Features Shape: {x_train.shape}")
    print(f"Validation Features Shape: {x_val.shape}")
    print(f"Training Labels Shape: {len(y_train)}")
    print(f"Validation Labels Shape: {len(y_val)}")
else:
    print("Train-validation split failed.")

Training Features Shape: (2192, 5000)
Validation Features Shape: (549, 5000)
Training Labels Shape: 2192
Validation Labels Shape: 549


In [33]:
# Cell 13: Integrate Data Preprocessing into Pipeline

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

def create_full_pipeline(vectorizer_pipeline: Pipeline, 
                        numerical_features: List[str]) -> Pipeline:
    """
    Creates a complete preprocessing pipeline including text vectorization and numerical feature scaling.
    
    Parameters:
    - vectorizer_pipeline: scikit-learn Pipeline for text preprocessing and vectorization.
    - numerical_features: List of numerical feature column names.
    
    Returns:
    - A complete scikit-learn Pipeline object.
    """
    # For numerical features, we've already scaled them. If not, include scaling here.
    # Since scaling was done separately, this pipeline focuses on text features.
    # If you have additional preprocessing steps, add them here.
    
    # Combine both vectorizer and numerical features using ColumnTransformer
    full_pipeline = ColumnTransformer([
        ('text', vectorizer_pipeline, 'text_feature'),  # Placeholder; adjust as needed
        ('num', 'passthrough', numerical_features)
    ])
    
    return full_pipeline


# Example Usage:
# Assuming 'text_feature' is a placeholder for the actual text data column
# In your case, since TF-IDF was already performed, this step might be redundant
# Alternatively, you can keep the pipeline separate for more flexibility

# Therefore, skipping pipeline integration as TF-IDF has been handled

# Alternatively, consider the following if integrating:

# Define a new function to handle both text and numerical features
def create_combined_features(df_profile: pd.DataFrame, 
                             tfidf_matrix: csr_matrix) -> csr_matrix:
    """
    Combines numerical features with TF-IDF features into a single feature matrix.
    
    Parameters:
    - df_profile: DataFrame containing numerical features.
    - tfidf_matrix: Sparse matrix containing TF-IDF features.
    
    Returns:
    - Combined sparse feature matrix.
    """
    try:
        numerical_features_matrix = csr_matrix(df_profile.values)
        combined_features = csr_matrix(np.hstack([numerical_features_matrix.toarray(), tfidf_matrix.toarray()]))
        return combined_features
    except Exception as e:
        print(f"Error combining features: {e}")
        return None


# Combine training numerical features with TF-IDF features
# Extract numerical features from the scaled training profile DataFrame
# Replace 'numerical_features' with actual column names
numerical_train = train_profile_df_scaled[numerical_features].values
numerical_train_matrix = csr_matrix(numerical_train)

# Combine numerical and TF-IDF features
from scipy.sparse import hstack

combined_train_features = hstack([numerical_train_matrix, x_post_train])

print(f"Combined Training Features Shape: {combined_train_features.shape}")

# Similarly, combine for validation and testing data

# Extract numerical features from the scaled validation profile DataFrame
# Assuming you have a validation profile DataFrame, else use training profiles
# For simplicity, let's assume we're using the same training profiles for validation
# In practice, you should have separate profiles for training and validation

# Since in previous steps, we split x_post_train and y_train into x_train and x_val, 
# but the profiles are separate, ensure that you align the profiles accordingly

# This step may require additional alignment logic

Combined Training Features Shape: (2741, 5001)


In [34]:
# Cell 14: Final Preprocessing Summary and Preparation for Modeling

def summarize_preprocessing(x_train: csr_matrix, 
                           x_val: csr_matrix, 
                           y_train: List[int], 
                           y_val: List[int], 
                           label_encoder: LabelEncoder):
    """
    Summarizes the preprocessing steps and provides insights into the training and validation datasets.
    
    Parameters:
    - x_train: Training feature matrix.
    - x_val: Validation feature matrix.
    - y_train: Training labels.
    - y_val: Validation labels.
    - label_encoder: Fitted LabelEncoder instance for target labels.
    """
    try:
        print("\n=== Preprocessing Summary ===")
        print(f"Training Features Shape: {x_train.shape}")
        print(f"Validation Features Shape: {x_val.shape}")
        print(f"Number of Training Samples: {x_train.shape[0]}")
        print(f"Number of Validation Samples: {x_val.shape[0]}")
        
        print("\n=== Class Distribution in Training Set ===")
        unique, counts = np.unique(y_train, return_counts=True)
        for cls, count in zip(unique, counts):
            print(f"{label_encoder.inverse_transform([cls])[0]}: {count}")
        
        print("\n=== Class Distribution in Validation Set ===")
        unique, counts = np.unique(y_val, return_counts=True)
        for cls, count in zip(unique, counts):
            print(f"{label_encoder.inverse_transform([cls])[0]}: {count}")
        
    except Exception as e:
        print(f"Error during preprocessing summary: {e}")


# Summarize preprocessing
summarize_preprocessing(x_train, x_val, y_train, y_val, label_encoder_target)


=== Preprocessing Summary ===
Training Features Shape: (2192, 5000)
Validation Features Shape: (549, 5000)
Number of Training Samples: 2192
Number of Validation Samples: 549

=== Class Distribution in Training Set ===
art: 153
entertainment: 258
fashion: 239
food: 409
gaming: 10
health and lifestyle: 402
mom and children: 119
sports: 90
tech: 277
travel: 235

=== Class Distribution in Validation Set ===
art: 38
entertainment: 65
fashion: 60
food: 102
gaming: 3
health and lifestyle: 100
mom and children: 30
sports: 23
tech: 69
travel: 59


# Feature Engineering

In [35]:
# Cell 15: Feature Engineering

# Purpose: Add new features based on existing data for improved modeling

def add_feature_engineering(df):
    """
    Perform feature engineering on the provided DataFrame.

    Parameters:
    - df: DataFrame to add features to.

    Returns:
    - Updated DataFrame with new features.
    """
    df = df.copy()

    # Add ratios and derived features
    df['followers_to_following_ratio'] = df['follower_count'] / (df['following_count'] + 1e-5)  # Avoid division by zero
    df['posts_per_highlight'] = df['post_count'] / (df['highlight_reel_count'] + 1e-5)
    
    # Binary features based on thresholds
    df['is_popular'] = (df['follower_count'] > 10000).astype(int)
    df['is_active'] = (df['post_count'] > 50).astype(int)

    # Text-based features
    df['biography_length'] = df['biography'].apply(lambda x: len(x) if isinstance(x, str) else 0)
    df['has_links_in_bio'] = df['biography'].str.contains('http', na=False).astype(int)

    # Add a feature to count hashtags in biography
    df['hashtag_count_in_bio'] = df['biography'].apply(lambda x: x.count('#') if isinstance(x, str) else 0)

    return df

# Apply feature engineering to training and testing profile DataFrames
train_profile_df_engineered = add_feature_engineering(train_profile_df_encoded)
test_profile_df_engineered = add_feature_engineering(test_profile_df_encoded)

# Display results
print("Training DataFrame with Engineered Features:")
print(train_profile_df_engineered.head())

print("\nTesting DataFrame with Engineered Features:")
print(test_profile_df_engineered.head())

Training DataFrame with Engineered Features:
               username          id                     full_name  \
0            deparmedya  3170700063                   Depar Medya   
1            kafesfirin   266439571                   KAFES FIRIN   
2              vimerang  2367195567                      Vimerang   
3     mustafa_yalcinn38  9606564254               Mustafa Yalçın   
4  zorluenergysolutions  8155780357  ZES (Zorlu Energy Solutions)   

                                           biography  category_name  \
0           #mediaplanning #mediabuying #sosyalmedya            230   
1  📍Söğütözü📍FTZ AVM\n🛒Ankara macro▲center v...             53   
2       Dijital İletişim Yönetimi🎬info@vimerang.comq            319   
3                            Talas Belediye Başkanı            313   
4  Türkiye’nin 81 ilindeki en yaygın elektrikli ş...            319   

   post_count  follower_count  following_count is_business_account is_private  \
0   -0.170361             379   

# Feature Selection and scaling

In [36]:
# Cell 16: Feature Selection and Scaling (Corrected)

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# Step 1: Ensure 'post_count_bin' exists in the DataFrame
def ensure_feature_column(df, feature_name, default_value=0):
    """
    Ensures a feature column exists in the DataFrame, filling it with a default value if missing.

    Parameters:
    - df: DataFrame to check.
    - feature_name: Name of the feature column to ensure.
    - default_value: Default value to fill if the column is missing.

    Returns:
    - Updated DataFrame with the ensured feature column.
    """
    if feature_name not in df.columns:
        df[feature_name] = default_value
    return df

# Ensure 'post_count_bin' exists in both training and testing DataFrames
train_profile_df_encoded = ensure_feature_column(train_profile_df_encoded, 'post_count_bin')
test_profile_df_encoded = ensure_feature_column(test_profile_df_encoded, 'post_count_bin')

# Step 2: Combine Engineered and Numerical Features
numerical_features = numerical_cols_train + ['post_count_bin']

def select_relevant_features(df, feature_columns):
    """
    Selects only the relevant features from the DataFrame.

    Parameters:
    - df: DataFrame containing the features.
    - feature_columns: List of relevant feature column names.

    Returns:
    - Selected features DataFrame.
    """
    return df[feature_columns]

# Select features from training and testing DataFrames
train_selected_features = select_relevant_features(train_profile_df_encoded, numerical_features)
test_selected_features = select_relevant_features(test_profile_df_encoded, numerical_features)

# Step 3: Apply Scaling to Numerical Features
scaler_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

def scale_features(df, pipeline):
    """
    Scales the numerical features using the provided pipeline.

    Parameters:
    - df: DataFrame containing the features to scale.
    - pipeline: Scikit-learn Pipeline for scaling.

    Returns:
    - Scaled feature matrix (numpy array).
    """
    return pipeline.fit_transform(df)

# Scale the numerical features
train_scaled_features = scale_features(train_selected_features, scaler_pipeline)
test_scaled_features = scaler_pipeline.transform(test_selected_features)

# Step 4: Combine Numerical and TF-IDF Features

# Combine numerical and TF-IDF features using hstack
x_train_combined = hstack([train_scaled_features, x_post_train])
x_test_combined = hstack([test_scaled_features, x_post_test])

# Print the final shapes for verification
print(f"Final Training Feature Matrix Shape: {x_train_combined.shape}")
print(f"Final Testing Feature Matrix Shape: {x_test_combined.shape}")

# Outputs are now ready for model training and evaluation.

Final Training Feature Matrix Shape: (2741, 5002)
Final Testing Feature Matrix Shape: (2674, 5002)


In [37]:
# Cell 19: Fixing Data Split, Training, and Validation with Progress Bar

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
from scipy.sparse import hstack
import time

# Step 1: Combine Numerical and TF-IDF Features for Full Dataset
x_combined = hstack([train_scaled_features, x_post_train])  # Already constructed from previous steps
#print(f"Combined Features Shape: {x_combined.shape}")

# Step 2: Split Data into Training and Validation Sets
x_train, x_val, y_train, y_val = train_test_split(
    x_combined,           # Combined features
    y_train_encoded,      # Encoded target variable
    test_size=0.2,        # 20% for validation
    random_state=42,      # Seed for reproducibility
    stratify=y_train_encoded  # Maintain class distribution
)

# Verify the splits
#print(f"Training Feature Shape: {x_train.shape}")
#print(f"Validation Feature Shape: {x_val.shape}")
#print(f"Training Labels Shape: {len(y_train)}")
#print(f"Validation Labels Shape: {len(y_val)}")

# Step 3: Train Logistic Regression Baseline Model with Progress Bar
baseline_model = LogisticRegression(max_iter=1000, random_state=42)  # Baseline Logistic Regression Model

# Initialize tqdm progress bar
with tqdm(total=100, desc="Training Logistic Regression", unit="%") as pbar:
    start_time = time.time()
    baseline_model.fit(x_train, y_train)  # Train the model
    end_time = time.time()
    pbar.update(100)  # Training completed, update progress bar to 100%
    print(f"Training completed in {end_time - start_time:.2f} seconds.")

# Step 4: Evaluate Model on Training Set
y_train_pred = baseline_model.predict(x_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"\n=== Training Evaluation ===")
print(f"Training Accuracy: {train_accuracy:.4f}")
print("\nTraining Classification Report:")
print(classification_report(y_train, y_train_pred, zero_division=0))

# Step 5: Evaluate Model on Validation Set
y_val_pred = baseline_model.predict(x_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"\n=== Validation Evaluation ===")
print(f"Validation Accuracy: {val_accuracy:.4f}")
print("\nValidation Classification Report:")
print(classification_report(y_val, y_val_pred, zero_division=0))

Training Logistic Regression: 100%|██████████| 100/100 [00:01<00:00, 81.09%/s]

Training completed in 1.23 seconds.

=== Training Evaluation ===
Training Accuracy: 0.8481

Training Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.59      0.73       153
           1       0.81      0.79      0.80       258
           2       0.86      0.90      0.88       239
           3       0.90      0.97      0.93       409
           4       0.00      0.00      0.00        10
           5       0.76      0.92      0.83       402
           6       0.96      0.61      0.75       119
           7       1.00      0.61      0.76        90
           8       0.85      0.94      0.89       277
           9       0.85      0.85      0.85       235

    accuracy                           0.85      2192
   macro avg       0.79      0.72      0.74      2192
weighted avg       0.85      0.85      0.84      2192


=== Validation Evaluation ===
Validation Accuracy: 0.6284

Validation Classification Report:
              precision




# Improving Model

# Regresion Task


In [42]:
# Cell 20: Regression Task - Predicting Like Count

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import numpy as np
import time

# Step 1: Prepare Data for Regression
def prepare_regression_data(username_to_posts: dict, feature_matrix: csr_matrix, preprocess_fn) -> (np.ndarray, np.ndarray):
    """
    Prepares data for the regression task by extracting features and target values.
    
    Parameters:
    - username_to_posts: Dictionary mapping usernames to their posts.
    - feature_matrix: Pre-computed feature matrix (e.g., TF-IDF and numerical features).
    - preprocess_fn: Function for text preprocessing.
    
    Returns:
    - features: Feature matrix for regression (e.g., combined features).
    - targets: Target values (e.g., average like counts).
    """
    targets = []
    indices_to_remove = []  # To track rows with NaN targets

    for i, (username, posts) in enumerate(username_to_posts.items()):
        # Calculate average like count for the user
        avg_likes = np.mean([post.get("like_count", 0) or 0 for post in posts])
        if np.isnan(avg_likes):
            indices_to_remove.append(i)  # Mark index for removal
        else:
            targets.append(avg_likes)

    # Remove rows with NaN targets from the feature matrix
    feature_matrix = np.delete(feature_matrix.toarray(), indices_to_remove, axis=0)
    
    return feature_matrix, np.array(targets)


# Prepare training data
x_reg_train, y_reg_train = prepare_regression_data(username2posts_train, x_train_combined, preprocess_text)

# Prepare testing data (if needed for evaluation)
x_reg_test, y_reg_test = prepare_regression_data(username2posts_test, x_test_combined, preprocess_text)

print(f"Regression Training Features Shape: {x_reg_train.shape}")
print(f"Regression Training Targets Shape: {y_reg_train.shape}")

# Step 2: Define a Baseline Regression Model
baseline_regressor = LinearRegression()

# Step 3: Train Regression Model with Progress Bar
with tqdm(total=100, desc="Training Linear Regression", unit="%") as pbar:
    start_time = time.time()
    baseline_regressor.fit(x_reg_train, y_reg_train)
    end_time = time.time()
    pbar.update(100)  # Training completed
    print(f"Training completed in {end_time - start_time:.2f} seconds.")

# Step 4: Evaluate the Model
# Training Evaluation
y_train_pred = baseline_regressor.predict(x_reg_train)
train_mse = mean_squared_error(y_reg_train, y_train_pred)
train_rmse = np.sqrt(train_mse)

print("\n=== Training Evaluation ===")
print(f"Training MSE: {train_mse:.4f}")
print(f"Training RMSE: {train_rmse:.4f}")

# Testing Evaluation (if available)
if y_reg_test.size > 0:
    y_test_pred = baseline_regressor.predict(x_reg_test)
    test_mse = mean_squared_error(y_reg_test, y_test_pred)
    test_rmse = np.sqrt(test_mse)

    print("\n=== Testing Evaluation ===")
    print(f"Testing MSE: {test_mse:.4f}")
    print(f"Testing RMSE: {test_rmse:.4f}")

# Step 5: Save Results
# Optional: Save predictions for further analysis
regression_results = {
    "y_train_true": y_reg_train.tolist(),
    "y_train_pred": y_train_pred.tolist(),
    "y_test_true": y_reg_test.tolist() if y_reg_test.size > 0 else [],
    "y_test_pred": y_test_pred.tolist() if y_reg_test.size > 0 else []
}

print("\nRegression Results Saved for Analysis!")

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Regression Training Features Shape: (2739, 5002)
Regression Training Targets Shape: (2739,)


Training Linear Regression: 100%|██████████| 100/100 [00:05<00:00, 18.39%/s]

Training completed in 5.44 seconds.

=== Training Evaluation ===
Training MSE: 2863.1075
Training RMSE: 53.5080

=== Testing Evaluation ===
Testing MSE: 5526242902457112623807725568.0000
Testing RMSE: 74338703933127.0000

Regression Results Saved for Analysis!





In [46]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold

# Analyze correlation among numerical features
correlation_matrix = train_profile_df_encoded.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()

# Identify low-variance features
selector = VarianceThreshold(threshold=0.01)  # Set a low threshold for variance
high_variance_features = selector.fit_transform(train_scaled_features)

print(f"Original Features: {train_scaled_features.shape[1]}")
print(f"Features After Removing Low Variance: {high_variance_features.shape[1]}")


ValueError: could not convert string to float: 'deparmedya'