In [2]:
###############################################################################
# 1. All necessary imports
###############################################################################
import numpy as np
import pandas as pd
import gzip
import json
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
turkish_stopwords = stopwords.words('turkish')

###############################################################################
# 2. Preprocessing function
###############################################################################
def preprocess_text(text: str):
    # Lower casing Turkish text
    text = text.casefold()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove special characters (adjust if you want to keep #, @, etc.)
    text = re.sub(r'[^a-zçğıöşü0-9\s#@]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

###############################################################################
# 3. Read and unify the training classification data
###############################################################################
train_classification_df = pd.read_csv("C:/Users/onurm/Desktop/CS412-project/train-classification.csv")
train_classification_df = train_classification_df.rename(
    columns={'Unnamed: 0': 'user_id', 'label': 'category'}
)
train_classification_df["category"] = train_classification_df["category"].apply(str.lower)
username2_category = train_classification_df.set_index("user_id").to_dict()["category"]

###############################################################################
# 4. Read the main JSONL file, separate train vs. unlabeled data
###############################################################################
train_data_path = "C:/Users/onurm/Desktop/CS412-project/training-dataset.jsonl.gz"

username2posts_train = {}
username2profile_train = {}

username2posts_unlabeled = {}
username2profile_unlabeled = {}

with gzip.open(train_data_path, "rt") as fh:
    for line in fh:
        sample = json.loads(line)
        profile = sample["profile"]
        username = profile["username"]
        if username in username2_category:
            username2posts_train[username] = sample["posts"]
            username2profile_train[username] = profile
        else:
            username2posts_unlabeled[username] = sample["posts"]
            username2profile_unlabeled[username] = profile

###############################################################################
#  Additional Feature Extraction
###############################################################################
def extract_features(username, profile_data, posts_data):
    profile = profile_data.get(username, {})
    profile_features = {
        'follower_count': profile.get('follower_count', 0),
        'following_count': profile.get('following_count', 0),
        'highlight_reel_count': profile.get('highlight_reel_count', 0),
        'is_business': int(profile.get('is_business_account', False)),
        'full_name' : profile.get('full_name', ''),
        'biography' : profile.get('biography', ''),
        'external_url' : profile.get('external_url', ''),
        'is_verified' : int(profile.get('is_verified', False)),
        'is_private' : int(profile.get('is_private', False)),
        'is_business_account' : int(profile.get('is_business_account', False)),
        'is_professional_account' : int(profile.get('is_professional_account', False)),
        'is_joined_recently' : int(profile.get('is_joined_recently', False)),
        'is_business_account' : int(profile.get('is_business_account', False)),
        }
    posts = posts_data.get(username, [])
    if posts:
        avg_likes = np.mean([p.get('like_count', 0) or 0 for p in posts])
        avg_comments = np.mean([p.get('comments_count', 0) or 0 for p in posts])
        media_types = [p.get('media_type', '') for p in posts]
        image_ratio = sum(1 for m in media_types if m == 'IMAGE') / len(posts)
    else:
        avg_likes = 0
        avg_comments = 0
        image_ratio = 0
    
    return {
        **profile_features,
        'avg_likes': avg_likes,
        'avg_comments': avg_comments,
        'image_ratio': image_ratio
    }

###############################################################################
###############################################################################
# 6. Build the "training" corpus + label lists
###############################################################################
all_corpus = []     # TF-IDF text corpus
all_labels = []     # categories
all_usernames = []  # track usernames in parallel

for uname, posts in username2posts_train.items():
    captions_clean = []
    for p in posts:
        cap = p.get("caption", "")
        if cap:
            captions_clean.append(preprocess_text(cap))
    joined_text = "\n".join(captions_clean)
    all_corpus.append(joined_text)
    all_labels.append(username2_category[uname])
    all_usernames.append(uname)

###############################################################################
# 7. Train/Validation Split
###############################################################################

X_train_corpus, X_val_corpus, y_train, y_val, train_usernames_split, val_usernames_split = train_test_split(
    all_corpus,
    all_labels,
    all_usernames,
    test_size=0.2,
    stratify=all_labels,
    random_state=42
)

###############################################################################
# 7. Vectorize (TF-IDF) using only the training corpus
###############################################################################
vectorizer = TfidfVectorizer(stop_words=turkish_stopwords, max_features=5000)
vectorizer.fit(X_train_corpus)

# Transform
x_train_tfidf = vectorizer.transform(X_train_corpus)
x_val_tfidf   = vectorizer.transform(X_val_corpus)

###############################################################################
# 8. Hyperparameter Tuning for SVM
###############################################################################
svm_clf_base = SVC(probability=True, random_state=42)

param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

svm_grid = GridSearchCV(
    svm_clf_base,
    param_grid_svm,
    scoring='accuracy',
    cv=3,  # or 5
    n_jobs=-1,
    verbose=1
)
svm_grid.fit(x_train_tfidf, y_train)

print("Best SVM params:", svm_grid.best_params_)
best_svm = svm_grid.best_estimator_

###############################################################################
# 9. Hyperparameter Tuning for LogisticRegression
###############################################################################
lr_clf_base = LogisticRegression(max_iter=1000, random_state=42)

param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['lbfgs', 'saga'],  # etc.
    
}

lr_grid = GridSearchCV(
    lr_clf_base,
    param_grid_lr,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1
)
lr_grid.fit(x_train_tfidf, y_train)

print("Best LR params:", lr_grid.best_params_)
best_lr = lr_grid.best_estimator_

###############################################################################
# 10. Hyperparameter Tuning for Multinomial Naive Bayes
###############################################################################
nb_clf_base = MultinomialNB()
param_grid_nb = {
    'alpha': [0.1, 1.0, 5.0]
}

nb_grid = GridSearchCV(
    nb_clf_base,
    param_grid_nb,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1
)
nb_grid.fit(x_train_tfidf, y_train)

print("Best NB params:", nb_grid.best_params_)
best_nb = nb_grid.best_estimator_

###############################################################################
# 11. Build a VotingClassifier with the best models
###############################################################################
voting_clf = VotingClassifier(
    estimators=[
        ('svm', best_svm),
        ('lr', best_lr),
        ('nb', best_nb)
    ],
    voting='soft'  
)
voting_clf.fit(x_train_tfidf, y_train)

###############################################################################
# 12. Evaluate on the Validation Split
###############################################################################
y_val_pred = voting_clf.predict(x_val_tfidf)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("\nValidation Accuracy:", val_accuracy)
print("\nValidation Classification Report:")
print(classification_report(y_val, y_val_pred, zero_division=0))

###############################################################################
# 13. Retrain on all labeled data (train+val) for final model

###############################################################################
X_full_tfidf = vectorizer.fit_transform(all_corpus)
voting_clf.fit(X_full_tfidf, all_labels)
y_full_pred = voting_clf.predict(X_full_tfidf)
full_accuracy = accuracy_score(all_labels, y_full_pred)
print("\nFull Accuracy:", full_accuracy)
print("\nFull Classification Report:")
print(classification_report(all_labels, y_full_pred, zero_division=0))

###############################################################################
# 14. Evaluate on test-classification-round2.dat
###############################################################################
test_data_round3_path = "C:/Users/onurm/Desktop/CS412-project/test-classification-round3.dat"


round2_usernames = []
with open(test_data_round3_path, "rt") as fh:
    for line in fh:
        round2_usernames.append(line.strip())



ground_truth_round2 = {}  


X_round2_corpus = []
round2_labels_if_known = []
found_usernames = []

for uname in round2_usernames:
    # Check if in unlabeled set:
    if uname in username2posts_unlabeled:
        posts = username2posts_unlabeled[uname]
    elif uname in username2posts_train:
        # Possibly was part of the labeled set
        posts = username2posts_train[uname]
    else:
        # Not found at all => empty
        posts = []
    
    cleaned_captions = []
    for p in posts:
        cap = p.get("caption", "")
        if cap:
            cleaned_captions.append(preprocess_text(cap))
    joined_text = "\n".join(cleaned_captions)
    X_round2_corpus.append(joined_text)
    found_usernames.append(uname)

    # If we do have ground truth for round2
    if uname in ground_truth_round2:
        round2_labels_if_known.append(ground_truth_round2[uname])
    else:
        # or an empty string (meaning unknown)
        round2_labels_if_known.append(None)

# TF-IDF transform
X_round2_tfidf = vectorizer.transform(X_round2_corpus)

# Predict
round2_predictions = voting_clf.predict(X_round2_tfidf)


actual_labels = [l for l in round2_labels_if_known if l is not None]
if len(actual_labels) == len(round2_labels_if_known) and len(actual_labels) > 0:
    
    round2_accuracy = accuracy_score(round2_labels_if_known, round2_predictions)
    print("\nROUND3 Accuracy:", round2_accuracy)
    print("\nROUND3 Classification Report:")
    print(classification_report(round2_labels_if_known, round2_predictions, zero_division=0))
else:
    print("\nNo ground-truth labels provided for round2, so only predictions are generated.")


round2_output = {u: p for u, p in zip(found_usernames, round2_predictions)}
with open("prediction-classification-round3", "w", encoding='utf-8') as of:
    json.dump(round2_output, of, indent=4, ensure_ascii=False)

print("\nDone! Round2 predictions saved to 'prediction-classification-round3'.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\onurm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Fitting 3 folds for each of 12 candidates, totalling 36 fits




Best SVM params: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best LR params: {'C': 10, 'solver': 'lbfgs'}
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best NB params: {'alpha': 0.1}

Validation Accuracy: 0.6648451730418944

Validation Classification Report:
                      precision    recall  f1-score   support

                 art       0.36      0.11      0.16        38
       entertainment       0.49      0.45      0.47        65
             fashion       0.58      0.70      0.64        60
                food       0.87      0.91      0.89       102
              gaming       0.00      0.00      0.00         3
health and lifestyle       0.64      0.75      0.69       100
    mom and children       0.63      0.40      0.49        30
              sports       0.83      0.65      0.73        23
                tech       0.66      0.80      0.72        69
              travel       0.65      0.68   