# Authors: Leor Yomtobian and Yash Surve

# In [None]:
# this code is using Logistic Regression as the Classifier
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import cross_val_score

# Data locations (bare file names, so resolved against the working directory).
# The three input files are bound in one tuple assignment; the output follows.
file_name_train, file_name_test, ground_truth_file = (
    'trainIdx2_matrix.txt',     # parsed later as "userID|itemID|score" rows
    'testTrack_hierarchy.txt',  # parsed later as "userID|trackID|albumID|artistID|genreID..." rows
    'test2_new.txt',            # parsed later as "userID|trackID|label" rows
)
output_file = 'submission_lr_optimized_v8.csv'  # prediction CSV produced by this cell

# Load ground truth
# Every non-blank line is expected to be "userID|trackID|label". Each pair is
# stored under the composite key "userID_trackID" with an integer label.
print("Loading ground truth...")
ground_truth = {}
with open(ground_truth_file, 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing newline at end of file); they
            # would otherwise fail the three-way unpack below with ValueError.
            continue
        userID, trackID, label = line.split('|')
        ground_truth[f"{userID}_{trackID}"] = int(label)
# Recover the distinct users and tracks from the composite keys.
gt_users = {k.split('_')[0] for k in ground_truth}
gt_tracks = {k.split('_')[1] for k in ground_truth}
positive_count = sum(1 for v in ground_truth.values() if v == 1)
negative_count = sum(1 for v in ground_truth.values() if v == 0)
print(f"Loaded {len(ground_truth)} ground truth labels ({positive_count} positive, {negative_count} negative).")
print(f"Unique users in ground truth: {len(gt_users)}")

# Load test hierarchy
# Each line is parsed as "userID|trackID|albumID|artistID|genreID|genreID...".
# The literal string "None" in the album/artist column means "no such item".
print("Loading test hierarchy for trackIDs...")
test_tracks = {}
with open(file_name_test, 'r') as fTest:
    for line in fTest:
        parts = line.strip().split('|')
        if len(parts) < 2:
            # Blank or truncated line: there is no user/track pair to record.
            continue
        userID, trackID = parts[0], parts[1]
        # Guard the album column the same way the artist column is guarded, so
        # a short row yields None instead of raising IndexError.
        albumID = parts[2] if len(parts) > 2 and parts[2] != "None" else None
        artistID = parts[3] if len(parts) > 3 and parts[3] != "None" else None
        genreIDs = parts[4:]  # slicing past the end already yields []
        test_tracks[f"{userID}_{trackID}"] = {
            'trackID': trackID,
            'albumID': albumID,
            'artistID': artistID,
            'genreIDs': genreIDs
        }
test_track_ids = {t['trackID'] for t in test_tracks.values()}
print(f"Loaded {len(test_tracks)} user-track pairs.")
print(f"Ground truth tracks in test hierarchy: {len(gt_tracks & test_track_ids)}/{len(gt_tracks)}")

# Load training data
# Every non-blank line is expected to be "userID|itemID|score" with an integer score.
print("Reading training data...")
train_data = {}            # userID -> {itemID: score}; a repeated item keeps its last score
user_ratings = {}          # userID -> every score read for that user, in file order
all_scores = []            # every score in the file
artist_global_scores = {}  # itemID starting with 'artist_' -> scores from all users

with open(file_name_train, 'r') as fTrain:
    for line in fTrain:
        line = line.strip()
        if not line:
            # Blank lines would otherwise break the three-way unpack below.
            continue
        userID, itemID, score = line.split('|')
        score = int(score)
        train_data.setdefault(userID, {})[itemID] = score
        all_scores.append(score)
        user_ratings.setdefault(userID, []).append(score)
        # NOTE(review): only IDs literally prefixed with 'artist_' are collected
        # here, while later lookups use the artist column of the test hierarchy
        # as the key -- confirm both files use the same ID format.
        if itemID.startswith('artist_'):
            artist_global_scores.setdefault(itemID, []).append(score)

global_avg_score = np.mean(all_scores) if all_scores else 0
user_avg_scores = {
    uid: np.mean(scores) if scores else global_avg_score for uid, scores in user_ratings.items()
}
# Despite the name, this holds the standard deviation (np.std) of each user's scores.
user_rating_variance = {
    uid: np.std(scores) if scores else 0 for uid, scores in user_ratings.items()
}
# Log-transform and cap user_rating_count
user_rating_count = {uid: min(np.log1p(len(scores)), 5.0) for uid, scores in user_ratings.items()}
# Global artist averages for cold-start
artist_global_avg = {
    aid: np.mean(scores) if scores else global_avg_score for aid, scores in artist_global_scores.items()
}
print(f"Training data loaded. Global average score: {global_avg_score:.2f}")
variance_values = list(user_rating_variance.values())
count_values = list(user_rating_count.values())
print(f"User variance mean: {np.mean(variance_values):.2f}, std: {np.std(variance_values):.2f}")
print(f"User rating count mean: {np.mean(count_values):.2f}, std: {np.std(count_values):.2f}")
# For each ground-truth user, count how many ground-truth track IDs (from any
# user) appear among that user's rated items. The key-view intersection gives
# the same count as testing every track one by one in a Python loop.
user_rating_coverage = {uid: len(train_data.get(uid, {}).keys() & gt_tracks) for uid in gt_users}
print(f"Users with ratings for ground truth tracks: {sum(1 for v in user_rating_coverage.values() if v > 0)}/{len(gt_users)}")

# Debug aid: print a few item IDs from the training data next to a few
# ground-truth track IDs so their formats can be compared by eye.
sample_train_items = []
for uid in list(train_data)[:5]:
    # At most the first five items of each of the first five users.
    first_items = list(train_data[uid])[:5]
    sample_train_items += first_items
print(f"Sample training item IDs: {sample_train_items[:10]}")
print(f"Sample ground truth track IDs: {list(gt_tracks)[:5]}")

# Prepare training data
# One 10-element feature row is built for every labelled (user, track) pair in
# ground_truth; the label of that pair is the training target.
print("Preparing training data for LR...")
X_train = []
y_train = []
users_processed = set()
# Per-feature value lists, used only by the diagnostics printed afterwards.
# The key order mirrors the order of the `features` row built in the loop.
feature_stats = {
    'artist': [], 'genre': [], 'has_rating': [], 'track_score_missing': [],
    'album_score_missing': [], 'artist_score_missing': [], 'genre_score_missing': [],
    'artist_score_weighted': [], 'user_variance': [], 'user_rating_count': []
}
raw_counts = {'track': 0, 'album': 0, 'artist': 0, 'genre': 0}
label_ratings = {
    'positive': {'artist': [], 'genre': [], 'artist_score_weighted': []},
    'negative': {'artist': [], 'genre': [], 'artist_score_weighted': []}
}
track_label_counts = {tid: {'pos': 0, 'neg': 0} for tid in gt_tracks}
track_rating_debug = []

# Index the ground-truth keys by user in a single pass. Filtering every key
# with startswith() once per user costs O(users * pairs); this index yields the
# same keys, in the same order, for each user.
gt_keys_by_user = {}
for gt_key in ground_truth:
    gt_keys_by_user.setdefault(gt_key.split('_')[0], []).append(gt_key)

for userID in gt_users:
    u_ratings = train_data.get(userID, {})
    user_track_keys = gt_keys_by_user.get(userID, [])
    if not user_track_keys:
        print(f"Warning: User {userID} has no tracks in ground truth. Skipping.")
        continue

    for track_key in user_track_keys:
        trackID = track_key.split('_')[1]
        label = ground_truth[track_key]
        track_label_counts[trackID]['pos' if label == 1 else 'neg'] += 1

        # Extract features
        t_data = test_tracks.get(track_key, {'trackID': trackID, 'albumID': None, 'artistID': None, 'genreIDs': []})
        # Try raw trackID and prefixed versions
        raw_track_score = u_ratings.get(t_data['trackID'], u_ratings.get(f"track_{t_data['trackID']}", 0))
        raw_album_score = u_ratings.get(t_data['albumID'], 0) if t_data['albumID'] else 0
        raw_artist_score = u_ratings.get(t_data['artistID'], 0) if t_data['artistID'] else 0
        genre_scores = [u_ratings.get(gid, 0) for gid in t_data['genreIDs']]
        # A score of 0 is treated as "not rated" throughout this block.
        genre_rated = [s for s in genre_scores if s > 0]
        raw_genre_score = np.mean(genre_rated) if genre_rated else 0

        # Debug track ratings
        track_rating_debug.append(raw_track_score)
        if raw_track_score > 0:
            raw_counts['track'] += 1
        if raw_album_score > 0:
            raw_counts['album'] += 1
        if raw_artist_score > 0:
            raw_counts['artist'] += 1
        if raw_genre_score > 0:
            raw_counts['genre'] += 1

        has_rating = 1 if raw_track_score > 0 else 0
        track_score_missing = 1 if raw_track_score == 0 else 0
        album_score_missing = 1 if raw_album_score == 0 else 0
        artist_score_missing = 1 if raw_artist_score == 0 else 0
        genre_score_missing = 1 if not genre_rated else 0
        # Raw artist score, forced to 0 when the artist score is missing.
        artist_score_weighted = raw_artist_score * (1 - artist_score_missing)
        user_variance = user_rating_variance.get(userID, 0)
        user_rcount = user_rating_count.get(userID, 0)
        # Missing artist/genre scores fall back to the user's mean score (or
        # the global mean for users without training data); present scores
        # are capped at 100.
        default_score = user_avg_scores.get(userID, global_avg_score)
        artist_score = min(raw_artist_score, 100) if raw_artist_score > 0 else default_score
        genre_score = min(raw_genre_score, 100) if raw_genre_score > 0 else default_score

        features = [
            artist_score,
            genre_score,
            has_rating,
            track_score_missing,
            album_score_missing,
            artist_score_missing,
            genre_score_missing,
            artist_score_weighted,
            user_variance,
            user_rcount
        ]
        X_train.append(features)
        y_train.append(label)

        # Track feature statistics and label ratings
        # feature_stats keys were declared in the same order as `features`.
        for stat_name, stat_value in zip(feature_stats, features):
            feature_stats[stat_name].append(stat_value)
        label_bucket = label_ratings['positive' if label == 1 else 'negative']
        label_bucket['artist'].append(artist_score)
        label_bucket['genre'].append(genre_score)
        label_bucket['artist_score_weighted'].append(artist_score_weighted)

    users_processed.add(userID)

X_train = np.array(X_train)
y_train = np.array(y_train)
print(f"Prepared {len(X_train)} training samples from {len(users_processed)} users.")
# Diagnostics only: nothing below changes X_train / y_train.
print("Feature statistics (non-zero counts):")
binary_flag_features = ['has_rating', 'track_score_missing', 'album_score_missing', 'artist_score_missing', 'genre_score_missing']
for fname, fvals in feature_stats.items():
    if fname in binary_flag_features:
        # 0/1 indicators: count the ones.
        non_zero = len([v for v in fvals if v != 0])
    else:
        # Other features: a value equal to the global average or to 0 is not counted.
        non_zero = len([v for v in fvals if not (v == global_avg_score or v == 0)])
    print(f"  {fname}: {non_zero}/{len(fvals)} non-zero values")
print("Raw rating counts (before defaulting):")
for fname, count in raw_counts.items():
    print(f"  {fname}: {count}/{len(X_train)} non-zero values")
rated_track_rows = len([v for v in track_rating_debug if v > 0])
print(f"Track rating debug: {rated_track_rows}/{len(track_rating_debug)} non-zero raw_track_score")
# Mean and spread of selected features, split by label.
for ftype in ['artist', 'genre', 'artist_score_weighted']:
    pos_values = label_ratings['positive'][ftype]
    neg_values = label_ratings['negative'][ftype]
    print(f"Average {ftype}_score for positive labels: {np.mean(pos_values):.2f}")
    print(f"Average {ftype}_score for negative labels: {np.mean(neg_values):.2f}")
    print(f"Std {ftype}_score for positive labels: {np.std(pos_values):.2f}")
    print(f"Std {ftype}_score for negative labels: {np.std(neg_values):.2f}")
# Tracks that carry both a positive and a negative label across users.
mixed_label_tracks = len([v for v in track_label_counts.values() if v['pos'] > 0 and v['neg'] > 0])
print(f"Tracks with mixed labels: {mixed_label_tracks}/{len(gt_tracks)}")
# Feature correlations
feature_names = ['artist', 'genre', 'has_rating', 'track_score_missing', 'album_score_missing', 'artist_score_missing', 'genre_score_missing', 'artist_score_weighted', 'user_variance', 'user_rating_count']
corr_matrix = np.corrcoef(X_train.T)
print("Feature correlations:")
# Report each unordered feature pair once, and only when |r| exceeds 0.5.
for i, fname in enumerate(feature_names):
    for j, other_name in enumerate(feature_names[i + 1:], start=i + 1):
        pair_corr = corr_matrix[i, j]
        if abs(pair_corr) > 0.5:
            print(f"  {fname} vs {other_name}: {pair_corr:.2f}")

# Train Logistic Regression model
# The scaler and model fitted here are reused later for the test predictions.
print("Training Logistic Regression model...")
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
lr_model = LogisticRegression(random_state=42, max_iter=2000, solver='lbfgs', class_weight='balanced', C=2.0)
lr_model.fit(X_train_scaled, y_train)
# NOTE(review): the scaler was fitted on every row before cross-validation, so
# each held-out fold has already influenced the scaling statistics.
cv_scores = cross_val_score(lr_model, X_train_scaled, y_train, cv=5, scoring='accuracy')
cv_precision_scores = cross_val_score(lr_model, X_train_scaled, y_train, cv=5, scoring='precision')
print("Model training completed.")
print("Feature coefficients:", dict(zip(feature_names, lr_model.coef_[0])))
print("Feature means after scaling:", np.mean(X_train_scaled, axis=0))
print("Feature stds after scaling:", np.std(X_train_scaled, axis=0))
print(f"Cross-validation accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
# scoring='precision' is the binary precision of the model's own class
# predictions, not a per-user top-3 metric, so it is labelled as plain precision.
print(f"Cross-validation precision: {np.mean(cv_precision_scores):.4f} ± {np.std(cv_precision_scores):.4f}")

# Output predictions
# The test hierarchy is streamed line by line. Consecutive lines of the same
# user form one block; each block is scored and written as soon as the next
# user starts (and once more after the last line).


def _pair_features(u_ratings, t_data, default_score, user_variance, user_rcount):
    """Build the 10-element feature row for one (user, track) pair.

    u_ratings is the user's {itemID: score} dict, t_data the track's hierarchy
    dict ('trackID', 'albumID', 'artistID', 'genreIDs'). A score of 0 is
    treated as "not rated". The row uses the same column order as X_train.
    """
    tID = t_data['trackID']
    # Try the raw track ID first, then the 'track_'-prefixed form.
    raw_track_score = u_ratings.get(tID, u_ratings.get(f"track_{tID}", 0))
    raw_album_score = u_ratings.get(t_data['albumID'], 0) if t_data['albumID'] else 0
    raw_artist_score = u_ratings.get(t_data['artistID'], 0) if t_data['artistID'] else 0
    genre_scores = [u_ratings.get(gid, 0) for gid in t_data['genreIDs']]
    genre_rated = [s for s in genre_scores if s > 0]
    raw_genre_score = np.mean(genre_rated) if genre_rated else 0

    artist_score_missing = 1 if raw_artist_score == 0 else 0
    return [
        min(raw_artist_score, 100) if raw_artist_score > 0 else default_score,  # artist_score
        min(raw_genre_score, 100) if raw_genre_score > 0 else default_score,    # genre_score
        1 if raw_track_score > 0 else 0,                                        # has_rating
        1 if raw_track_score == 0 else 0,                                       # track_score_missing
        1 if raw_album_score == 0 else 0,                                       # album_score_missing
        artist_score_missing,
        1 if not genre_rated else 0,                                            # genre_score_missing
        raw_artist_score * (1 - artist_score_missing),                          # artist_score_weighted
        user_variance,
        user_rcount,
    ]


def _top_k_flags(scores, k, n_items):
    """Return a 0/1 list of length n_items with 1 at the k highest-scoring positions."""
    top_idx = np.argsort(-np.array(scores))[:k]
    return [1 if i in top_idx else 0 for i in range(n_items)]


def _emit_user_block(out, user_id, block_metadata, seen_keys):
    """Score one user's tracks and write the not-yet-written pairs to `out`.

    Returns (unseen_increment, fallback_positive_count): the first is 1 when
    the user has no training ratings, the second is the number of positive
    predictions produced by either artist-score fallback.
    """
    u_ratings = train_data.get(user_id, {})
    n_items = len(block_metadata)
    unseen_increment = 0
    fallback_positive_count = 0

    if user_id not in train_data or not u_ratings:
        unseen_increment = 1
        # Fallback to weighted global artist_score
        artist_scores = []
        for t_data in block_metadata:
            artist_score = artist_global_avg.get(t_data['artistID'], global_avg_score) if t_data['artistID'] else global_avg_score
            weight = 1.2 if artist_score > global_avg_score else 0.8
            artist_scores.append(artist_score * weight)
        preds = _top_k_flags(artist_scores, 3, n_items)
        fallback_positive_count = sum(preds)
    else:
        default_score = user_avg_scores.get(user_id, global_avg_score)
        user_variance = user_rating_variance.get(user_id, 0)
        user_rcount = user_rating_count.get(user_id, 0)
        X_test = np.array([
            _pair_features(u_ratings, t_data, default_score, user_variance, user_rcount)
            for t_data in block_metadata
        ])
        probs = lr_model.predict_proba(scaler.transform(X_test))[:, 1]
        prob_diff = np.max(probs) - np.min(probs)
        if prob_diff < 0.1:  # Fallback for low confidence
            # Rank by the user's own artist score; rows whose artist score is
            # missing (column 5 == 1) count as 0.
            artist_scores = [row[0] * (1 - row[5]) for row in X_test]
            preds = _top_k_flags(artist_scores, 3, n_items)
            fallback_positive_count = sum(preds)
        else:
            # Mark three tracks only when the probabilities are well separated.
            preds = _top_k_flags(probs, 3 if prob_diff > 0.25 else 2, n_items)

    for t_data, pred in zip(block_metadata, preds):
        t_key = f"{user_id}_{t_data['trackID']}"
        if t_key not in seen_keys:
            out.write(f"{t_key},{pred}\n")
            seen_keys.add(t_key)
    return unseen_increment, fallback_positive_count


# The with-statement closes the output file even if scoring raises part-way.
with open(output_file, 'w') as fOut:
    fOut.write("TrackID,Predictor\n")
    written_keys = set()
    unseen_users = 0
    cold_start_preds = 0

    print("Processing test data...")
    with open(file_name_test, 'r') as fTest:
        lastUserID = None
        user_metadata = []

        for line in fTest:
            parts = line.strip().split('|')
            if len(parts) < 2:
                # Blank or truncated line: no user/track pair to predict.
                continue
            userID = parts[0]
            trackID = parts[1]
            albumID = parts[2] if len(parts) > 2 and parts[2] != "None" else None
            artistID = parts[3] if len(parts) > 3 and parts[3] != "None" else None
            genreIDs = parts[4:]

            if userID != lastUserID and lastUserID is not None and user_metadata:
                newly_unseen, fallback_hits = _emit_user_block(fOut, lastUserID, user_metadata, written_keys)
                unseen_users += newly_unseen
                cold_start_preds += fallback_hits
                user_metadata = []

            user_metadata.append({'trackID': trackID, 'albumID': albumID, 'artistID': artistID, 'genreIDs': genreIDs})
            lastUserID = userID

        # Handle last user block
        if user_metadata:
            newly_unseen, fallback_hits = _emit_user_block(fOut, lastUserID, user_metadata, written_keys)
            unseen_users += newly_unseen
            cold_start_preds += fallback_hits

print(f"Submission file '{output_file}' written with {len(written_keys)} predictions.")
print(f"Unseen users in test set: {unseen_users}")
print(f"Cold-start predictions (positive): {cold_start_preds}")

# Evaluate on ground truth
# NOTE(review): these are the same labelled pairs the model was fitted on, so
# the numbers printed here are in-sample.
print("Evaluating on ground truth...")
y_true = []
y_pred = []
eval_users = set()
prob_dist = []

# Index the ground-truth keys by user in a single pass. Filtering every key
# with startswith() once per user costs O(users * pairs); this index yields the
# same keys, in the same order, for each user.
gt_keys_by_user = {}
for gt_key in ground_truth:
    gt_keys_by_user.setdefault(gt_key.split('_')[0], []).append(gt_key)

for userID in gt_users:
    user_track_keys = gt_keys_by_user.get(userID, [])
    if not user_track_keys:
        print(f"Warning: User {userID} has no tracks in ground truth. Skipping.")
        continue

    u_ratings = train_data.get(userID, {})
    default_score = user_avg_scores.get(userID, global_avg_score)
    X_test = []

    for track_key in user_track_keys:
        t_data = test_tracks.get(track_key, {'trackID': track_key.split('_')[1], 'albumID': None, 'artistID': None, 'genreIDs': []})
        raw_track_score = u_ratings.get(t_data['trackID'], u_ratings.get(f"track_{t_data['trackID']}", 0))
        raw_album_score = u_ratings.get(t_data['albumID'], 0) if t_data['albumID'] else 0
        raw_artist_score = u_ratings.get(t_data['artistID'], 0) if t_data['artistID'] else 0
        genre_scores = [u_ratings.get(gid, 0) for gid in t_data['genreIDs']]
        # A score of 0 is treated as "not rated".
        genre_rated = [s for s in genre_scores if s > 0]
        raw_genre_score = np.mean(genre_rated) if genre_rated else 0

        has_rating = 1 if raw_track_score > 0 else 0
        track_score_missing = 1 if raw_track_score == 0 else 0
        album_score_missing = 1 if raw_album_score == 0 else 0
        artist_score_missing = 1 if raw_artist_score == 0 else 0
        genre_score_missing = 1 if not genre_rated else 0
        artist_score_weighted = raw_artist_score * (1 - artist_score_missing)
        user_variance = user_rating_variance.get(userID, 0)
        user_rcount = user_rating_count.get(userID, 0)
        artist_score = min(raw_artist_score, 100) if raw_artist_score > 0 else default_score
        genre_score = min(raw_genre_score, 100) if raw_genre_score > 0 else default_score

        # Same column order as the training rows.
        features = [
            artist_score,
            genre_score,
            has_rating,
            track_score_missing,
            album_score_missing,
            artist_score_missing,
            genre_score_missing,
            artist_score_weighted,
            user_variance,
            user_rcount
        ]
        X_test.append(features)

    X_test = np.array(X_test)
    X_test_scaled = scaler.transform(X_test)
    probs = lr_model.predict_proba(X_test_scaled)[:, 1]
    prob_diff = np.max(probs) - np.min(probs)
    # Mark three tracks only when the probabilities are well separated, else two.
    top3 = np.argsort(-probs)[:3 if prob_diff > 0.25 else 2]
    preds = [1 if i in top3 else 0 for i in range(len(user_track_keys))]
    if prob_diff < 0.1:
        # Low confidence: rank by the user's own artist score instead; rows
        # whose artist score is missing (column 5 == 1) count as 0.
        artist_scores = [X_test[i][0] * (1 - X_test[i][5]) for i in range(len(X_test))]
        top3 = np.argsort(-np.array(artist_scores))[:3]
        preds = [1 if i in top3 else 0 for i in range(len(user_track_keys))]
        # NOTE(review): this counter has already been printed by the time the
        # evaluation runs; the increment is kept only to preserve behaviour.
        cold_start_preds += sum(preds)

    y_pred.extend(preds)
    y_true.extend([ground_truth[k] for k in user_track_keys])
    prob_dist.extend(probs)
    eval_users.add(userID)

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='binary')
cm = confusion_matrix(y_true, y_pred)
print(f"Ground truth accuracy: {accuracy:.4f} over {len(y_true)} samples from {len(eval_users)} users.")
print(f"Precision@3: {precision:.4f}")
print(f"Confusion matrix:\n{cm}")
print(f"Prediction probability distribution: mean={np.mean(prob_dist):.4f}, std={np.std(prob_dist):.4f}, min={np.min(prob_dist):.4f}, max={np.max(prob_dist):.4f}")

# In [None]:
# this code is using Random Forest as the Classifier
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import cross_val_score, RandomizedSearchCV

# Data locations (bare file names, so resolved against the working directory).
# The three input files are bound in one tuple assignment; the output follows.
file_name_train, file_name_test, ground_truth_file = (
    'trainIdx2_matrix.txt',     # parsed later as "userID|itemID|score" rows
    'testTrack_hierarchy.txt',  # parsed later as "userID|trackID|albumID|artistID|genreID..." rows
    'test2_new.txt',            # parsed later as "userID|trackID|label" rows
)
output_file = 'submission_rf_optimized_v13.csv'  # output file name for this cell

# Load ground truth
# Every non-blank line is expected to be "userID|trackID|label". Each pair is
# stored under the composite key "userID_trackID" with an integer label.
print("Loading ground truth...")
ground_truth = {}
with open(ground_truth_file, 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing newline at end of file); they
            # would otherwise fail the three-way unpack below with ValueError.
            continue
        userID, trackID, label = line.split('|')
        ground_truth[f"{userID}_{trackID}"] = int(label)
# Recover the distinct users and tracks from the composite keys.
gt_users = {k.split('_')[0] for k in ground_truth}
gt_tracks = {k.split('_')[1] for k in ground_truth}
positive_count = sum(1 for v in ground_truth.values() if v == 1)
negative_count = sum(1 for v in ground_truth.values() if v == 0)
print(f"Loaded {len(ground_truth)} ground truth labels ({positive_count} positive, {negative_count} negative).")
print(f"Unique users in ground truth: {len(gt_users)}")

# Load test hierarchy
# Each line is parsed as "userID|trackID|albumID|artistID|genreID|genreID...".
# The literal string "None" in the album/artist column means "no such item".
print("Loading test hierarchy for trackIDs...")
test_tracks = {}
with open(file_name_test, 'r') as fTest:
    for line in fTest:
        parts = line.strip().split('|')
        if len(parts) < 2:
            # Blank or truncated line: there is no user/track pair to record.
            continue
        userID, trackID = parts[0], parts[1]
        # Guard the album column the same way the artist column is guarded, so
        # a short row yields None instead of raising IndexError.
        albumID = parts[2] if len(parts) > 2 and parts[2] != "None" else None
        artistID = parts[3] if len(parts) > 3 and parts[3] != "None" else None
        genreIDs = parts[4:]  # slicing past the end already yields []
        test_tracks[f"{userID}_{trackID}"] = {
            'trackID': trackID,
            'albumID': albumID,
            'artistID': artistID,
            'genreIDs': genreIDs
        }
test_track_ids = {t['trackID'] for t in test_tracks.values()}
print(f"Loaded {len(test_tracks)} user-track pairs.")
print(f"Ground truth tracks in test hierarchy: {len(gt_tracks & test_track_ids)}/{len(gt_tracks)}")

# Load training data
# Every non-blank line is expected to be "userID|itemID|score" with an integer score.
print("Reading training data...")
train_data = {}            # userID -> {itemID: score}; a repeated item keeps its last score
user_ratings = {}          # userID -> every score read for that user, in file order
all_scores = []            # every score in the file
artist_global_scores = {}  # itemID starting with 'artist_' -> scores from all users
genre_global_scores = {}   # itemID starting with 'genre_' -> scores from all users

with open(file_name_train, 'r') as fTrain:
    for line in fTrain:
        line = line.strip()
        if not line:
            # Blank lines would otherwise break the three-way unpack below.
            continue
        userID, itemID, score = line.split('|')
        score = int(score)
        train_data.setdefault(userID, {})[itemID] = score
        all_scores.append(score)
        user_ratings.setdefault(userID, []).append(score)
        # NOTE(review): only IDs literally prefixed with 'artist_' / 'genre_'
        # are collected here, while later lookups use the artist column of the
        # test hierarchy as the key -- confirm both files use the same ID format.
        if itemID.startswith('artist_'):
            artist_global_scores.setdefault(itemID, []).append(score)
        if itemID.startswith('genre_'):
            genre_global_scores.setdefault(itemID, []).append(score)

global_avg_score = np.mean(all_scores) if all_scores else 0
user_avg_scores = {
    uid: np.mean(scores) if scores else global_avg_score for uid, scores in user_ratings.items()
}
# Despite the name, this holds the standard deviation (np.std) of each user's scores.
user_rating_variance = {
    uid: np.std(scores) if scores else 0 for uid, scores in user_ratings.items()
}
# Log-transform and cap user_rating_count
user_rating_count = {uid: min(np.log1p(len(scores)), 5.0) for uid, scores in user_ratings.items()}
# Global artist and genre averages for new features and cold-start
artist_global_avg = {
    aid: np.mean(scores) if scores else global_avg_score for aid, scores in artist_global_scores.items()
}
genre_global_avg = {
    gid: np.mean(scores) if scores else global_avg_score for gid, scores in genre_global_scores.items()
}
print(f"Training data loaded. Global average score: {global_avg_score:.2f}")
variance_values = list(user_rating_variance.values())
count_values = list(user_rating_count.values())
print(f"User variance mean: {np.mean(variance_values):.2f}, std: {np.std(variance_values):.2f}")
print(f"User rating count mean: {np.mean(count_values):.2f}, std: {np.std(count_values):.2f}")
# For each ground-truth user, count how many ground-truth track IDs (from any
# user) appear among that user's rated items. The key-view intersection gives
# the same count as testing every track one by one in a Python loop.
user_rating_coverage = {uid: len(train_data.get(uid, {}).keys() & gt_tracks) for uid in gt_users}
print(f"Users with ratings for ground truth tracks: {sum(1 for v in user_rating_coverage.values() if v > 0)}/{len(gt_users)}")

# Debug aid: print a few item IDs from the training data next to a few
# ground-truth track IDs so their formats can be compared by eye.
sample_train_items = []
for uid in list(train_data)[:5]:
    # At most the first five items of each of the first five users.
    first_items = list(train_data[uid])[:5]
    sample_train_items += first_items
print(f"Sample training item IDs: {sample_train_items[:10]}")
print(f"Sample ground truth track IDs: {list(gt_tracks)[:5]}")

# Prepare training data
print("Preparing training data for RF...")
X_train = []
y_train = []
users_processed = set()
feature_stats = {
    'artist': [], 'genre': [], 'album_score_missing': [], 'artist_score_missing': [],
    'genre_score_missing': [], 'artist_score_weighted': [], 'user_variance': [],
    'user_rating_count': [], 'genre_count': [], 'artist_popularity': [], 'genre_score_weighted': []
}
raw_counts = {'track': 0, 'album': 0, 'artist': 0, 'genre': 0}
label_ratings = {
    'positive': {'artist': [], 'genre': [], 'artist_score_weighted': [], 'genre_count': [], 'artist_popularity': [], 'genre_score_weighted': []},
    'negative': {'artist': [], 'genre': [], 'artist_score_weighted': [], 'genre_count': [], 'artist_popularity': [], 'genre_score_weighted': []}
}
track_label_counts = {tid: {'pos': 0, 'neg': 0} for tid in gt_tracks}
track_rating_debug = []

for userID in gt_users:
    u_ratings = train_data.get(userID, {})
    user_track_keys = [k for k in ground_truth.keys() if k.startswith(userID + '_')]
    if not user_track_keys:
        print(f"Warning: User {userID} has no tracks in ground truth. Skipping.")
        continue

    for track_key in user_track_keys:
        trackID = track_key.split('_')[1]
        label = ground_truth[track_key]
        track_label_counts[trackID]['pos' if label == 1 else 'neg'] += 1

        # Extract features
        t_data = test_tracks.get(track_key, {'trackID': trackID, 'albumID': None, 'artistID': None, 'genreIDs': []})
        raw_track_score = u_ratings.get(t_data['trackID'], u_ratings.get(f"track_{t_data['trackID']}", 0))
        raw_album_score = u_ratings.get(t_data['albumID'], 0) if t_data['albumID'] else 0
        raw_artist_score = u_ratings.get(t_data['artistID'], 0) if t_data['artistID'] else 0
        genre_scores = [u_ratings.get(gid, 0) for gid in t_data['genreIDs']]
        genre_rated = [s for s in genre_scores if s > 0]
        raw_genre_score = np.mean(genre_rated) if genre_rated else 0
        genre_count = len(t_data['genreIDs'])
        # New features
        artist_popularity = artist_global_avg.get(t_data['artistID'], global_avg_score) if t_data['artistID'] else global_avg_score
        genre_score_weighted = raw_genre_score * (len(genre_rated) / len(t_data['genreIDs']) if t_data['genreIDs'] else 0)

        # Debug track ratings
        track_rating_debug.append(raw_track_score)
        if raw_track_score > 0:
            raw_counts['track'] += 1
        if raw_album_score > 0:
            raw_counts['album'] += 1
        if raw_artist_score > 0:
            raw_counts['artist'] += 1
        if raw_genre_score > 0:
            raw_counts['genre'] += 1

        album_score_missing = 1 if raw_album_score == 0 else 0
        artist_score_missing = 1 if raw_artist_score == 0 else 0
        genre_score_missing = 1 if not genre_rated else 0
        artist_score_weighted = raw_artist_score * (1 - artist_score_missing)
        user_variance = user_rating_variance.get(userID, 0)
        user_rcount = user_rating_count.get(userID, 0)
        default_score = user_avg_scores.get(userID, global_avg_score)
        artist_score = min(raw_artist_score, 100) if raw_artist_score > 0 else default_score
        genre_score = min(raw_genre_score, 100) if raw_genre_score > 0 else default_score

        features = [
            artist_score,
            genre_score,
            album_score_missing,
            artist_score_missing,
            genre_score_missing,
            artist_score_weighted,
            user_variance,
            user_rcount,
            genre_count,
            artist_popularity,
            genre_score_weighted
        ]
        X_train.append(features)
        y_train.append(label)

        # Track feature statistics and label ratings
        feature_stats['artist'].append(artist_score)
        feature_stats['genre'].append(genre_score)
        feature_stats['album_score_missing'].append(album_score_missing)
        feature_stats['artist_score_missing'].append(artist_score_missing)
        feature_stats['genre_score_missing'].append(genre_score_missing)
        feature_stats['artist_score_weighted'].append(artist_score_weighted)
        feature_stats['user_variance'].append(user_variance)
        feature_stats['user_rating_count'].append(user_rcount)
        feature_stats['genre_count'].append(genre_count)
        feature_stats['artist_popularity'].append(artist_popularity)
        feature_stats['genre_score_weighted'].append(genre_score_weighted)
        label_ratings['positive' if label == 1 else 'negative']['artist'].append(artist_score)
        label_ratings['positive' if label == 1 else 'negative']['genre'].append(genre_score)
        label_ratings['positive' if label == 1 else 'negative']['artist_score_weighted'].append(artist_score_weighted)
        label_ratings['positive' if label == 1 else 'negative']['genre_count'].append(genre_count)
        label_ratings['positive' if label == 1 else 'negative']['artist_popularity'].append(artist_popularity)
        label_ratings['positive' if label == 1 else 'negative']['genre_score_weighted'].append(genre_score_weighted)

    users_processed.add(userID)

# Freeze the accumulated rows and labels into arrays for scikit-learn.
X_train = np.array(X_train)
y_train = np.array(y_train)
print(f"Prepared {len(X_train)} training samples from {len(users_processed)} users.")

# Coverage report: indicator flags are counted when set; every other feature is
# counted when it is neither 0 nor exactly the global average score.
print("Feature statistics (non-zero counts):")
indicator_features = ('album_score_missing', 'artist_score_missing', 'genre_score_missing')
for fname, fvals in feature_stats.items():
    if fname in indicator_features:
        counted = [v for v in fvals if v != 0]
    else:
        counted = [v for v in fvals if v != 0 and v != global_avg_score]
    non_zero = len(counted)
    print(f"  {fname}: {non_zero}/{len(fvals)} non-zero values")

print("Raw rating counts (before defaulting):")
for fname, count in raw_counts.items():
    print(f"  {fname}: {count}/{len(X_train)} non-zero values")
print(f"Track rating debug: {sum(1 for v in track_rating_debug if v > 0)}/{len(track_rating_debug)} non-zero raw_track_score")

# Mean / spread of selected features, split by label.
reported_features = (
    'artist', 'genre', 'artist_score_weighted',
    'genre_count', 'artist_popularity', 'genre_score_weighted',
)
for ftype in reported_features:
    print(f"Average {ftype}_score for positive labels: {np.mean(label_ratings['positive'][ftype]):.2f}")
    print(f"Average {ftype}_score for negative labels: {np.mean(label_ratings['negative'][ftype]):.2f}")
    print(f"Std {ftype}_score for positive labels: {np.std(label_ratings['positive'][ftype]):.2f}")
    print(f"Std {ftype}_score for negative labels: {np.std(label_ratings['negative'][ftype]):.2f}")

# A track is "mixed" when it appears with both a positive and a negative label.
mixed_label_tracks = len([
    counts for counts in track_label_counts.values()
    if counts['pos'] > 0 and counts['neg'] > 0
])
print(f"Tracks with mixed labels: {mixed_label_tracks}/{len(gt_tracks)}")

# Feature correlations
feature_names = [
    'artist',
    'genre',
    'album_score_missing',
    'artist_score_missing',
    'genre_score_missing',
    'artist_score_weighted',
    'user_variance',
    'user_rating_count',
    'genre_count',
    'artist_popularity',
    'genre_score_weighted',
]
corr_matrix = np.corrcoef(X_train.T)
print("Feature correlations:")
# Walk the strict upper triangle (row-major) so each feature pair is reported once.
upper_rows, upper_cols = np.triu_indices(len(feature_names), k=1)
for i, j in zip(upper_rows, upper_cols):
    fname = feature_names[i]
    if abs(corr_matrix[i, j]) > 0.5:
        print(f"  {fname} vs {feature_names[j]}: {corr_matrix[i, j]:.2f}")

# Train Random Forest model with RandomizedSearchCV
print("Training Random Forest model with optimized hyperparameter tuning...")

# Fit the scaler on the training matrix only; the fitted scaler is reused later
# to transform test rows.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Search space sampled by the randomized search below.
param_dist = dict(
    n_estimators=[100, 200, 300, 400],
    max_depth=[5, 10, 15, 20],
    min_samples_split=[5, 10, 20],
    min_samples_leaf=[2, 5, 10],
    max_features=['sqrt', 'log2', 0.5, 0.7],
)
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42, class_weight='balanced'),
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring='precision',
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train_scaled, y_train)

# Keep the best configuration found by the search as the final model.
rf_model = random_search.best_estimator_

# 5-fold cross-validation of the selected model, once per metric.
cv_scores, cv_precision_scores = (
    cross_val_score(rf_model, X_train_scaled, y_train, cv=5, scoring=metric)
    for metric in ('accuracy', 'precision')
)

print("Model training completed.")
print(f"Best hyperparameters: {random_search.best_params_}")
print("Feature importances:", dict(zip(feature_names, rf_model.feature_importances_)))
print("Feature means after scaling:", np.mean(X_train_scaled, axis=0))
print("Feature stds after scaling:", np.std(X_train_scaled, axis=0))
print(f"Cross-validation accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
print(f"Cross-validation Precision@3: {np.mean(cv_precision_scores):.4f} ± {np.std(cv_precision_scores):.4f}")

# Output predictions
# The per-user scoring lives in the helpers below so that every user block,
# including the last one in the file, goes through exactly the same code path.
from itertools import groupby


def _rf_iter_test_rows(handle):
    """Yield ``(userID, track_metadata)`` for every line of the test hierarchy file.

    Lines are '|'-separated: userID, trackID, albumID, artistID, then any number
    of genre IDs. The literal string "None" marks a missing album/artist.
    """
    for line in handle:
        parts = line.strip().split('|')
        yield parts[0], {
            'trackID': parts[1],
            'albumID': parts[2] if parts[2] != "None" else None,
            'artistID': parts[3] if len(parts) > 3 and parts[3] != "None" else None,
            'genreIDs': parts[4:],
        }


def _rf_feature_row(t_data, u_ratings, user_id, default_score):
    """Return the 11 model features for one candidate track.

    The order matches ``feature_names`` / the rows the model was fitted on:
    artist, genre, album_score_missing, artist_score_missing,
    genre_score_missing, artist_score_weighted, user_variance,
    user_rating_count, genre_count, artist_popularity, genre_score_weighted.
    """
    artist_id = t_data['artistID']
    genre_ids = t_data['genreIDs']

    raw_album_score = u_ratings.get(t_data['albumID'], 0) if t_data['albumID'] else 0
    raw_artist_score = u_ratings.get(artist_id, 0) if artist_id else 0
    genre_rated = [s for s in (u_ratings.get(gid, 0) for gid in genre_ids) if s > 0]
    raw_genre_score = np.mean(genre_rated) if genre_rated else 0

    album_score_missing = 1 if raw_album_score == 0 else 0
    artist_score_missing = 1 if raw_artist_score == 0 else 0
    genre_score_missing = 1 if not genre_rated else 0
    # Fraction of the track's genres that this user has rated.
    genre_coverage = len(genre_rated) / len(genre_ids) if genre_ids else 0

    return [
        # Unrated artist/genre fall back to the user's average score.
        min(raw_artist_score, 100) if raw_artist_score > 0 else default_score,
        min(raw_genre_score, 100) if raw_genre_score > 0 else default_score,
        album_score_missing,
        artist_score_missing,
        genre_score_missing,
        raw_artist_score * (1 - artist_score_missing),
        user_rating_variance.get(user_id, 0),
        user_rating_count.get(user_id, 0),
        len(genre_ids),
        artist_global_avg.get(artist_id, global_avg_score) if artist_id else global_avg_score,
        raw_genre_score * genre_coverage,
    ]


def _rf_top3_flags(scores):
    """Return a 0/1 list that marks the three highest entries of ``scores``."""
    top3 = np.argsort(-np.asarray(scores))[:3]
    return [1 if i in top3 else 0 for i in range(len(scores))]


def _rf_predict_user(user_id, user_metadata):
    """Score one user's candidate tracks.

    Returns ``(preds, unseen, used_fallback)`` where ``preds`` is a 0/1 list
    aligned with ``user_metadata``, ``unseen`` is True when the user has no
    training ratings, and ``used_fallback`` is True when the model's
    probabilities were too flat and the artist-score ranking was used instead.
    """
    u_ratings = train_data.get(user_id, {})
    if not u_ratings:
        # No ratings for this user: rank by global artist average, boosting
        # above-average artists and damping the rest.
        scores = []
        for t_data in user_metadata:
            artist_id = t_data['artistID']
            popularity = artist_global_avg.get(artist_id, global_avg_score) if artist_id else global_avg_score
            weight = 1.2 if popularity > global_avg_score else 0.8
            scores.append(popularity * weight)
        return _rf_top3_flags(scores), True, False

    default_score = user_avg_scores.get(user_id, global_avg_score)
    X_test = np.array([
        _rf_feature_row(t_data, u_ratings, user_id, default_score)
        for t_data in user_metadata
    ])
    probs = rf_model.predict_proba(scaler.transform(X_test))[:, 1]
    if np.max(probs) - np.min(probs) < 0.03:
        # Probabilities are nearly indistinguishable: rank by the user's own
        # artist score, zeroed where the artist was not rated.
        return _rf_top3_flags(X_test[:, 0] * (1 - X_test[:, 3])), False, True
    return _rf_top3_flags(probs), False, False


written_keys = set()
unseen_users = 0
cold_start_preds = 0

# `with` closes (and flushes) the submission file even if scoring raises midway.
with open(output_file, 'w') as fOut:
    fOut.write("TrackID,Predictor\n")

    print("Processing test data...")
    with open(file_name_test, 'r') as fTest:
        # Consecutive lines with the same userID form one user block.
        for lastUserID, rows in groupby(_rf_iter_test_rows(fTest), key=lambda row: row[0]):
            user_metadata = [meta for _, meta in rows]
            preds, unseen, used_fallback = _rf_predict_user(lastUserID, user_metadata)

            if unseen:
                unseen_users += 1
            if unseen or used_fallback:
                cold_start_preds += sum(preds)

            for t_data, pred in zip(user_metadata, preds):
                t_key = f"{lastUserID}_{t_data['trackID']}"
                # Only the first occurrence of a user/track pair is written.
                if t_key not in written_keys:
                    fOut.write(f"{t_key},{pred}\n")
                    written_keys.add(t_key)

print(f"Submission file '{output_file}' written with {len(written_keys)} predictions.")
print(f"Unseen users in test set: {unseen_users}")
print(f"Cold-start predictions (positive): {cold_start_preds}")

# Evaluate on ground truth
print("Evaluating on ground truth...")
y_true = []
y_pred = []
eval_users = set()
prob_dist = []

# Index the ground-truth keys by user in a single pass. Keys are
# "<userID>_<trackID>", so the text before the first '_' is the user; this
# replaces a full scan of every key for every user.
gt_keys_by_user = {}
for gt_key in ground_truth:
    gt_keys_by_user.setdefault(gt_key.split('_')[0], []).append(gt_key)

for userID in gt_users:
    user_track_keys = gt_keys_by_user.get(userID, [])
    if not user_track_keys:
        print(f"Warning: User {userID} has no tracks in ground truth. Skipping.")
        continue

    # Per-user inputs are constant across this user's tracks.
    u_ratings = train_data.get(userID, {})
    default_score = user_avg_scores.get(userID, global_avg_score)
    user_variance = user_rating_variance.get(userID, 0)
    user_rcount = user_rating_count.get(userID, 0)
    X_test = []

    for track_key in user_track_keys:
        # Pairs missing from the test hierarchy get empty metadata.
        t_data = test_tracks.get(track_key, {'trackID': track_key.split('_')[1], 'albumID': None, 'artistID': None, 'genreIDs': []})
        raw_album_score = u_ratings.get(t_data['albumID'], 0) if t_data['albumID'] else 0
        raw_artist_score = u_ratings.get(t_data['artistID'], 0) if t_data['artistID'] else 0
        genre_scores = [u_ratings.get(gid, 0) for gid in t_data['genreIDs']]
        genre_rated = [s for s in genre_scores if s > 0]
        raw_genre_score = np.mean(genre_rated) if genre_rated else 0
        genre_count = len(t_data['genreIDs'])
        artist_popularity = artist_global_avg.get(t_data['artistID'], global_avg_score) if t_data['artistID'] else global_avg_score
        genre_score_weighted = raw_genre_score * (len(genre_rated) / len(t_data['genreIDs']) if t_data['genreIDs'] else 0)

        album_score_missing = 1 if raw_album_score == 0 else 0
        artist_score_missing = 1 if raw_artist_score == 0 else 0
        genre_score_missing = 1 if not genre_rated else 0
        artist_score_weighted = raw_artist_score * (1 - artist_score_missing)
        # Unrated artist/genre fall back to the user's average score.
        artist_score = min(raw_artist_score, 100) if raw_artist_score > 0 else default_score
        genre_score = min(raw_genre_score, 100) if raw_genre_score > 0 else default_score

        # Same column order as the training matrix.
        X_test.append([
            artist_score,
            genre_score,
            album_score_missing,
            artist_score_missing,
            genre_score_missing,
            artist_score_weighted,
            user_variance,
            user_rcount,
            genre_count,
            artist_popularity,
            genre_score_weighted,
        ])

    X_test = np.array(X_test)
    X_test_scaled = scaler.transform(X_test)
    probs = rf_model.predict_proba(X_test_scaled)[:, 1]
    prob_diff = np.max(probs) - np.min(probs)
    if prob_diff < 0.03:
        # Probabilities are nearly flat: rank by the user's own artist score,
        # zeroed where the artist was not rated.
        ranking_scores = X_test[:, 0] * (1 - X_test[:, 3])
    else:
        ranking_scores = probs
    top3 = np.argsort(-ranking_scores)[:3]
    preds = [1 if i in top3 else 0 for i in range(len(user_track_keys))]
    if prob_diff < 0.03:
        # NOTE(review): this counter was already printed by the submission step;
        # the increment is kept only to preserve the cell's final state.
        cold_start_preds += sum(preds)

    y_pred.extend(preds)
    y_true.extend([ground_truth[k] for k in user_track_keys])
    prob_dist.extend(probs)
    eval_users.add(userID)

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='binary')
cm = confusion_matrix(y_true, y_pred)
print(f"Ground truth accuracy: {accuracy:.4f} over {len(y_true)} samples from {len(eval_users)} users.")
print(f"Precision@3: {precision:.4f}")
print(f"Confusion matrix:\n{cm}")
print(f"Prediction probability distribution: mean={np.mean(prob_dist):.4f}, std={np.std(prob_dist):.4f}, min={np.min(prob_dist):.4f}, max={np.max(prob_dist):.4f}")

In [None]:
# This code is using Decision Tree as the classifier
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV

# File paths
file_name_test = 'testTrack_hierarchy.txt'
file_name_train = 'trainIdx2_matrix.txt'
ground_truth_file = 'test2_new.txt'
output_file = 'submission_dt_tuned_v1.csv'

# Load ground truth
# Each line is "userID|trackID|label"; labels are stored under "<userID>_<trackID>".
print("Loading ground truth...")
ground_truth = {}
with open(ground_truth_file, 'r') as gt_handle:
    for gt_line in gt_handle:
        userID, trackID, label = gt_line.strip().split('|')
        ground_truth[f"{userID}_{trackID}"] = int(label)

# Distinct users / tracks, recovered from the two halves of each key.
gt_users = {key.split('_')[0] for key in ground_truth}
gt_tracks = {key.split('_')[1] for key in ground_truth}
print(f"Loaded {len(ground_truth)} ground truth labels ({sum(1 for v in ground_truth.values() if v == 1)} positive, {sum(1 for v in ground_truth.values() if v == 0)} negative).")
print(f"Unique users in ground truth: {len(gt_users)}")

# Load test hierarchy
# Lines are '|'-separated: userID, trackID, albumID, artistID, then genre IDs.
# The literal string "None" marks a missing album or artist.
print("Loading test hierarchy for trackIDs...")
test_tracks = {}
with open(file_name_test, 'r') as fTest:
    for raw_line in fTest:
        fields = raw_line.strip().split('|')
        userID, trackID, album_field = fields[0], fields[1], fields[2]

        albumID = None if album_field == "None" else album_field
        artistID = None
        if len(fields) > 3 and fields[3] != "None":
            artistID = fields[3]
        # Everything after the artist column is a genre ID (possibly none).
        genreIDs = fields[4:]

        test_tracks[f"{userID}_{trackID}"] = dict(
            trackID=trackID,
            albumID=albumID,
            artistID=artistID,
            genreIDs=genreIDs,
        )

test_track_ids = {meta['trackID'] for meta in test_tracks.values()}
print(f"Loaded {len(test_tracks)} user-track pairs.")
print(f"Ground truth tracks in test hierarchy: {len(gt_tracks & test_track_ids)}/{len(gt_tracks)}")

# Load training data
# Each line is "userID|itemID|score". Ratings are kept per user (item -> score),
# as a flat list per user, globally, and per artist item.
print("Reading training data...")
train_data = {}
user_ratings = {}
all_scores = []
artist_global_scores = {}

with open(file_name_train, 'r') as fTrain:
    for rating_line in fTrain:
        userID, itemID, score = rating_line.strip().split('|')
        score = int(score)
        train_data.setdefault(userID, {})[itemID] = score
        all_scores.append(score)
        user_ratings.setdefault(userID, []).append(score)
        if itemID.startswith('artist_'):
            artist_global_scores.setdefault(itemID, []).append(score)

global_avg_score = np.mean(all_scores) if all_scores else 0

# Per-user aggregates, built in one pass over the collected ratings.
user_avg_scores = {}
user_rating_count = {}
user_rating_variance = {}
for rated_user, rated_scores in user_ratings.items():
    user_avg_scores[rated_user] = np.mean(rated_scores) if rated_scores else global_avg_score
    # Stored under the "variance" name, but computed with np.std.
    user_rating_variance[rated_user] = np.std(rated_scores) if rated_scores else 0
    # Log-transform and cap user_rating_count
    user_rating_count[rated_user] = min(np.log1p(len(rated_scores)), 5.0)

# Global artist averages for cold-start
artist_global_avg = {}
for artist_item, artist_scores_seen in artist_global_scores.items():
    artist_global_avg[artist_item] = np.mean(artist_scores_seen) if artist_scores_seen else global_avg_score

print(f"Training data loaded. Global average score: {global_avg_score:.2f}")
print(f"User variance mean: {np.mean(list(user_rating_variance.values())):.2f}, std: {np.std(list(user_rating_variance.values())):.2f}")
print(f"User rating count mean: {np.mean(list(user_rating_count.values())):.2f}, std: {np.std(list(user_rating_count.values())):.2f}")

# For every ground-truth user, how many ground-truth track IDs appear among
# that user's rated items.
user_rating_coverage = {}
for gt_user in gt_users:
    rated_items = train_data.get(gt_user, {})
    user_rating_coverage[gt_user] = sum(1 for tid in gt_tracks if tid in rated_items)
print(f"Users with ratings for ground truth tracks: {sum(1 for v in user_rating_coverage.values() if v > 0)}/{len(gt_users)}")

# Debug track IDs
# Collect up to five item IDs from each of the first five training users so the
# ID formats can be compared by eye with the ground-truth track IDs.
sample_train_items = [
    item_id
    for sampled_user in list(train_data.keys())[:5]
    for item_id in list(train_data[sampled_user].keys())[:5]
]
print(f"Sample training item IDs: {sample_train_items[:10]}")
print(f"Sample ground truth track IDs: {list(gt_tracks)[:5]}")

# Prepare training data
# Builds one 8-feature row per ground-truth (user, track) pair, plus the
# diagnostic tallies that are printed after the loop.
print("Preparing training data for DT...")
X_train = []
y_train = []
users_processed = set()
feature_stats = {
    'artist': [], 'genre': [], 'album_score_missing': [], 'artist_score_missing': [],
    'genre_score_missing': [], 'artist_score_weighted': [], 'user_variance': [], 'user_rating_count': []
}
raw_counts = {'track': 0, 'album': 0, 'artist': 0, 'genre': 0}
label_ratings = {
    'positive': {'artist': [], 'genre': [], 'artist_score_weighted': []},
    'negative': {'artist': [], 'genre': [], 'artist_score_weighted': []}
}
track_label_counts = {tid: {'pos': 0, 'neg': 0} for tid in gt_tracks}
track_rating_debug = []

# Names of the feature columns, in the order they are placed in each row.
feature_order = (
    'artist', 'genre', 'album_score_missing', 'artist_score_missing',
    'genre_score_missing', 'artist_score_weighted', 'user_variance', 'user_rating_count',
)

# Index the ground-truth keys by user in a single pass. Keys are
# "<userID>_<trackID>", so the text before the first '_' is the user; this
# replaces a full scan of every key for every user.
gt_keys_by_user = {}
for gt_key in ground_truth:
    gt_keys_by_user.setdefault(gt_key.split('_')[0], []).append(gt_key)

for userID in gt_users:
    u_ratings = train_data.get(userID, {})
    user_track_keys = gt_keys_by_user.get(userID, [])
    if not user_track_keys:
        print(f"Warning: User {userID} has no tracks in ground truth. Skipping.")
        continue

    # Per-user inputs are constant across this user's tracks.
    user_variance = user_rating_variance.get(userID, 0)
    user_rcount = user_rating_count.get(userID, 0)
    default_score = user_avg_scores.get(userID, global_avg_score)

    for track_key in user_track_keys:
        trackID = track_key.split('_')[1]
        label = ground_truth[track_key]
        track_label_counts[trackID]['pos' if label == 1 else 'neg'] += 1

        # Extract features; pairs missing from the test hierarchy get empty metadata.
        t_data = test_tracks.get(track_key, {'trackID': trackID, 'albumID': None, 'artistID': None, 'genreIDs': []})
        # Try raw trackID and prefixed versions
        raw_track_score = u_ratings.get(t_data['trackID'], u_ratings.get(f"track_{t_data['trackID']}", 0))
        raw_album_score = u_ratings.get(t_data['albumID'], 0) if t_data['albumID'] else 0
        raw_artist_score = u_ratings.get(t_data['artistID'], 0) if t_data['artistID'] else 0
        genre_scores = [u_ratings.get(gid, 0) for gid in t_data['genreIDs']]
        genre_rated = [s for s in genre_scores if s > 0]
        raw_genre_score = np.mean(genre_rated) if genre_rated else 0

        # Debug track ratings: count how often each raw score is present.
        track_rating_debug.append(raw_track_score)
        for level, raw_score in (
            ('track', raw_track_score),
            ('album', raw_album_score),
            ('artist', raw_artist_score),
            ('genre', raw_genre_score),
        ):
            if raw_score > 0:
                raw_counts[level] += 1

        album_score_missing = 1 if raw_album_score == 0 else 0
        artist_score_missing = 1 if raw_artist_score == 0 else 0
        genre_score_missing = 1 if not genre_rated else 0
        artist_score_weighted = raw_artist_score * (1 - artist_score_missing)
        # Unrated artist/genre fall back to the user's average score.
        artist_score = min(raw_artist_score, 100) if raw_artist_score > 0 else default_score
        genre_score = min(raw_genre_score, 100) if raw_genre_score > 0 else default_score

        features = [
            artist_score,
            genre_score,
            album_score_missing,
            artist_score_missing,
            genre_score_missing,
            artist_score_weighted,
            user_variance,
            user_rcount
        ]
        X_train.append(features)
        y_train.append(label)

        # Track feature statistics and label ratings
        for stat_name, stat_value in zip(feature_order, features):
            feature_stats[stat_name].append(stat_value)
        label_bucket = label_ratings['positive' if label == 1 else 'negative']
        label_bucket['artist'].append(artist_score)
        label_bucket['genre'].append(genre_score)
        label_bucket['artist_score_weighted'].append(artist_score_weighted)

    users_processed.add(userID)

# Freeze the accumulated rows and labels into arrays for scikit-learn.
X_train = np.array(X_train)
y_train = np.array(y_train)
print(f"Prepared {len(X_train)} training samples from {len(users_processed)} users.")

# Coverage report: indicator flags are counted when set; every other feature is
# counted when it is neither 0 nor exactly the global average score.
print("Feature statistics (non-zero counts):")
indicator_features = ('album_score_missing', 'artist_score_missing', 'genre_score_missing')
for fname, fvals in feature_stats.items():
    if fname in indicator_features:
        counted = [v for v in fvals if v != 0]
    else:
        counted = [v for v in fvals if v != 0 and v != global_avg_score]
    non_zero = len(counted)
    print(f"  {fname}: {non_zero}/{len(fvals)} non-zero values")

print("Raw rating counts (before defaulting):")
for fname, count in raw_counts.items():
    print(f"  {fname}: {count}/{len(X_train)} non-zero values")
print(f"Track rating debug: {sum(1 for v in track_rating_debug if v > 0)}/{len(track_rating_debug)} non-zero raw_track_score")

# Mean / spread of selected features, split by label.
reported_features = ('artist', 'genre', 'artist_score_weighted')
for ftype in reported_features:
    print(f"Average {ftype}_score for positive labels: {np.mean(label_ratings['positive'][ftype]):.2f}")
    print(f"Average {ftype}_score for negative labels: {np.mean(label_ratings['negative'][ftype]):.2f}")
    print(f"Std {ftype}_score for positive labels: {np.std(label_ratings['positive'][ftype]):.2f}")
    print(f"Std {ftype}_score for negative labels: {np.std(label_ratings['negative'][ftype]):.2f}")

# A track is "mixed" when it appears with both a positive and a negative label.
mixed_label_tracks = len([
    counts for counts in track_label_counts.values()
    if counts['pos'] > 0 and counts['neg'] > 0
])
print(f"Tracks with mixed labels: {mixed_label_tracks}/{len(gt_tracks)}")

# Feature correlations
feature_names = [
    'artist',
    'genre',
    'album_score_missing',
    'artist_score_missing',
    'genre_score_missing',
    'artist_score_weighted',
    'user_variance',
    'user_rating_count',
]
corr_matrix = np.corrcoef(X_train.T)
print("Feature correlations:")
# Walk the strict upper triangle (row-major) so each feature pair is reported once.
upper_rows, upper_cols = np.triu_indices(len(feature_names), k=1)
for i, j in zip(upper_rows, upper_cols):
    fname = feature_names[i]
    if abs(corr_matrix[i, j]) > 0.5:
        print(f"  {fname} vs {feature_names[j]}: {corr_matrix[i, j]:.2f}")

# Train Decision Tree model with GridSearchCV
print("Training Decision Tree model with hyperparameter tuning...")

# Fit the scaler on the training matrix only; the fitted scaler is reused later
# to transform test rows.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Exhaustive grid evaluated by the search below.
param_grid = dict(
    max_depth=[5, 10, 15, 20],
    min_samples_split=[10, 20, 30],
    min_samples_leaf=[5, 10, 15],
)
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    param_grid,
    cv=5,
    scoring='precision',
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Keep the best configuration found by the search as the final model.
dt_model = grid_search.best_estimator_

# 5-fold cross-validation of the selected model, once per metric.
cv_scores, cv_precision_scores = (
    cross_val_score(dt_model, X_train_scaled, y_train, cv=5, scoring=metric)
    for metric in ('accuracy', 'precision')
)

print("Model training completed.")
print(f"Best hyperparameters: {grid_search.best_params_}")
print("Feature importances:", dict(zip(feature_names, dt_model.feature_importances_)))
print("Feature means after scaling:", np.mean(X_train_scaled, axis=0))
print("Feature stds after scaling:", np.std(X_train_scaled, axis=0))
print(f"Cross-validation accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
print(f"Cross-validation Precision@3: {np.mean(cv_precision_scores):.4f} ± {np.std(cv_precision_scores):.4f}")

# Output predictions
# Open the submission CSV and write its header row.
# NOTE(review): the file is opened without a `with` block, so it stays open if the
# scoring loop below raises — confirm it is closed on every path.
fOut = open(output_file, 'w')
fOut.write("TrackID,Predictor\n")
# "<userID>_<trackID>" keys already written, so each pair is emitted only once.
written_keys = set()
# Number of test users that have no ratings in the training data.
unseen_users = 0
# Number of positive predictions produced by the fallback (non-model) rankings.
cold_start_preds = 0

print("Processing test data...")
with open(file_name_test, 'r') as fTest:
    lastUserID = None
    user_tracks = []
    user_metadata = []

    for line in fTest:
        parts = line.strip().split('|')
        userID = parts[0]
        trackID = parts[1]
        albumID = parts[2] if parts[2] != "None" else None
        artistID = parts[3] if len(parts) > 3 and parts[3] != "None" else None
        genreIDs = parts[4:] if len(parts) > 4 else []

        if userID != lastUserID and lastUserID is not None and user_tracks:
            u_ratings = train_data.get(lastUserID, {})
            default_score = user_avg_scores.get(lastUserID, global_avg_score)
            if lastUserID not in train_data or not u_ratings:
                unseen_users += 1
                # Fallback to weighted global artist_score
                artist_scores = []
                for i, t_data in enumerate(user_metadata):
                    artist_score = artist_global_avg.get(t_data['artistID'], global_avg_score) if t_data['artistID'] else global_avg_score
                    weight = 1.2 if artist_score > global_avg_score else 0.8
                    artist_scores.append(artist_score * weight)
                top3 = np.argsort(-np.array(artist_scores))[:3]
                preds = [1 if i in top3 else 0 for i in range(len(user_tracks))]
                cold_start_preds += sum(preds)
            else:
                X_test = []
                for i, t_data in enumerate(user_metadata):
                    tID = t_data['trackID']
                    raw_track_score = u_ratings.get(tID, u_ratings.get(f"track_{tID}", 0))
                    raw_album_score = u_ratings.get(t_data['albumID'], 0) if t_data['albumID'] else 0
                    raw_artist_score = u_ratings.get(t_data['artistID'], 0) if t_data['artistID'] else 0
                    genre_scores = [u_ratings.get(gid, 0) for gid in t_data['genreIDs']]
                    genre_rated = [s for s in genre_scores if s > 0]
                    raw_genre_score = np.mean(genre_rated) if genre_rated else 0

                    album_score_missing = 1 if raw_album_score == 0 else 0
                    artist_score_missing = 1 if raw_artist_score == 0 else 0
                    genre_score_missing = 1 if not genre_rated else 0
                    artist_score_weighted = raw_artist_score * (1 - artist_score_missing)
                    user_variance = user_rating_variance.get(lastUserID, 0)
                    user_rcount = user_rating_count.get(lastUserID, 0)
                    artist_score = min(raw_artist_score, 100) if raw_artist_score > 0 else default_score
                    genre_score = min(raw_genre_score, 100) if raw_genre_score > 0 else default_score

                    features = [
                        artist_score,
                        genre_score,
                        album_score_missing,
                        artist_score_missing,
                        genre_score_missing,
                        artist_score_weighted,
                        user_variance,
                        user_rcount
                    ]
                    X_test.append(features)

                X_test = np.array(X_test)
                X_test_scaled = scaler.transform(X_test)
                probs = dt_model.predict_proba(X_test_scaled)[:, 1]
                prob_diff = np.max(probs) - np.min(probs)
                top3 = np.argsort(-probs)[:3 if prob_diff > 0.25 else 2]
                preds = [1 if i in top3 else 0 for i in range(len(user_tracks))]
                if prob_diff < 0.1:  # Fallback for low confidence
                    artist_scores = [X_test[i][0] * (1 - X_test[i][3]) for i in range(len(X_test))]
                    top3 = np.argsort(-np.array(artist_scores))[:3]
                    preds = [1 if i in top3 else 0 for i in range(len(user_tracks))]
                    cold_start_preds += sum(preds)

            for i, t_data in enumerate(user_metadata):
                t_key = f"{lastUserID}_{t_data['trackID']}"
                if t_key not in written_keys:
                    fOut.write(f"{t_key},{preds[i]}\n")
                    written_keys.add(t_key)

            user_tracks = []
            user_metadata = []

        user_tracks.append(trackID)
        user_metadata.append({'trackID': trackID, 'albumID': albumID, 'artistID': artistID, 'genreIDs': genreIDs})
        lastUserID = userID

    # Handle last user block
    if user_tracks:
        u_ratings = train_data.get(lastUserID, {})
        default_score = user_avg_scores.get(lastUserID, global_avg_score)
        if lastUserID not in train_data or not u_ratings:
            unseen_users += 1
            artist_scores = []
            for i, t_data in enumerate(user_metadata):
                artist_score = artist_global_avg.get(t_data['artistID'], global_avg_score) if t_data['artistID'] else global_avg_score
                weight = 1.2 if artist_score > global_avg_score else 0.8
                artist_scores.append(artist_score * weight)
            top3 = np.argsort(-np.array(artist_scores))[:3]
            preds = [1 if i in top3 else 0 for i in range(len(user_tracks))]
            cold_start_preds += sum(preds)
        else:
            X_test = []
            for i, t_data in enumerate(user_metadata):
                tID = t_data['trackID']
                raw_track_score = u_ratings.get(tID, u_ratings.get(f"track_{tID}", 0))
                raw_album_score = u_ratings.get(t_data['albumID'], 0) if t_data['albumID'] else 0
                raw_artist_score = u_ratings.get(t_data['artistID'], 0) if t_data['artistID'] else 0
                genre_scores = [u_ratings.get(gid, 0) for gid in t_data['genreIDs']]
                genre_rated = [s for s in genre_scores if s > 0]
                raw_genre_score = np.mean(genre_rated) if genre_rated else 0

                album_score_missing = 1 if raw_album_score == 0 else 0
                artist_score_missing = 1 if raw_artist_score == 0 else 0
                genre_score_missing = 1 if not genre_rated else 0
                artist_score_weighted = raw_artist_score * (1 - artist_score_missing)
                user_variance = user_rating_variance.get(lastUserID, 0)
                user_rcount = user_rating_count.get(lastUserID, 0)
                artist_score = min(raw_artist_score, 100) if raw_artist_score > 0 else default_score
                genre_score = min(raw_genre_score, 100) if raw_genre_score > 0 else default_score

                features = [
                    artist_score,
                    genre_score,
                    album_score_missing,
                    artist_score_missing,
                    genre_score_missing,
                    artist_score_weighted,
                    user_variance,
                    user_rcount
                ]
                X_test.append(features)

            X_test = np.array(X_test)
            X_test_scaled = scaler.transform(X_test)
            probs = dt_model.predict_proba(X_test_scaled)[:, 1]
            prob_diff = np.max(probs) - np.min(probs)
            top3 = np.argsort(-probs)[:3 if prob_diff > 0.25 else 2]
            preds = [1 if i in top3 else 0 for i in range(len(user_tracks))]
            if prob_diff < 0.1:
                artist_scores = [X_test[i][0] * (1 - X_test[i][3]) for i in range(len(X_test))]
                top3 = np.argsort(-np.array(artist_scores))[:3]
                preds = [1 if i in top3 else 0 for i in range(len(user_tracks))]
                cold_start_preds += sum(preds)

        for i, t_data in enumerate(user_metadata):
            t_key = f"{lastUserID}_{t_data['trackID']}"
            if t_key not in written_keys:
                fOut.write(f"{t_key},{preds[i]}\n")
                written_keys.add(t_key)

fOut.close()
print(f"Submission file '{output_file}' written with {len(written_keys)} predictions.")
print(f"Unseen users in test set: {unseen_users}")
print(f"Cold-start predictions (positive): {cold_start_preds}")

# Evaluate on ground truth
# Re-scores every labelled (user, track) pair with the fitted scaler/model and
# compares the per-user top-k picks against the ground-truth labels.
print("Evaluating on ground truth...")
y_true = []
y_pred = []
eval_users = set()
prob_dist = []

# Group the ground-truth keys by user in one pass instead of rescanning every
# key for every user. Keys are "<userID>_<trackID>" and dict order is kept, so
# each per-user list equals what a startswith(userID + '_') scan would return.
gt_keys_by_user = {}
for gt_key in ground_truth:
    gt_keys_by_user.setdefault(gt_key.split('_')[0], []).append(gt_key)

for userID in gt_users:
    user_track_keys = gt_keys_by_user.get(userID, [])
    if not user_track_keys:
        print(f"Warning: User {userID} has no tracks in ground truth. Skipping.")
        continue

    u_ratings = train_data.get(userID, {})
    default_score = user_avg_scores.get(userID, global_avg_score)
    # Per-user statistics do not depend on the track, so look them up once.
    user_variance = user_rating_variance.get(userID, 0)
    user_rcount = user_rating_count.get(userID, 0)
    X_test = []

    for track_key in user_track_keys:
        # Pairs absent from the test hierarchy get a metadata-free record.
        t_data = test_tracks.get(track_key, {'trackID': track_key.split('_')[1], 'albumID': None, 'artistID': None, 'genreIDs': []})
        raw_album_score = u_ratings.get(t_data['albumID'], 0) if t_data['albumID'] else 0
        raw_artist_score = u_ratings.get(t_data['artistID'], 0) if t_data['artistID'] else 0
        genre_scores = [u_ratings.get(gid, 0) for gid in t_data['genreIDs']]
        genre_rated = [s for s in genre_scores if s > 0]
        raw_genre_score = np.mean(genre_rated) if genre_rated else 0

        # A rating of 0 is treated as "not rated" throughout.
        album_score_missing = 1 if raw_album_score == 0 else 0
        artist_score_missing = 1 if raw_artist_score == 0 else 0
        genre_score_missing = 1 if not genre_rated else 0
        artist_score_weighted = raw_artist_score * (1 - artist_score_missing)
        artist_score = min(raw_artist_score, 100) if raw_artist_score > 0 else default_score
        genre_score = min(raw_genre_score, 100) if raw_genre_score > 0 else default_score

        # Column order must match the order the scaler/model were fitted on.
        X_test.append([
            artist_score,
            genre_score,
            album_score_missing,
            artist_score_missing,
            genre_score_missing,
            artist_score_weighted,
            user_variance,
            user_rcount
        ])

    X_test = np.array(X_test)
    X_test_scaled = scaler.transform(X_test)
    probs = dt_model.predict_proba(X_test_scaled)[:, 1]
    prob_diff = np.max(probs) - np.min(probs)
    # Pick three tracks when the model separates them clearly, otherwise two.
    top3 = set(np.argsort(-probs)[:3 if prob_diff > 0.25 else 2].tolist())
    preds = [1 if i in top3 else 0 for i in range(len(user_track_keys))]
    if prob_diff < 0.1:
        # Low-confidence fallback: rank by the artist score, zeroed where the
        # artist rating was missing (column 0 masked by column 3).
        artist_scores = X_test[:, 0] * (1 - X_test[:, 3])
        top3 = set(np.argsort(-artist_scores)[:3].tolist())
        preds = [1 if i in top3 else 0 for i in range(len(user_track_keys))]
        # NOTE(review): this counter was already printed above, so this
        # increment only affects later inspection of the variable.
        cold_start_preds += sum(preds)

    y_pred.extend(preds)
    y_true.extend([ground_truth[k] for k in user_track_keys])
    prob_dist.extend(probs)
    eval_users.add(userID)

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='binary')
cm = confusion_matrix(y_true, y_pred)
print(f"Ground truth accuracy: {accuracy:.4f} over {len(y_true)} samples from {len(eval_users)} users.")
print(f"Precision@3: {precision:.4f}")
print(f"Confusion matrix:\n{cm}")
print(f"Prediction probability distribution: mean={np.mean(prob_dist):.4f}, std={np.std(prob_dist):.4f}, min={np.min(prob_dist):.4f}, max={np.max(prob_dist):.4f}")

In [None]:
# This cell trains an XGBoost gradient-boosting classifier, tuned with a randomized hyperparameter search
import os
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    RandomizedSearchCV
)

# File paths
file_name_test = 'testTrack_hierarchy.txt'
file_name_train = 'trainIdx2_matrix.txt'
ground_truth_file = 'test2_new.txt'
output_file = 'submission_gb_v5.csv'

# 1. Load ground truth
# Each "user|track|label" line becomes ground_truth["user_track"] = label.
print("Loading ground truth...")
ground_truth = {}
with open(ground_truth_file, 'r') as gt_file:
    for record in gt_file:
        u, t, label = record.strip().split('|')
        ground_truth[f"{u}_{t}"] = int(label)
gt_users = {pair.split('_')[0] for pair in ground_truth}
gt_tracks = {pair.split('_')[1] for pair in ground_truth}
print(f"Loaded {len(ground_truth)} labels ({sum(ground_truth.values())} positives).")

# 2. Load test hierarchy
# Columns are user|track|album|artist|genre...; the literal string "None"
# marks a missing album or artist.
print("Loading test hierarchy...")
test_tracks = {}
with open(file_name_test, 'r') as hierarchy_file:
    for record in hierarchy_file:
        fields = record.strip().split('|')
        u, t = fields[0], fields[1]
        alb = None if fields[2] == "None" else fields[2]
        if len(fields) > 3 and fields[3] != "None":
            art = fields[3]
        else:
            art = None
        gens = fields[4:]  # slicing yields [] when there are no genre columns
        test_tracks[f"{u}_{t}"] = {'trackID': t, 'albumID': alb, 'artistID': art, 'genreIDs': gens}
print(f"Loaded {len(test_tracks)} test pairs.")

# 3. Load training data
# Each "user|item|score" line feeds the per-user tables; items whose ID starts
# with 'artist_' or 'genre_' also feed the per-artist / per-genre tables.
print("Reading training data...")
train_data = {}
user_ratings = {}
artist_scores = {}
genre_scores = {}
user_genre_rats = {}
all_scores = []

with open(file_name_train, 'r') as train_file:
    for record in train_file:
        user, item, raw_score = record.strip().split('|')
        score = int(raw_score)
        train_data.setdefault(user, {})[item] = score
        all_scores.append(score)
        user_ratings.setdefault(user, []).append(score)
        if item.startswith('artist_'):
            artist_scores.setdefault(item, []).append(score)
        elif item.startswith('genre_'):
            genre_scores.setdefault(item, []).append(score)
            user_genre_rats.setdefault(user, []).append(score)

# 4. Precompute stats
if all_scores:
    global_avg = np.mean(all_scores)
else:
    global_avg = 0.0

user_avg, user_var, user_cnt = {}, {}, {}
for user, vals in user_ratings.items():
    user_avg[user] = np.mean(vals)
    user_var[user] = np.std(vals)  # standard deviation, despite the name
    user_cnt[user] = np.log1p(len(vals))

artist_avg = {artist: np.mean(vals) for artist, vals in artist_scores.items()}
genre_avg = {genre: np.mean(vals) for genre, vals in genre_scores.items()}
user_genre_aff = {user: np.mean(vals) for user, vals in user_genre_rats.items()}

# 5. Feature engineering helper
def compute_features(u, td):
    """Build the feature row for one (user, track) candidate.

    Reads the module-level tables train_data, user_avg, user_var, user_cnt,
    artist_avg, user_genre_aff and the scalar global_avg.

    Args:
        u: user ID, used as the key into the per-user tables.
        td: dict with keys 'trackID', 'albumID', 'artistID' and 'genreIDs'.

    Returns:
        A list of 15 values, in this order: artist score, genre score,
        artist-missing flag, genre-missing flag, raw artist rating,
        user_var entry, user_cnt entry, number of genres, artist_avg entry,
        coverage-weighted genre score, user_genre_aff entry, album score,
        track score, has-album flag, has-artist flag.
    """
    ratings = train_data.get(u, {})
    fallback = user_avg.get(u, global_avg)
    album_id = td['albumID']
    artist_id = td['artistID']
    genre_ids = td['genreIDs']

    # Track rating: try the bare track ID first, then the "track_" form.
    if td['trackID'] in ratings:
        track_rating = ratings[td['trackID']]
    else:
        track_rating = ratings.get(f"track_{td['trackID']}", 0)
    album_rating = ratings.get(album_id, 0) if album_id else 0
    artist_rating = ratings.get(artist_id, 0) if artist_id else 0

    # Only genres the user rated above zero count toward the genre score.
    rated_genres = [r for r in (ratings.get(g, 0) for g in genre_ids) if r > 0]
    genre_rating = np.mean(rated_genres) if rated_genres else 0
    genre_count = len(genre_ids)
    coverage = len(rated_genres) / genre_count if genre_count else 0

    # NOTE(review): the user's artist rating is looked up under the bare ID,
    # while artist_avg is looked up under "artist_<ID>" -- confirm both key
    # forms actually occur in the training file.
    if artist_id:
        artist_popularity = artist_avg.get(f"artist_{td['artistID']}", global_avg)
    else:
        artist_popularity = global_avg

    artist_missing = 1 if artist_rating == 0 else 0
    genre_missing = 0 if rated_genres else 1

    # Ratings are capped at 100; unrated entities fall back to the user's
    # average (or the global average for users without one).
    artist_feature = min(artist_rating, 100) if artist_rating > 0 else fallback
    genre_feature = min(genre_rating, 100) if genre_rating > 0 else fallback
    album_feature = min(album_rating, 100) if album_rating > 0 else fallback
    track_feature = min(track_rating, 100) if track_rating > 0 else fallback

    return [
        artist_feature,
        genre_feature,
        artist_missing,
        genre_missing,
        artist_rating * (1 - artist_missing),
        user_var.get(u, 0.0),
        user_cnt.get(u, 0.0),
        genre_count,
        artist_popularity,
        genre_rating * coverage,
        user_genre_aff.get(u, global_avg),
        album_feature,
        track_feature,
        1 if album_id else 0,
        1 if artist_id else 0,
    ]

# 6. Build train matrix
# One feature row per labelled (user, track) pair; pairs missing from the
# test hierarchy fall back to a metadata-free record.
print("Building training matrix...")
X, y = [], []
for key, lbl in ground_truth.items():
    u, t = key.split('_')
    td = test_tracks.get(key, {'trackID':t,'albumID':None,'artistID':None,'genreIDs':[]})
    X.append(compute_features(u, td))
    y.append(lbl)
X = np.array(X)
y = np.array(y)

# 7. Split + scale
# The scaler is fitted on the training split only and reused for validation.
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)
scaler  = RobustScaler()
X_tr_s  = scaler.fit_transform(X_tr)
X_val_s = scaler.transform(X_val)

# 8. Hyperparameter tuning (no early stopping here)
print("Starting hyperparameter tuning...")
param_dist = {
    'n_estimators':    [100,200,300,400],
    'max_depth':       [3,5,7,10],
    'learning_rate':   [0.01,0.05,0.1,0.2],
    'subsample':       [0.6,0.8,1.0],
    'colsample_bytree':[0.6,0.8,1.0],
    'scale_pos_weight':[1,2,3]
}
cv     = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# `use_label_encoder` is dropped: the installed XGBoost ignores it and prints
# a "Parameters: { "use_label_encoder" } are not used." warning on every fit.
xgb    = XGBClassifier(eval_metric='logloss', random_state=42)
search = RandomizedSearchCV(
    xgb, param_dist, n_iter=30, scoring='accuracy',
    cv=cv, verbose=1, n_jobs=1, random_state=42
)
search.fit(X_tr_s, y_tr)
best_params = search.best_params_
print("Best hyperparameters:", best_params)

# 9. Refit on training split and evaluate on validation
best_model = XGBClassifier(**best_params, eval_metric='logloss', random_state=42)
best_model.fit(X_tr_s, y_tr)   # simple fit, no early stopping

# Evaluate on held-out validation set
y_pred_val = best_model.predict(X_val_s)
print("Validation Accuracy :", accuracy_score(y_val, y_pred_val))
print("Validation Precision:", precision_score(y_val, y_pred_val))


# 10. Retrain on all data
# The scaler is refitted on the full matrix so that the submission features
# are scaled consistently with the final model.
print("Retraining on full data...")
X_all_s = scaler.fit_transform(X)
best_model.fit(X_all_s, y, verbose=False)


# 11. Generate submission
def _write_user_block(fout, user_id, keys):
    """Score one user's candidate tracks and write one "key,0/1" row each.

    The three tracks with the highest predicted probability get 1, the rest
    get 0. Does nothing when `keys` is empty.
    """
    if not keys:
        return
    Xb = np.array([compute_features(user_id, test_tracks[k]) for k in keys])
    probs = best_model.predict_proba(scaler.transform(Xb))[:, 1]
    top3 = set(np.argsort(-probs)[:3].tolist())
    for idx, k in enumerate(keys):
        fout.write(f"{k},{1 if idx in top3 else 0}\n")


print("Generating submission...")
# Both files are opened in the `with` so the test file is closed as well; the
# original iterated over a bare open() and never closed it.
with open(output_file, 'w') as fout, open(file_name_test, 'r') as ftest:
    fout.write("TrackID,Predictor\n")
    last_u = None
    block  = []
    for line in ftest:
        parts = line.strip().split('|')
        u, t = parts[0], parts[1]
        if u != last_u:
            # A new user starts: flush the previous user's rows (the very
            # first flush sees an empty block and is a no-op).
            _write_user_block(fout, last_u, block)
            block = []
            last_u = u
        block.append(f"{u}_{t}")
    # last user
    _write_user_block(fout, last_u, block)

print(f"Submission '{output_file}' written.")

Loading ground truth...
Loaded 6000 labels (3000 positives).
Loading test hierarchy...
Loaded 120000 test pairs.
Reading training data...
Building training matrix...
Starting hyperparameter tuning...
Fitting 5 folds for each of 30 candidates, totalling 150 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best hyperparameters: {'subsample': 1.0, 'scale_pos_weight': 1, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.01, 'colsample_bytree': 1.0}
Validation Accuracy : 0.8455555555555555
Validation Precision: 0.8956743002544529
Retraining on full data...


Parameters: { "use_label_encoder" } are not used.



Generating submission...
Submission 'submission_gb_v5.csv' written.


In [None]:
import os
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    RandomizedSearchCV
)

# File paths
file_name_test = 'testTrack_hierarchy.txt'
file_name_train = 'trainIdx2_matrix.txt'
ground_truth_file = 'test2_new.txt'
output_file = 'submission_lr_v5.csv'

# 1. Load ground truth
# Each "user|track|label" line becomes ground_truth["user_track"] = label.
print("Loading ground truth...")
ground_truth = {}
with open(ground_truth_file, 'r') as gt_file:
    for record in gt_file:
        u, t, label = record.strip().split('|')
        ground_truth[f"{u}_{t}"] = int(label)
gt_users = {pair.split('_')[0] for pair in ground_truth}
gt_tracks = {pair.split('_')[1] for pair in ground_truth}
print(f"Loaded {len(ground_truth)} labels ({sum(ground_truth.values())} positives).")

# 2. Load test hierarchy
# Columns are user|track|album|artist|genre...; the literal string "None"
# marks a missing album or artist.
print("Loading test hierarchy...")
test_tracks = {}
with open(file_name_test, 'r') as hierarchy_file:
    for record in hierarchy_file:
        fields = record.strip().split('|')
        u, t = fields[0], fields[1]
        alb = None if fields[2] == "None" else fields[2]
        if len(fields) > 3 and fields[3] != "None":
            art = fields[3]
        else:
            art = None
        gens = fields[4:]  # slicing yields [] when there are no genre columns
        test_tracks[f"{u}_{t}"] = {'trackID': t, 'albumID': alb, 'artistID': art, 'genreIDs': gens}
print(f"Loaded {len(test_tracks)} test pairs.")

# 3. Load training data
# Each "user|item|score" line feeds the per-user tables; items whose ID starts
# with 'artist_' or 'genre_' also feed the per-artist / per-genre tables.
print("Reading training data...")
train_data = {}
user_ratings = {}
artist_scores = {}
genre_scores = {}
user_genre_rats = {}
all_scores = []

with open(file_name_train, 'r') as train_file:
    for record in train_file:
        user, item, raw_score = record.strip().split('|')
        score = int(raw_score)
        train_data.setdefault(user, {})[item] = score
        all_scores.append(score)
        user_ratings.setdefault(user, []).append(score)
        if item.startswith('artist_'):
            artist_scores.setdefault(item, []).append(score)
        elif item.startswith('genre_'):
            genre_scores.setdefault(item, []).append(score)
            user_genre_rats.setdefault(user, []).append(score)

# 4. Precompute stats
if all_scores:
    global_avg = np.mean(all_scores)
else:
    global_avg = 0.0

user_avg, user_var, user_cnt = {}, {}, {}
for user, vals in user_ratings.items():
    user_avg[user] = np.mean(vals)
    user_var[user] = np.std(vals)  # standard deviation, despite the name
    user_cnt[user] = np.log1p(len(vals))

artist_avg = {artist: np.mean(vals) for artist, vals in artist_scores.items()}
genre_avg = {genre: np.mean(vals) for genre, vals in genre_scores.items()}
user_genre_aff = {user: np.mean(vals) for user, vals in user_genre_rats.items()}

# 5. Feature engineering helper
def compute_features(u, td):
    """Build the feature row for one (user, track) candidate.

    Reads the module-level tables train_data, user_avg, user_var, user_cnt,
    artist_avg, user_genre_aff and the scalar global_avg.

    Args:
        u: user ID, used as the key into the per-user tables.
        td: dict with keys 'trackID', 'albumID', 'artistID' and 'genreIDs'.

    Returns:
        A list of 15 values, in this order: artist score, genre score,
        artist-missing flag, genre-missing flag, raw artist rating,
        user_var entry, user_cnt entry, number of genres, artist_avg entry,
        coverage-weighted genre score, user_genre_aff entry, album score,
        track score, has-album flag, has-artist flag.
    """
    ratings = train_data.get(u, {})
    fallback = user_avg.get(u, global_avg)
    album_id = td['albumID']
    artist_id = td['artistID']
    genre_ids = td['genreIDs']

    # Track rating: try the bare track ID first, then the "track_" form.
    if td['trackID'] in ratings:
        track_rating = ratings[td['trackID']]
    else:
        track_rating = ratings.get(f"track_{td['trackID']}", 0)
    album_rating = ratings.get(album_id, 0) if album_id else 0
    artist_rating = ratings.get(artist_id, 0) if artist_id else 0

    # Only genres the user rated above zero count toward the genre score.
    rated_genres = [r for r in (ratings.get(g, 0) for g in genre_ids) if r > 0]
    genre_rating = np.mean(rated_genres) if rated_genres else 0
    genre_count = len(genre_ids)
    coverage = len(rated_genres) / genre_count if genre_count else 0

    # NOTE(review): the user's artist rating is looked up under the bare ID,
    # while artist_avg is looked up under "artist_<ID>" -- confirm both key
    # forms actually occur in the training file.
    if artist_id:
        artist_popularity = artist_avg.get(f"artist_{td['artistID']}", global_avg)
    else:
        artist_popularity = global_avg

    artist_missing = 1 if artist_rating == 0 else 0
    genre_missing = 0 if rated_genres else 1

    # Ratings are capped at 100; unrated entities fall back to the user's
    # average (or the global average for users without one).
    artist_feature = min(artist_rating, 100) if artist_rating > 0 else fallback
    genre_feature = min(genre_rating, 100) if genre_rating > 0 else fallback
    album_feature = min(album_rating, 100) if album_rating > 0 else fallback
    track_feature = min(track_rating, 100) if track_rating > 0 else fallback

    return [
        artist_feature,
        genre_feature,
        artist_missing,
        genre_missing,
        artist_rating * (1 - artist_missing),
        user_var.get(u, 0.0),
        user_cnt.get(u, 0.0),
        genre_count,
        artist_popularity,
        genre_rating * coverage,
        user_genre_aff.get(u, global_avg),
        album_feature,
        track_feature,
        1 if album_id else 0,
        1 if artist_id else 0,
    ]

# 6. Build train matrix
# One feature row per labelled (user, track) pair; pairs missing from the
# test hierarchy fall back to a metadata-free record.
print("Building training matrix...")
X, y = [], []
for key, lbl in ground_truth.items():
    u, t = key.split('_')
    td = test_tracks.get(key, {'trackID':t,'albumID':None,'artistID':None,'genreIDs':[]})
    X.append(compute_features(u, td))
    y.append(lbl)
X = np.array(X)
y = np.array(y)

# 7. Split + scale
# The scaler is fitted on the training split only and reused for validation.
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)
scaler  = RobustScaler()
X_tr_s  = scaler.fit_transform(X_tr)
X_val_s = scaler.transform(X_val)

# 8. Hyperparameter tuning
print("Starting hyperparameter tuning...")
param_dist = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'liblinear', 'saga'],
    'class_weight': [None, 'balanced', {0:1, 1:2}, {0:1, 1:3}]
}
cv     = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lr     = LogisticRegression(random_state=42, max_iter=1000)
search = RandomizedSearchCV(
    lr, param_dist, n_iter=30, scoring='accuracy',
    cv=cv, verbose=1, n_jobs=1, random_state=42
)
search.fit(X_tr_s, y_tr)
best_params = search.best_params_
print("Best hyperparameters:", best_params)

# 9. Refit on training split and evaluate on validation
best_model = LogisticRegression(**best_params, random_state=42, max_iter=1000)
best_model.fit(X_tr_s, y_tr)

# Evaluate on held-out validation set
y_pred_val = best_model.predict(X_val_s)
print("Validation Accuracy :", accuracy_score(y_val, y_pred_val))
print("Validation Precision:", precision_score(y_val, y_pred_val))

# 10. Retrain on all data
# The scaler is refitted on the full matrix so that the submission features
# are scaled consistently with the final model.
print("Retraining on full data...")
X_all_s = scaler.fit_transform(X)
best_model.fit(X_all_s, y)


# 11. Generate submission
def _write_user_block(fout, user_id, keys):
    """Score one user's candidate tracks and write one "key,0/1" row each.

    The three tracks with the highest predicted probability get 1, the rest
    get 0. Does nothing when `keys` is empty.
    """
    if not keys:
        return
    Xb = np.array([compute_features(user_id, test_tracks[k]) for k in keys])
    probs = best_model.predict_proba(scaler.transform(Xb))[:, 1]
    top3 = set(np.argsort(-probs)[:3].tolist())
    for idx, k in enumerate(keys):
        fout.write(f"{k},{1 if idx in top3 else 0}\n")


print("Generating submission...")
# Both files are opened in the `with` so the test file is closed as well; the
# original iterated over a bare open() and never closed it.
with open(output_file, 'w') as fout, open(file_name_test, 'r') as ftest:
    fout.write("TrackID,Predictor\n")
    last_u = None
    block  = []
    for line in ftest:
        parts = line.strip().split('|')
        u, t = parts[0], parts[1]
        if u != last_u:
            # A new user starts: flush the previous user's rows (the very
            # first flush sees an empty block and is a no-op).
            _write_user_block(fout, last_u, block)
            block = []
            last_u = u
        block.append(f"{u}_{t}")
    # last user
    _write_user_block(fout, last_u, block)

print(f"Submission '{output_file}' written.")

Loading ground truth...
Loaded 6000 labels (3000 positives).
Loading test hierarchy...
Loaded 120000 test pairs.
Reading training data...
Building training matrix...
Starting hyperparameter tuning...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best hyperparameters: {'solver': 'saga', 'class_weight': None, 'C': 100}
Validation Accuracy : 0.8466666666666667
Validation Precision: 0.8823529411764706
Retraining on full data...
Generating submission...
Submission 'submission_lr_v5.csv' written.


In [None]:
import os
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    RandomizedSearchCV
)
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline

# File paths
file_name_test = 'testTrack_hierarchy.txt'
file_name_train = 'trainIdx2_matrix.txt'
ground_truth_file = 'test2_new.txt'
output_file = 'submission_lr_v9.csv'

# 1. Load ground truth
# Each "user|track|label" line becomes ground_truth["user_track"] = label.
print("Loading ground truth...")
ground_truth = {}
with open(ground_truth_file, 'r') as gt_file:
    for record in gt_file:
        u, t, label = record.strip().split('|')
        ground_truth[f"{u}_{t}"] = int(label)
gt_users = {pair.split('_')[0] for pair in ground_truth}
gt_tracks = {pair.split('_')[1] for pair in ground_truth}
print(f"Loaded {len(ground_truth)} labels ({sum(ground_truth.values())} positives).")

# 2. Load test hierarchy
# Columns are user|track|album|artist|genre...; the literal string "None"
# marks a missing album or artist.
print("Loading test hierarchy...")
test_tracks = {}
with open(file_name_test, 'r') as hierarchy_file:
    for record in hierarchy_file:
        fields = record.strip().split('|')
        u, t = fields[0], fields[1]
        alb = None if fields[2] == "None" else fields[2]
        if len(fields) > 3 and fields[3] != "None":
            art = fields[3]
        else:
            art = None
        gens = fields[4:]  # slicing yields [] when there are no genre columns
        test_tracks[f"{u}_{t}"] = {'trackID': t, 'albumID': alb, 'artistID': art, 'genreIDs': gens}
print(f"Loaded {len(test_tracks)} test pairs.")

# 3. Load training data
# Each "user|item|score" line feeds the per-user tables; items whose ID starts
# with 'artist_' or 'genre_' also feed the per-artist / per-genre tables.
print("Reading training data...")
train_data = {}
user_ratings = {}
artist_scores = {}
genre_scores = {}
user_genre_rats = {}
all_scores = []
user_artist_counts = {}  # user -> {artist item ID -> number of training lines}

with open(file_name_train, 'r') as train_file:
    for record in train_file:
        user, item, raw_score = record.strip().split('|')
        score = int(raw_score)
        train_data.setdefault(user, {})[item] = score
        all_scores.append(score)
        user_ratings.setdefault(user, []).append(score)
        if item.startswith('artist_'):
            artist_scores.setdefault(item, []).append(score)
            seen_artists = user_artist_counts.setdefault(user, {})
            seen_artists[item] = seen_artists.get(item, 0) + 1
        elif item.startswith('genre_'):
            genre_scores.setdefault(item, []).append(score)
            user_genre_rats.setdefault(user, []).append(score)

# 4. Precompute stats
if all_scores:
    global_avg = np.mean(all_scores)
else:
    global_avg = 0.0

user_avg, user_var, user_cnt = {}, {}, {}
for user, vals in user_ratings.items():
    user_avg[user] = np.mean(vals)
    user_var[user] = np.std(vals)  # standard deviation, despite the name
    user_cnt[user] = np.log1p(len(vals))

artist_avg = {artist: np.mean(vals) for artist, vals in artist_scores.items()}
genre_avg = {genre: np.mean(vals) for genre, vals in genre_scores.items()}
user_genre_aff = {user: np.mean(vals) for user, vals in user_genre_rats.items()}

# 5. Feature engineering helper
def compute_features(u, td):
    """Build the feature row for one (user, track) candidate.

    Reads the module-level tables train_data, user_avg, user_var, user_cnt,
    artist_avg, user_genre_aff, user_artist_counts and the scalar global_avg.

    Args:
        u: user ID, used as the key into the per-user tables.
        td: dict with keys 'trackID', 'albumID', 'artistID' and 'genreIDs'.

    Returns:
        A list of 16 values, in this order: artist score, genre score,
        artist-missing flag, genre-missing flag, raw artist rating,
        user_var entry, user_cnt entry, number of genres, artist_avg entry,
        coverage-weighted genre score, user_genre_aff entry, album score,
        track score, has-album flag, has-artist flag, user-artist count.
    """
    ratings = train_data.get(u, {})
    fallback = user_avg.get(u, global_avg)
    album_id = td['albumID']
    artist_id = td['artistID']
    genre_ids = td['genreIDs']

    # Track rating: try the bare track ID first, then the "track_" form.
    if td['trackID'] in ratings:
        track_rating = ratings[td['trackID']]
    else:
        track_rating = ratings.get(f"track_{td['trackID']}", 0)
    album_rating = ratings.get(album_id, 0) if album_id else 0
    artist_rating = ratings.get(artist_id, 0) if artist_id else 0

    # Only genres the user rated above zero count toward the genre score.
    rated_genres = [r for r in (ratings.get(g, 0) for g in genre_ids) if r > 0]
    genre_rating = np.mean(rated_genres) if rated_genres else 0
    genre_count = len(genre_ids)
    coverage = len(rated_genres) / genre_count if genre_count else 0

    # NOTE(review): the user's artist rating is looked up under the bare ID,
    # while artist_avg and user_artist_counts are looked up under
    # "artist_<ID>" -- confirm both key forms occur in the training file.
    if artist_id:
        artist_popularity = artist_avg.get(f"artist_{td['artistID']}", global_avg)
        artist_count = user_artist_counts.get(u, {}).get(f"artist_{td['artistID']}", 0)
    else:
        artist_popularity = global_avg
        artist_count = 0

    artist_missing = 1 if artist_rating == 0 else 0
    genre_missing = 0 if rated_genres else 1

    # Ratings are capped at 100; unrated entities fall back to the user's
    # average (or the global average for users without one).
    artist_feature = min(artist_rating, 100) if artist_rating > 0 else fallback
    genre_feature = min(genre_rating, 100) if genre_rating > 0 else fallback
    album_feature = min(album_rating, 100) if album_rating > 0 else fallback
    track_feature = min(track_rating, 100) if track_rating > 0 else fallback

    return [
        artist_feature,
        genre_feature,
        artist_missing,
        genre_missing,
        artist_rating * (1 - artist_missing),
        user_var.get(u, 0.0),
        user_cnt.get(u, 0.0),
        genre_count,
        artist_popularity,
        genre_rating * coverage,
        user_genre_aff.get(u, global_avg),
        album_feature,
        track_feature,
        1 if album_id else 0,
        1 if artist_id else 0,
        artist_count,
    ]

# 6. Build train matrix
# One feature row per labelled (user, track) pair; pairs missing from the
# test hierarchy fall back to a metadata-free record.
print("Building training matrix...")
X, y = [], []
for key, lbl in ground_truth.items():
    u, t = key.split('_')
    td = test_tracks.get(key, {'trackID':t,'albumID':None,'artistID':None,'genreIDs':[]})
    X.append(compute_features(u, td))
    y.append(lbl)
X = np.array(X)
y = np.array(y)

# 7. Split + scale
# The scaler is fitted on the training split only and reused for validation.
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)
scaler  = RobustScaler()
X_tr_s  = scaler.fit_transform(X_tr)
X_val_s = scaler.transform(X_val)

# 8. Hyperparameter tuning
# The search covers both the number of features kept by SelectKBest and the
# logistic-regression settings.
print("Starting hyperparameter tuning...")
param_dist = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__solver': ['liblinear'],
    'classifier__class_weight': [None, 'balanced', {0:1, 1:2}, {0:1, 1:3}],
    'select__k': [10, 12, 14]
}
cv     = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
pipeline = Pipeline([
    ('select', SelectKBest(score_func=f_classif)),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])
search = RandomizedSearchCV(
    pipeline, param_dist, n_iter=30, scoring='accuracy',
    cv=cv, verbose=1, n_jobs=1, random_state=42
)
search.fit(X_tr_s, y_tr)
best_params = search.best_params_
print("Best hyperparameters:", best_params)

# 9. Refit on training split and evaluate on validation
# NOTE(review): the refit uses max_iter=2000 while the search used 1000.
best_model = Pipeline([
    ('select', SelectKBest(score_func=f_classif, k=best_params['select__k'])),
    ('classifier', LogisticRegression(
        C=best_params['classifier__C'],
        solver=best_params['classifier__solver'],
        class_weight=best_params['classifier__class_weight'],
        random_state=42,
        max_iter=2000
    ))
])
best_model.fit(X_tr_s, y_tr)

# Evaluate on held-out validation set
y_pred_val = best_model.predict(X_val_s)
val_accuracy = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy :", val_accuracy)
print("Validation Precision:", precision_score(y_val, y_pred_val))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred_val))

# 10. Evaluate on full ground truth
# NOTE: the model is refitted on all labelled rows and then scored on those
# same rows, so the figure printed here is a training-set accuracy.
print("Evaluating on full ground truth...")
X_all_s = scaler.fit_transform(X)
best_model.fit(X_all_s, y)
y_pred_all = best_model.predict(X_all_s)
ground_truth_accuracy = accuracy_score(y, y_pred_all)
print("Ground Truth Accuracy:", ground_truth_accuracy)


# 11. Generate submission
def _write_user_block(fout, user_id, keys):
    """Score one user's candidate tracks and write one "key,0/1" row each.

    The three tracks with the highest predicted probability get 1, the rest
    get 0. Does nothing when `keys` is empty.
    """
    if not keys:
        return
    Xb = np.array([compute_features(user_id, test_tracks[k]) for k in keys])
    probs = best_model.predict_proba(scaler.transform(Xb))[:, 1]
    top3 = set(np.argsort(-probs)[:3].tolist())
    for idx, k in enumerate(keys):
        fout.write(f"{k},{1 if idx in top3 else 0}\n")


print("Generating submission...")
# Both files are opened in the `with` so the test file is closed as well; the
# original iterated over a bare open() and never closed it.
with open(output_file, 'w') as fout, open(file_name_test, 'r') as ftest:
    fout.write("TrackID,Predictor\n")
    last_u = None
    block  = []
    for line in ftest:
        parts = line.strip().split('|')
        u, t = parts[0], parts[1]
        if u != last_u:
            # A new user starts: flush the previous user's rows (the very
            # first flush sees an empty block and is a no-op).
            _write_user_block(fout, last_u, block)
            block = []
            last_u = u
        block.append(f"{u}_{t}")
    # last user
    _write_user_block(fout, last_u, block)

print(f"Submission '{output_file}' written.")

Loading ground truth...
Loaded 6000 labels (3000 positives).
Loading test hierarchy...
Loaded 120000 test pairs.
Reading training data...
Building training matrix...
Starting hyperparameter tuning...
Fitting 5 folds for each of 30 candidates, totalling 150 fits


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = ms

Best hyperparameters: {'select__k': 14, 'classifier__solver': 'liblinear', 'classifier__class_weight': None, 'classifier__C': 100}
Validation Accuracy : 0.8466666666666667
Validation Precision: 0.8823529411764706
Confusion Matrix:
 [[402  48]
 [ 90 360]]
Evaluating on full ground truth...
Ground Truth Accuracy: 0.8508333333333333
Generating submission...
Submission 'submission_lr_v9.csv' written.


In [None]:
import os
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    RandomizedSearchCV
)

# File paths
file_name_test    = 'testTrack_hierarchy.txt'
file_name_train   = 'trainIdx2_matrix.txt'
ground_truth_file = 'test2_new.txt'
output_file       = 'submission_xgb_v10.csv'

# 1. Load ground truth
# Each non-blank line is "user|track|label"; pairs are keyed as "<user>_<track>".
print("Loading ground truth...")
ground_truth = {}
with open(ground_truth_file, 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines instead of failing the 3-way unpack below.
            continue
        user_id, track_id, label = line.split('|')
        ground_truth[f"{user_id}_{track_id}"] = int(label)
gt_users  = {k.split('_')[0] for k in ground_truth}
gt_tracks = {k.split('_')[1] for k in ground_truth}
print(f"Loaded {len(ground_truth)} labels ({sum(ground_truth.values())} positives).")

# 2. Load test hierarchy
# Each non-blank line is "user|track[|album[|artist[|genre...]]]"; the literal
# string "None" in the album/artist column is mapped to Python None.
print("Loading test hierarchy...")
test_tracks = {}
with open(file_name_test, 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        parts = line.split('|')
        u, t = parts[0], parts[1]
        # Guard every optional column the same way so short lines do not raise.
        alb  = parts[2] if len(parts) > 2 and parts[2] != "None" else None
        art  = parts[3] if len(parts) > 3 and parts[3] != "None" else None
        gens = parts[4:] if len(parts) > 4 else []
        test_tracks[f"{u}_{t}"] = {
            'trackID':  t,
            'albumID':  alb,
            'artistID': art,
            'genreIDs': gens
        }
print(f"Loaded {len(test_tracks)} test pairs.")

# 3. Load training data
print("Reading training data...")
train_data         = {}  # user -> {item -> score}; a repeated (user, item) keeps the last score read
user_ratings       = {}  # user -> list of every score from that user
artist_scores      = {}  # item starting with 'artist_' -> list of scores
genre_scores       = {}  # item starting with 'genre_'  -> list of scores
user_genre_rats    = {}  # user -> list of scores given to 'genre_' items
all_scores         = []  # every score in the file, in read order
user_artist_counts = {}  # user -> {'artist_' item -> number of rows}
user_genre_counts  = {}  # user -> {'genre_' item -> number of rows}


def _bump(counter, user, item):
    """Add one to the nested tally counter[user][item], creating entries as needed."""
    per_user = counter.setdefault(user, {})
    per_user[item] = per_user.get(item, 0) + 1


def _mean_per_key(score_lists):
    """Return {key: np.mean(scores)} for a {key: [scores]} mapping."""
    return {key: np.mean(scores) for key, scores in score_lists.items()}


with open(file_name_train, 'r') as train_file:
    for record in train_file:
        # Each row is "user|item|score" with an integer score.
        user, item, raw_score = record.strip().split('|')
        score = int(raw_score)

        train_data.setdefault(user, {})[item] = score
        all_scores.append(score)
        user_ratings.setdefault(user, []).append(score)

        # The two prefixes are mutually exclusive, so at most one branch runs.
        if item.startswith('artist_'):
            artist_scores.setdefault(item, []).append(score)
            _bump(user_artist_counts, user, item)
        elif item.startswith('genre_'):
            genre_scores.setdefault(item, []).append(score)
            user_genre_rats.setdefault(user, []).append(score)
            _bump(user_genre_counts, user, item)

# 4. Precompute stats
global_avg = np.mean(all_scores) if all_scores else 0.0

user_avg, user_var, user_cnt = {}, {}, {}
for user, scores in user_ratings.items():
    user_avg[user] = np.mean(scores)
    user_var[user] = np.std(scores)           # standard deviation, despite the name
    user_cnt[user] = np.log1p(len(scores))    # log(1 + number of ratings)

artist_avg     = _mean_per_key(artist_scores)
genre_avg      = _mean_per_key(genre_scores)
user_genre_aff = _mean_per_key(user_genre_rats)
# 5. Feature engineering helper
def compute_features(u, td):
    """Build the 17-value feature row for one (user, test track) pair.

    Args:
        u:  user id, used as the key into the per-user training tables.
        td: dict with keys 'trackID', 'albumID', 'artistID' (either of the
            last two may be None) and 'genreIDs' (a list, possibly empty).

    Returns:
        A list in this fixed order:
        [artist score, genre score, artist-missing flag, genre-missing flag,
         raw artist score, user score std, user log-count, number of genres,
         artist average, coverage-weighted genre score, user genre affinity,
         album score, track score, has-album flag, has-artist flag,
         artist row count, genre row count].

    Reads the module-level tables train_data, user_avg, user_var, user_cnt,
    artist_avg, user_genre_aff, user_artist_counts, user_genre_counts and
    global_avg.
    """
    # NOTE(review): album/artist/genre scores and genre counts are looked up
    # by the bare id, while artist_avg and artist counts use an "artist_"
    # prefixed key -- confirm which form the training file actually uses.
    ratings  = train_data.get(u, {})
    baseline = user_avg.get(u, global_avg)

    track_id  = td['trackID']
    album_id  = td['albumID']
    artist_id = td['artistID']
    genre_ids = td['genreIDs']

    def capped(score):
        # Positive scores are clipped at 100; anything else uses the baseline.
        return min(score, 100) if score > 0 else baseline

    # Track: bare id first, then the "track_"-prefixed form, else 0.
    if track_id in ratings:
        raw_track = ratings[track_id]
    else:
        raw_track = ratings.get(f"track_{track_id}", 0)
    raw_album  = ratings.get(album_id, 0) if album_id else 0
    raw_artist = ratings.get(artist_id, 0) if artist_id else 0

    # Genre score is the mean over the genres with a positive user score.
    genre_hits     = [ratings.get(g, 0) for g in genre_ids]
    positive_genre = [s for s in genre_hits if s > 0]
    raw_genre      = np.mean(positive_genre) if positive_genre else 0

    n_genres = len(genre_ids)
    coverage = len(positive_genre) / n_genres if n_genres else 0
    genre_weighted = raw_genre * coverage

    if artist_id:
        artist_key   = f"artist_{artist_id}"
        artist_prior = artist_avg.get(artist_key, global_avg)
        artist_count = user_artist_counts.get(u, {}).get(artist_key, 0)
    else:
        artist_prior = global_avg
        artist_count = 0

    per_genre_counts = user_genre_counts.get(u, {})
    genre_count = sum(per_genre_counts.get(g, 0) for g in genre_ids)

    artist_missing = int(raw_artist == 0)
    genre_missing  = int(not positive_genre)

    return [
        capped(raw_artist),
        capped(raw_genre),
        artist_missing,
        genre_missing,
        raw_artist * (1 - artist_missing),
        user_var.get(u, 0.0),
        user_cnt.get(u, 0.0),
        n_genres,
        artist_prior,
        genre_weighted,
        user_genre_aff.get(u, global_avg),
        capped(raw_album),
        capped(raw_track),
        int(bool(album_id)),
        int(bool(artist_id)),
        artist_count,
        genre_count,
    ]

# 6. Build train matrix
print("Building training matrix...")


def _labeled_row(pair_key):
    """Return the feature row for one labelled "<user>_<track>" key."""
    user_id, track_id = pair_key.split('_')
    # Pairs absent from the test hierarchy get an all-unknown track record.
    unknown = {'trackID': track_id, 'albumID': None, 'artistID': None, 'genreIDs': []}
    return compute_features(user_id, test_tracks.get(pair_key, unknown))


# Rows and labels both follow ground_truth's iteration order.
X = np.array([_labeled_row(pair_key) for pair_key in ground_truth])
y = np.array(list(ground_truth.values()))

# 7. Split + scale
# Stratified 85/15 split; the scaler is fit on the training part only and
# then applied to both parts.
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)
scaler = RobustScaler()
scaler.fit(X_tr)
X_tr_s  = scaler.transform(X_tr)
X_val_s = scaler.transform(X_val)

# 8. Hyperparameter tuning
print("Starting hyperparameter tuning...")

# Constructor settings shared by the search estimator and the refit model.
xgb_fixed = dict(random_state=42, eval_metric='logloss', early_stopping_rounds=10)
# Fit-time arguments: early stopping monitors the held-out validation split.
val_fit_kwargs = dict(eval_set=[(X_val_s, y_val)], verbose=False)

param_dist = {
    'max_depth':        [3, 5, 7],
    'learning_rate':    [0.01, 0.1, 0.3],
    'n_estimators':     [100, 200],
    'subsample':        [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}
cv    = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model = XGBClassifier(**xgb_fixed)
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=20,
    scoring='accuracy',
    cv=cv,
    verbose=1,
    n_jobs=1,
    random_state=42,
)
search.fit(X_tr_s, y_tr, **val_fit_kwargs)
best_params = search.best_params_
print("Best hyperparameters:", best_params)

# 9. Refit on training split and evaluate on validation
best_model = XGBClassifier(**best_params, **xgb_fixed)
best_model.fit(X_tr_s, y_tr, **val_fit_kwargs)

# Evaluate on held-out validation set
# NOTE(review): this split is also the early-stopping eval_set in the two
# fits above, so these metrics are not measured on fully unseen data.
y_pred_val   = best_model.predict(X_val_s)
val_accuracy = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy :", val_accuracy)
print("Validation Precision:", precision_score(y_val, y_pred_val))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred_val))

# 10. Evaluate on full ground truth
# The scaler and model are refit on every labelled row and then scored on
# those same rows, so the number printed below is an in-sample accuracy.
# The refit scaler/model pair is what later code sees under these names.
print("Evaluating on full ground truth...")
X_all_s = scaler.fit_transform(X)
best_model.fit(X_all_s, y, eval_set=[(X_all_s, y)], verbose=False)
y_pred_all = best_model.predict(X_all_s)
ground_truth_accuracy = accuracy_score(y, y_pred_all)
print("Ground Truth Accuracy:", ground_truth_accuracy)

# 11. Generate submission
print("Generating submission...")


def _write_user_predictions(fout, user_id, pair_keys, top_k=3):
    """Score one user's candidate pairs and write one CSV row per pair.

    The `top_k` pairs with the highest predicted positive-class probability
    are written with predictor 1, all others with 0. Nothing is written when
    `pair_keys` is empty.

    Args:
        fout:      open, writable text file receiving "<key>,<0|1>" lines.
        user_id:   user whose features are computed for every pair.
        pair_keys: "<user>_<track>" keys of that user, in output order.
        top_k:     how many pairs to mark as 1 (default 3).
    """
    if not pair_keys:
        return
    feats = np.array([compute_features(user_id, test_tracks[k]) for k in pair_keys])
    probs = best_model.predict_proba(scaler.transform(feats))[:, 1]
    # Negate so argsort's ascending order yields the highest probabilities first.
    chosen = set(np.argsort(-probs)[:top_k])
    fout.writelines(
        f"{k},{1 if idx in chosen else 0}\n" for idx, k in enumerate(pair_keys)
    )


# Both files are opened in one `with` so the test file is closed as well.
with open(output_file, 'w') as fout, open(file_name_test, 'r') as fin:
    fout.write("TrackID,Predictor\n")
    last_u = None
    block  = []
    for line in fin:
        line = line.strip()
        if not line:
            # A blank line carries no pair; skip it rather than fail on parts[1].
            continue
        parts = line.split('|')
        u, t = parts[0], parts[1]
        # Rows are grouped by consecutive user id: a change of user closes
        # the current group and flushes it to the output.
        if last_u is not None and u != last_u:
            _write_user_predictions(fout, last_u, block)
            block = []
        last_u = u
        block.append(f"{u}_{t}")
    # Flush the final user's group (no-op if the file had no rows).
    _write_user_predictions(fout, last_u, block)

print(f"Submission '{output_file}' written.")

Loading ground truth...
Loaded 6000 labels (3000 positives).
Loading test hierarchy...
Loaded 120000 test pairs.
Reading training data...
Building training matrix...
Starting hyperparameter tuning...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best hyperparameters: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.3, 'colsample_bytree': 1.0}
Validation Accuracy : 0.8488888888888889
Validation Precision: 0.8944723618090452
Confusion Matrix:
 [[408  42]
 [ 94 356]]
Evaluating on full ground truth...
Ground Truth Accuracy: 0.8985
Generating submission...
Submission 'submission_xgb_v10.csv' written.
