# Import necessary libraries


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import json
import time
from sentence_transformers import SentenceTransformer
from codecarbon import EmissionsTracker
import os
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


# Read the data

In [3]:
# Column layouts of the headerless behaviors / news TSV files
col_behaviors = ['ImpressionId', 'User', 'Time', 'History', 'Impressions']
col_news = ['NewsId', 'Category', 'SubCat', 'Title', 'Abstract', 'url', 'TitleEnt', 'AbstractEnt']


def _read_tsv(path, columns):
    """Load a headerless, tab-separated file and label its columns."""
    return pd.read_csv(path, sep="\t", header=None, names=columns)


def _to_epoch_seconds(time_text):
    """Parse an 'MM/DD/YYYY hh:mm:ss AM/PM' string into epoch seconds via time.mktime."""
    parsed = time.strptime(time_text, "%m/%d/%Y %I:%M:%S %p")
    return time.mktime(parsed)


# Load the train / validation / test splits
behaviors_train = _read_tsv("data/train/behaviors.tsv", col_behaviors)
news_train = _read_tsv("data/train/news.tsv", col_news)

behaviors_val = _read_tsv("data/validation/behaviors.tsv", col_behaviors)
news_val = _read_tsv("data/validation/news.tsv", col_news)

behaviors_test = _read_tsv("data/test/behaviors.tsv", col_behaviors)
news_test = _read_tsv("data/test/news.tsv", col_news)

# Stack train and validation rows (row-wise concatenation, train first).
# This happens before behaviors_val gains its Timestamp column and is re-sorted.
behaviors_train_val = pd.concat([behaviors_train, behaviors_val])
news_train_val = pd.concat([news_train, news_val])

# Add a numeric Timestamp to the combined behaviors and order them chronologically
behaviors_train_val['Timestamp'] = behaviors_train_val['Time'].apply(_to_epoch_seconds)
behaviors_train_val = behaviors_train_val.sort_values(by='Timestamp')

# Same treatment for the validation behaviors on their own
behaviors_val['Timestamp'] = behaviors_val['Time'].apply(_to_epoch_seconds)
behaviors_val = behaviors_val.sort_values(by='Timestamp')


# Step 1: Setup Carbon Emissions Tracking

In [4]:
# Configure the emissions tracker; its output directory is "emissions"
tracker = EmissionsTracker(
    project_name="news_recommendation_ctr_baseline",
    output_dir="emissions",
    log_level="critical",
)
# Measurement runs from here until tracker.stop() is called
tracker.start()

# Step 2: Feature Combination



In [5]:
def combine_news_text(row):
    """Join a news row's title, abstract, category and sub-category into one space-separated string."""
    text_fields = ('Title', 'Abstract', 'Category', 'SubCat')
    # format(value, "") is what an f-string placeholder applies to each value
    return " ".join(format(row[name], "") for name in text_fields)

# Blank out missing values first so they contribute empty strings, then build one text per article row
news_val['combined_text'] = (
    news_val
    .fillna("")
    .apply(combine_news_text, axis=1)
)


# Step 3: Load Sentence Transformer model

In [6]:
# Load the 'all-MiniLM-L6-v2' sentence-transformer model by name; it is used to embed the article texts
model = SentenceTransformer('all-MiniLM-L6-v2')

# Step 4: Create Embeddings for every article

In [7]:
# Encode every article's combined text in a single call
combined_texts = news_val['combined_text'].tolist()
news_embeddings = model.encode(
    combined_texts,
    normalize_embeddings=True,  # Important for cosine similarity
    show_progress_bar=True,
)

Batches: 100%|██████████| 1326/1326 [01:36<00:00, 13.73it/s]


# Step 5: Create a similarity matrix

In [8]:
# Pairwise cosine similarity between all article embeddings (with Y omitted, X is compared against itself)
similarity_matrix = cosine_similarity(news_embeddings)

# Step 6: Ranking Function

In [9]:
# Single-entry caches for the lookup tables used by rank_news_for_user.
# Each holds [source object, dict derived from it]. Keeping a reference to the source lets the
# helpers detect, by identity, that a different object was supplied and rebuild the table.
_NEWS_INDEX_CACHE = [None, None]
_USER_HISTORY_CACHE = [None, None]


def _cached_news_index(news_ids):
    """Return the news-ID -> position dict for ``news_ids``, rebuilding it only for a new object."""
    if _NEWS_INDEX_CACHE[1] is None or _NEWS_INDEX_CACHE[0] is not news_ids:
        _NEWS_INDEX_CACHE[1] = {nid: idx for idx, nid in enumerate(news_ids)}
        _NEWS_INDEX_CACHE[0] = news_ids
    return _NEWS_INDEX_CACHE[1]


def _cached_user_history(behaviors):
    """Return the User -> History dict for ``behaviors``, rebuilding it only for a new DataFrame."""
    if _USER_HISTORY_CACHE[1] is None or _USER_HISTORY_CACHE[0] is not behaviors:
        _USER_HISTORY_CACHE[1] = behaviors.set_index("User")["History"].to_dict()
        _USER_HISTORY_CACHE[0] = behaviors
    return _USER_HISTORY_CACHE[1]


def rank_news_for_user(user_id, impression_news, news_ids, similarity_matrix):
    """
    Ranks news articles in an impression based on similarity to the user's clicked news.

    The user's history is read from the module-level ``behaviors_val`` DataFrame
    (columns "User" and "History"; History is a whitespace-separated string of news IDs).

    Args:
        user_id: Value looked up in the "User" column of ``behaviors_val``.
        impression_news: List of news IDs shown in the impression.
        news_ids: Sequence of news IDs; position i must correspond to row/column i
            of ``similarity_matrix``.
        similarity_matrix: 2-D array indexed as [article, article].

    Returns:
        The impression's news IDs ordered by descending mean similarity to the
        articles in the user's history. Articles missing from ``news_ids`` score 0.
        ``impression_news`` itself is returned unchanged when the user is unknown,
        the history is missing/blank, or none of the history IDs are in ``news_ids``.

    Note:
        The two lookup tables are cached between calls and rebuilt only when a
        different ``news_ids`` object or a different ``behaviors_val`` object is
        seen. Previously both were rebuilt on every call, which made ranking a
        full validation set impractically slow. Mutating either object in place
        is not detected.
    """
    news_id_to_idx = _cached_news_index(news_ids)
    user_history_map = _cached_user_history(behaviors_val)

    # Unknown users yield None here; NaN/None/non-string histories are all "no history"
    history = user_history_map.get(user_id)
    if not isinstance(history, str) or not history.strip():
        return impression_news  # No valid history

    # Positions (in the similarity matrix) of the articles the user clicked before
    clicked_indices = [news_id_to_idx[nid] for nid in history.split() if nid in news_id_to_idx]
    if not clicked_indices:
        return impression_news  # No valid clicks with embeddings

    # Score every impression article by its mean similarity to the clicked articles
    scores = []
    for news_id in impression_news:
        news_idx = news_id_to_idx.get(news_id)
        if news_idx is None:
            # Article without an embedding: neutral score
            scores.append((news_id, 0))
        else:
            scores.append((news_id, np.mean(similarity_matrix[news_idx, clicked_indices])))

    # Stable sort: ties keep their original impression order
    scores.sort(key=lambda x: x[1], reverse=True)

    return [news_id for news_id, _ in scores]


def rank_submission_format(user_id, impression_news, news_ids, similarity_matrix):
    """
    Converts the ranked news articles into the required submission format.

    Args:
        user_id, impression_news, news_ids, similarity_matrix: Passed through
            unchanged to ``rank_news_for_user``.

    Returns:
        A list parallel to ``impression_news`` where element i is the 1-based
        rank that ``rank_news_for_user`` assigned to ``impression_news[i]``.
        If a news ID occurs more than once in the impression, every occurrence
        gets the rank of its first appearance in the ranked list.
    """
    ranked_news = rank_news_for_user(user_id, impression_news, news_ids, similarity_matrix)

    # One pass to record each ID's rank, instead of a linear list.index() scan per article
    rank_of = {}
    for position, news_id in enumerate(ranked_news, start=1):
        rank_of.setdefault(news_id, position)

    return [rank_of[news_id] for news_id in impression_news]


# Step 7: Create Submission File

In [10]:
def generate_prediction_file(similarity_matrix, output_file="prediction.txt"):
    """
    Generates a prediction.txt file with ranked news for each impression.

    Reads the module-level ``behaviors_val`` (columns "ImpressionId", "User",
    "Impressions") and ``news_val`` (column "NewsId"). One line is written per
    behaviors row, in row order: the impression ID, a space, and a JSON list
    holding the rank assigned to each article of the impression.

    Args:
        similarity_matrix: Article-by-article similarity matrix whose row order
            matches ``news_val["NewsId"]``.
        output_file: Path of the text file to create (overwritten if present).
    """
    # Built once and passed as the same object for every impression
    news_ids = news_val["NewsId"].tolist()

    # Iterate the rows directly; no intermediate copy or per-impression dict lookups needed
    impression_rows = zip(
        behaviors_val["ImpressionId"],
        behaviors_val["User"],
        behaviors_val["Impressions"],
    )

    with open(output_file, "w", encoding="utf-8") as f:
        for impression_id, user_id, impressions in impression_rows:
            # Impression entries may carry a "-<suffix>" after the news ID; keep only the ID
            cleaned_news_list = [nid.split("-")[0] for nid in impressions.split()]

            ranked_positions = rank_submission_format(user_id, cleaned_news_list, news_ids, similarity_matrix)

            f.write(f"{impression_id} {json.dumps(ranked_positions)}\n")

    print(f"✅ Prediction file '{output_file}' successfully created.")


# Step 8: Execute the code

In [11]:
# Rank every impression in behaviors_val and write the ranks to the named prediction file
generate_prediction_file(similarity_matrix, output_file="prediction_val_sbert_optimized_1000.txt")

KeyboardInterrupt: 

# Step 9: Output carbon emission report

In [None]:
# Stop tracking and get the emissions data
emissions = tracker.stop()
# NOTE(review): stop() is presumed to be able to return None when no measurement is available —
# confirm against the installed CodeCarbon version. Format once, defensively, so the summary
# line and the report cannot fail on a missing value.
emissions_text = f"{emissions:.6f} kg CO2eq" if emissions is not None else "Not available"
print(f"💡 Carbon emissions from this run: {emissions_text}")

# Display detailed emissions information and write to txt
try:
    # Load latest emissions entry (last row of the tracker's CSV log)
    df = pd.read_csv("emissions/emissions.csv")
    emissions_data = df.iloc[-1]

    # Column names of the CSV, kept for interactive inspection
    available_columns = df.columns.tolist()

    # Prepare values
    duration_hr = emissions_data['duration'] / 3600  # the .2f "hours" output below treats 'duration' as seconds
    energy_kwh = emissions_data['energy_consumed']
    cpu_power = emissions_data['cpu_power']

    # Optional columns: fall back to a placeholder when absent or NaN
    gpu_power = (
        f"{emissions_data['gpu_power']:.2f} W"
        if 'gpu_power' in emissions_data and not pd.isna(emissions_data['gpu_power'])
        else "Not available"
    )

    country = emissions_data['country_name'] if 'country_name' in emissions_data else "Not available"

    # Computed for reference; not included in the printed summary or the report below
    carbon_intensity = (
        f"{emissions_data['country_co2_eq_electricity']:.2f} gCO2eq/kWh"
        if 'country_co2_eq_electricity' in emissions_data and not pd.isna(emissions_data['country_co2_eq_electricity'])
        else "Not available"
    )

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Print to console
    print("\nDetailed emissions data:")
    print(f"- Duration: {duration_hr:.2f} hours")
    print(f"- Energy consumed: {energy_kwh:.4f} kWh")
    print(f"- CPU Power: {cpu_power:.2f} W")
    print(f"- GPU Power: {gpu_power}")
    print(f"- Country: {country}")

    # Create structured report text
    report = f"""\
📄 Emissions Report – {timestamp}
====================================
🌱 Total Emissions:     {emissions_text}

🕒 Duration:            {duration_hr:.2f} hours
⚡ Energy Consumed:     {energy_kwh:.4f} kWh
🧠 CPU Power:           {cpu_power:.2f} W
🎮 GPU Power:           {gpu_power}

🌍 Country:             {country}
====================================
"""

    # Ensure output directory exists
    os.makedirs("emissions", exist_ok=True)

    # Save to .txt file. The report contains emoji, so the encoding must be explicit:
    # the platform default (e.g. cp1252) cannot encode them and the write would fail.
    with open("emissions/emissions_report_content_st.txt", "w", encoding="utf-8") as f:
        f.write(report)

except Exception as e:
    # Best-effort reporting: a missing/short CSV or a write failure must not abort the notebook
    print(f"\n❗ Could not load detailed emissions data: {str(e)}")

# Step 10: Create a truth file

In [None]:
# Write ground-truth click labels, one line per impression
def generate_truth_file(impressions, output_file="truth.txt"):
    """
    Generates a truth.txt file with ground truth click labels.

    ``impressions`` must provide ``.items()`` yielding (impression ID, list of entries).
    Each entry is split on "-" and its second piece is converted to an int label.
    Every output line is "<impression ID> <JSON list of labels>".
    """
    def truth_lines():
        # Produced lazily so lines are written as they are generated
        for impression_id, news_list in impressions.items():
            click_labels = [int(entry.split("-")[1]) for entry in news_list]
            yield f"{impression_id} {json.dumps(click_labels)}\n"

    with open(output_file, "w") as f:
        f.writelines(truth_lines())

    print(f"✅ Truth file '{output_file}' successfully created.")

# Split each validation row's impression string into its entries, keyed by impression ID
val_impressions = behaviors_val.set_index('ImpressionId')['Impressions'].apply(lambda text: text.split())
generate_truth_file(val_impressions, output_file="truth_val_1000.txt")

✅ Truth file 'truth_val_1000.txt' successfully created.
