In [1]:
import pandas as pd
import numpy as np
import os
import sys
import pickle # For saving objects
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# Define paths to the dataset
MINDSMALL_DEV_PATH = '/kaggle/input/mind-small/dataset/MINDsmall_dev'
MINDSMALL_TRAIN_PATH = '/kaggle/input/mind-small/dataset/MINDsmall_train'

# Check if paths exist (optional, but good practice)
if not os.path.exists(MINDSMALL_DEV_PATH):
    print(f"Development path not found: {MINDSMALL_DEV_PATH}")
if not os.path.exists(MINDSMALL_TRAIN_PATH):
    print(f"Training path not found: {MINDSMALL_TRAIN_PATH}")

print("Paths defined:")
print(f"MINDsmall dev: {MINDSMALL_DEV_PATH}")
print(f"MINDsmall train: {MINDSMALL_TRAIN_PATH}")
print("Libraries imported.")

Paths defined:
MINDsmall dev: /kaggle/input/mind-small/dataset/MINDsmall_dev
MINDsmall train: /kaggle/input/mind-small/dataset/MINDsmall_train
Libraries imported.


In [2]:
# These paths should be defined if Cell 1 ran successfully.
# MINDSMALL_TRAIN_PATH = '/content/drive/MyDrive/news_recomender/MINDsmall_train' # Defined in Cell 1
# MINDSMALL_DEV_PATH = '/content/drive/MyDrive/news_recomender/MINDsmall_dev'     # Defined in Cell 1

# Define column names for news.tsv
news_cols = ['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'TitleEntities', 'AbstractEntities']

# Load train news.tsv
train_news_file = os.path.join(MINDSMALL_TRAIN_PATH, 'news.tsv')
try:
    train_news_df = pd.read_csv(
        train_news_file,
        sep='\t',
        header=None,
        names=news_cols
    )
    print(f"Train news data loaded successfully from {train_news_file}")
    print("Train News Data Info:")
    train_news_df.info(verbose=False, memory_usage='deep') # verbose=False for brevity
    print("\nTrain News Data Head (first 3 rows):")
    print(train_news_df.head(3))
except FileNotFoundError:
    print(f"File not found: {train_news_file}. Please ensure MINDSMALL_TRAIN_PATH is correct and the file exists.")
except Exception as e:
    print(f"Error loading {train_news_file}: {e}")

print("-" * 70)

# Define column names for behaviors.tsv
behavior_cols = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']

# Load train behaviors.tsv
train_behaviors_file = os.path.join(MINDSMALL_TRAIN_PATH, 'behaviors.tsv')
try:
    train_behaviors_df = pd.read_csv(
        train_behaviors_file,
        sep='\t',
        header=None,
        names=behavior_cols
    )
    print(f"\nTrain behaviors data loaded successfully from {train_behaviors_file}.")
    print("Train Behaviors Data Info:")
    train_behaviors_df.info(verbose=False, memory_usage='deep') # verbose=False for brevity
    print("\nTrain Behaviors Data Head (first 3 rows):")
    print(train_behaviors_df.head(3))
except FileNotFoundError:
    print(f"File not found: {train_behaviors_file}. Please ensure MINDSMALL_TRAIN_PATH is correct and the file exists.")
except Exception as e:
    print(f"Error loading {train_behaviors_file}: {e}")

Train news data loaded successfully from /kaggle/input/mind-small/dataset/MINDsmall_train/news.tsv
Train News Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51282 entries, 0 to 51281
Columns: 8 entries, NewsID to AbstractEntities
dtypes: object(8)
memory usage: 61.3 MB

Train News Data Head (first 3 rows):
   NewsID   Category      SubCategory  \
0  N55528  lifestyle  lifestyleroyals   
1  N19639     health       weightloss   
2  N61837       news        newsworld   

                                               Title  \
0  The Brands Queen Elizabeth, Prince Charles, an...   
1                      50 Worst Habits For Belly Fat   
2  The Cost of Trump's Aid Freeze in the Trenches...   

                                            Abstract  \
0  Shop the notebooks, jackets, and more that the...   
1  These seemingly harmless habits are holding yo...   
2  Lt. Ivan Molchanets peeked over a parapet of s...   

                                             URL  \
0  https://

In [3]:
# Cell 3: Load Development (Validation) Data

# Ensure pandas and os are imported. If this cell is run in a fresh session without running prior cells:
# import pandas as pd
# import os

# Paths MINDSMALL_DEV_PATH should be defined from Cell 1.
# If not defined because Cell 1 wasn't run in this session, you would need to define it:
# MINDSMALL_DEV_PATH = '/content/drive/MyDrive/news_recomender/MINDsmall_dev'

# Column names (same as training data)
news_cols = ['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'TitleEntities', 'AbstractEntities']
behavior_cols = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']

# Load dev news.tsv
dev_news_file = os.path.join(MINDSMALL_DEV_PATH, 'news.tsv')
try:
    dev_news_df = pd.read_csv(
        dev_news_file,
        sep='\t',
        header=None,
        names=news_cols
    )
    print(f"Dev news data loaded successfully from {dev_news_file}")
    print("Dev News Data Info:")
    dev_news_df.info(verbose=False, memory_usage='deep')
    print("\nDev News Data Head (first 3 rows):")
    print(dev_news_df.head(3))
except FileNotFoundError:
    print(f"File not found: {dev_news_file}. Please ensure MINDSMALL_DEV_PATH is correct and the file exists.")
except Exception as e:
    print(f"Error loading {dev_news_file}: {e}")

print("-" * 70)

# Load dev behaviors.tsv
dev_behaviors_file = os.path.join(MINDSMALL_DEV_PATH, 'behaviors.tsv')
try:
    dev_behaviors_df = pd.read_csv(
        dev_behaviors_file,
        sep='\t',
        header=None,
        names=behavior_cols
    )
    print(f"\nDev behaviors data loaded successfully from {dev_behaviors_file}.")
    print("Dev Behaviors Data Info:")
    dev_behaviors_df.info(verbose=False, memory_usage='deep')
    print("\nDev Behaviors Data Head (first 3 rows):")
    print(dev_behaviors_df.head(3))
except FileNotFoundError:
    print(f"File not found: {dev_behaviors_file}. Please ensure MINDSMALL_DEV_PATH is correct and the file exists.")
except Exception as e:
    print(f"Error loading {dev_behaviors_file}: {e}")

Dev news data loaded successfully from /kaggle/input/mind-small/dataset/MINDsmall_dev/news.tsv
Dev News Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42416 entries, 0 to 42415
Columns: 8 entries, NewsID to AbstractEntities
dtypes: object(8)
memory usage: 50.2 MB

Dev News Data Head (first 3 rows):
   NewsID   Category      SubCategory  \
0  N55528  lifestyle  lifestyleroyals   
1  N18955     health          medical   
2  N61837       news        newsworld   

                                               Title  \
0  The Brands Queen Elizabeth, Prince Charles, an...   
1  Dispose of unwanted prescription drugs during ...   
2  The Cost of Trump's Aid Freeze in the Trenches...   

                                            Abstract  \
0  Shop the notebooks, jackets, and more that the...   
1                                                NaN   
2  Lt. Ivan Molchanets peeked over a parapet of s...   

                                             URL  \
0  https://assets.m

In [4]:
# Cell 4: Preprocess News Data

# Ensure pandas is available (should be from previous cells)
# import pandas as pd

def preprocess_news_dataframe(df):
    """
    Preprocesses a news dataframe:
    - Fills NaNs in key text columns with empty strings.
    - Creates a 'CombinedText' column from 'Title' and 'Abstract'.
    """
    # Fill NaN values
    text_cols_to_fill = ['Title', 'Abstract', 'Category', 'SubCategory']
    for col in text_cols_to_fill:
        if col in df.columns:
            df[col] = df[col].fillna('')
        else:
            print(f"Warning: Column '{col}' not found in news dataframe during NaN filling.")

    # Combine Title and Abstract
    if 'Title' in df.columns and 'Abstract' in df.columns:
        df['CombinedText'] = df['Title'] + " " + df['Abstract']
    else:
        print("Warning: 'Title' or 'Abstract' column not found. Cannot create 'CombinedText'.")
        # Initialize 'CombinedText' with empty strings if creation fails, to avoid downstream errors
        df['CombinedText'] = ''
        if 'Title' in df.columns and 'Abstract' not in df.columns:
            print("Using only 'Title' for 'CombinedText'.")
            df['CombinedText'] = df['Title']
        elif 'Abstract' in df.columns and 'Title' not in df.columns:
            print("Using only 'Abstract' for 'CombinedText'.")
            df['CombinedText'] = df['Abstract']

    return df

print("Preprocessing train_news_df...")
train_news_df = preprocess_news_dataframe(train_news_df.copy()) # Use .copy() to avoid SettingWithCopyWarning
print("Train News Data After Preprocessing:")
train_news_df.info(verbose=False, memory_usage='deep')
print("\nTrain News Data Head (first 3 rows with CombinedText):")
print(train_news_df[['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'CombinedText']].head(3))

print("-" * 70)

print("\nPreprocessing dev_news_df...")
dev_news_df = preprocess_news_dataframe(dev_news_df.copy()) # Use .copy() to avoid SettingWithCopyWarning
print("Dev News Data After Preprocessing:")
dev_news_df.info(verbose=False, memory_usage='deep')
print("\nDev News Data Head (first 3 rows with CombinedText):")
print(dev_news_df[['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'CombinedText']].head(3))

Preprocessing train_news_df...
Train News Data After Preprocessing:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51282 entries, 0 to 51281
Columns: 9 entries, NewsID to CombinedText
dtypes: object(9)
memory usage: 77.7 MB

Train News Data Head (first 3 rows with CombinedText):
   NewsID   Category      SubCategory  \
0  N55528  lifestyle  lifestyleroyals   
1  N19639     health       weightloss   
2  N61837       news        newsworld   

                                               Title  \
0  The Brands Queen Elizabeth, Prince Charles, an...   
1                      50 Worst Habits For Belly Fat   
2  The Cost of Trump's Aid Freeze in the Trenches...   

                                            Abstract  \
0  Shop the notebooks, jackets, and more that the...   
1  These seemingly harmless habits are holding yo...   
2  Lt. Ivan Molchanets peeked over a parapet of s...   

                                        CombinedText  
0  The Brands Queen Elizabeth, Prince Charles, 

In [5]:
# Cell 5: Preprocess Behaviors Data - Parse Impressions and History

# Ensure pandas is available
# import pandas as pd
# from tqdm import tqdm
# tqdm.pandas() # For progress_apply

def parse_and_explode_behaviors(df, name="train"):
    """
    Parses 'Impressions' and 'History' columns in a behaviors dataframe.
    - 'Impressions' are split into individual (NewsID, ClickLabel) pairs,
      exploding the dataframe.
    - 'History' is converted from a space-separated string to a list of NewsIDs.
    """
    # Handle NaN in History: replace with empty string for consistent splitting
    df['History'] = df['History'].fillna('')
    # Split History string into a list of NewsIDs
    df['History'] = df['History'].str.split()

    # Parse impressions
    # Each impression string is like "N123-1 N456-0..."
    # We want to transform this into new rows: (UserID, Time, History, ImpressionNewsID, ClickLabel)

    impression_records = []
    print(f"Parsing impressions for {name}_behaviors_df...")
    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc=f"Processing {name} behaviors"):
        user_id = row['UserID']
        time = row['Time']
        history = row['History'] # Already a list
        impression_id_main = row['ImpressionID'] # Keep original impression ID for reference if needed

        if pd.isna(row['Impressions']): # Handle cases where 'Impressions' might be NaN
            continue

        impressions_list = row['Impressions'].split()
        for impression_item in impressions_list:
            parts = impression_item.split('-')
            if len(parts) == 2:
                news_id = parts[0]
                label = int(parts[1])
                impression_records.append({
                    'UserID': user_id,
                    'Time': time,
                    'History': history,
                    'ImpressionNewsID': news_id,
                    'ClickLabel': label,
                    'OriginalImpressionID': impression_id_main # Optional: for tracing back
                })

    parsed_df = pd.DataFrame(impression_records)
    return parsed_df

# Process train_behaviors_df
# Note: This operation can be memory-intensive and time-consuming for large datasets.
# The original train_behaviors_df has 156,965 rows. After exploding, it will be much larger.
print("Processing train_behaviors_df...")
train_behaviors_parsed_df = parse_and_explode_behaviors(train_behaviors_df.copy(), name="train")
print("\nTrain Behaviors Data After Parsing and Exploding:")
train_behaviors_parsed_df.info(memory_usage='deep')
print("\nTrain Behaviors Data Head (first 5 rows):")
print(train_behaviors_parsed_df.head())

print("-" * 70)

# Process dev_behaviors_df
# The original dev_behaviors_df has 73,152 rows.
print("\nProcessing dev_behaviors_df...")
dev_behaviors_parsed_df = parse_and_explode_behaviors(dev_behaviors_df.copy(), name="dev")
print("\nDev Behaviors Data After Parsing and Exploding:")
dev_behaviors_parsed_df.info(memory_usage='deep')
print("\nDev Behaviors Data Head (first 5 rows):")
print(dev_behaviors_parsed_df.head())

Processing train_behaviors_df...
Parsing impressions for train_behaviors_df...


Processing train behaviors: 100%|██████████| 156965/156965 [00:16<00:00, 9309.55it/s]



Train Behaviors Data After Parsing and Exploding:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5843444 entries, 0 to 5843443
Data columns (total 6 columns):
 #   Column                Dtype 
---  ------                ----- 
 0   UserID                object
 1   Time                  object
 2   History               object
 3   ImpressionNewsID      object
 4   ClickLabel            int64 
 5   OriginalImpressionID  int64 
dtypes: int64(2), object(4)
memory usage: 3.3 GB

Train Behaviors Data Head (first 5 rows):
   UserID                   Time  \
0  U13740  11/11/2019 9:05:58 AM   
1  U13740  11/11/2019 9:05:58 AM   
2  U91836  11/12/2019 6:11:30 PM   
3  U91836  11/12/2019 6:11:30 PM   
4  U91836  11/12/2019 6:11:30 PM   

                                             History ImpressionNewsID  \
0  [N55189, N42782, N34694, N45794, N18445, N6330...           N55689   
1  [N55189, N42782, N34694, N45794, N18445, N6330...           N35729   
2  [N31739, N6072, N63045, N23979, N3

Processing dev behaviors: 100%|██████████| 73152/73152 [00:07<00:00, 9723.40it/s]



Dev Behaviors Data After Parsing and Exploding:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2740998 entries, 0 to 2740997
Data columns (total 6 columns):
 #   Column                Dtype 
---  ------                ----- 
 0   UserID                object
 1   Time                  object
 2   History               object
 3   ImpressionNewsID      object
 4   ClickLabel            int64 
 5   OriginalImpressionID  int64 
dtypes: int64(2), object(4)
memory usage: 1.6 GB

Dev Behaviors Data Head (first 5 rows):
   UserID                    Time  \
0  U80234  11/15/2019 12:37:50 PM   
1  U80234  11/15/2019 12:37:50 PM   
2  U80234  11/15/2019 12:37:50 PM   
3  U80234  11/15/2019 12:37:50 PM   
4  U80234  11/15/2019 12:37:50 PM   

                                             History ImpressionNewsID  \
0  [N55189, N46039, N51741, N53234, N11276, N264,...           N28682   
1  [N55189, N46039, N51741, N53234, N11276, N264,...           N48740   
2  [N55189, N46039, N51741, N53234,

In [6]:
# Cell 6: Merge News Content with Parsed Behaviors Data

# Ensure pandas is available
# import pandas as pd

# Merge training data
print("Merging training behaviors with news data...")
# We need to merge on the news ID presented in the impression
# train_behaviors_parsed_df has 'ImpressionNewsID'
# train_news_df has 'NewsID' and its content like 'Category', 'SubCategory', 'CombinedText'
train_merged_df = pd.merge(
    train_behaviors_parsed_df,
    train_news_df,
    left_on='ImpressionNewsID',
    right_on='NewsID',
    how='left' # Use left merge to keep all impressions, even if some news details are missing (though unlikely with MIND)
)

print("\nTrain Merged Data Info:")
# Displaying info for a potentially very large dataframe. Can be verbose.
# Let's check shape and head instead for brevity in output.
print(f"Shape of train_merged_df: {train_merged_df.shape}")
# train_merged_df.info(memory_usage='deep') # This can be slow for very large df

print("\nTrain Merged Data Head (first 5 rows):")
# Display relevant columns
cols_to_show = ['UserID', 'ImpressionNewsID', 'ClickLabel', 'Category', 'SubCategory', 'CombinedText', 'History']
# Check if all expected columns exist after merge before trying to print them
existing_cols_to_show_train = [col for col in cols_to_show if col in train_merged_df.columns]
print(train_merged_df[existing_cols_to_show_train].head())


# Check for any rows where the merge might have failed (i.e., ImpressionNewsID not found in news_df)
# This would result in NaN values in columns from news_df
missing_news_info_train = train_merged_df['NewsID'].isnull().sum()
if missing_news_info_train > 0:
    print(f"\nWarning: {missing_news_info_train} impressions in training data did not have matching news details.")
else:
    print("\nAll impressions in training data successfully merged with news details.")


print("-" * 70)

# Merge development data
print("\nMerging development behaviors with news data...")
dev_merged_df = pd.merge(
    dev_behaviors_parsed_df,
    dev_news_df,
    left_on='ImpressionNewsID',
    right_on='NewsID',
    how='left'
)

print("\nDev Merged Data Info:")
print(f"Shape of dev_merged_df: {dev_merged_df.shape}")
# dev_merged_df.info(memory_usage='deep')

print("\nDev Merged Data Head (first 5 rows):")
existing_cols_to_show_dev = [col for col in cols_to_show if col in dev_merged_df.columns]
print(dev_merged_df[existing_cols_to_show_dev].head())

missing_news_info_dev = dev_merged_df['NewsID'].isnull().sum()
if missing_news_info_dev > 0:
    print(f"\nWarning: {missing_news_info_dev} impressions in development data did not have matching news details.")
else:
    print("\nAll impressions in development data successfully merged with news details.")

# We can drop the redundant NewsID column now, as ImpressionNewsID serves the purpose
if 'NewsID' in train_merged_df.columns:
    train_merged_df = train_merged_df.drop(columns=['NewsID'])
if 'NewsID' in dev_merged_df.columns:
    dev_merged_df = dev_merged_df.drop(columns=['NewsID'])

print("\nDropped redundant 'NewsID' column after merge if it existed.")

Merging training behaviors with news data...

Train Merged Data Info:
Shape of train_merged_df: (5843444, 15)

Train Merged Data Head (first 5 rows):
   UserID ImpressionNewsID  ClickLabel Category   SubCategory  \
0  U13740           N55689           1   sports  football_nfl   
1  U13740           N35729           0     news        newsus   
2  U91836           N20678           0   sports   more_sports   
3  U91836           N39317           0     news  newspolitics   
4  U91836           N58114           0    autos     autosnews   

                                        CombinedText  \
0  Charles Rogers, former Michigan State football...   
1  Porsche launches into second story of New Jers...   
2  Bode Miller delivered his twin boys after midw...   
3  Senior Trump official embellished résumé, had ...   
4  2020 Ford Explorer launch hardly went accordin...   

                                             History  
0  [N55189, N42782, N34694, N45794, N18445, N6330...  
1  [N55189, 

In [7]:
# Cell 7: Feature Engineering - TF-IDF for News Content

# Ensure necessary libraries are available
# from sklearn.feature_extraction.text import TfidfVectorizer
# import pandas as pd
# import numpy as np # Will be needed in the next cell for averaging

# (Variables: train_news_df, dev_news_df should be available from previous cells)

print("Step 1: Preparing all unique news articles for TF-IDF processing.")
# Combine news from train and dev sets to get a comprehensive list of articles
# and their 'CombinedText'.
# We select only 'NewsID' and 'CombinedText' to keep this dataframe lean.
train_news_subset_for_tfidf = train_news_df[['NewsID', 'CombinedText']].copy()
dev_news_subset_for_tfidf = dev_news_df[['NewsID', 'CombinedText']].copy()

all_news_for_tfidf_df = pd.concat(
    [train_news_subset_for_tfidf, dev_news_subset_for_tfidf],
    ignore_index=True
)
# Drop duplicate NewsIDs, keeping the first occurrence.
all_news_for_tfidf_df = all_news_for_tfidf_df.drop_duplicates(subset=['NewsID']).reset_index(drop=True)

# Ensure 'CombinedText' has no NaN values (should be handled by Cell 4, but good to double-check)
all_news_for_tfidf_df['CombinedText'] = all_news_for_tfidf_df['CombinedText'].fillna('')

print(f"Total unique news articles for TF-IDF processing: {len(all_news_for_tfidf_df)}")
print("Sample of unique news articles dataframe:")
print(all_news_for_tfidf_df.head())

print("\nStep 2: Fitting TfidfVectorizer on training news text.")
# Initialize TfidfVectorizer.
# max_features limits the vocabulary size. Adjust as needed.
# ngram_range=(1, 2) includes unigrams and bigrams.
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,  # You can tune this parameter
    stop_words='english',
    ngram_range=(1, 2)
)

# Fit the vectorizer ONLY on the 'CombinedText' from the original train_news_df.
# This ensures that the vocabulary and IDF weights are learned only from training data.
# Make sure train_news_df['CombinedText'] has no NaNs (handled in Cell 4).
print(f"Fitting TF-IDF on {len(train_news_df)} training news articles' CombinedText...")
# If train_news_df has NaNs in 'CombinedText' despite earlier preprocessing, fill them here.
# train_news_df['CombinedText'] = train_news_df['CombinedText'].fillna('') # Should not be necessary if Cell 4 ran correctly.
tfidf_vectorizer.fit(train_news_df['CombinedText'])
print("TfidfVectorizer fitted successfully.")

print("\nStep 3: Transforming CombinedText of all unique news articles.")
# Transform the 'CombinedText' of ALL unique news articles using the FITTED vectorizer.
all_news_tfidf_matrix = tfidf_vectorizer.transform(all_news_for_tfidf_df['CombinedText'])
print(f"Shape of all_news_tfidf_matrix (num_unique_news, num_tfidf_features): {all_news_tfidf_matrix.shape}")

print("\nStep 4: Creating a mapping from NewsID to its TF-IDF vector index.")
# This dictionary will allow us to quickly find the row in all_news_tfidf_matrix for a given NewsID.
newsid_to_tfidf_idx = pd.Series(
    all_news_for_tfidf_df.index,
    index=all_news_for_tfidf_df['NewsID']
).to_dict()
print(f"Created newsid_to_tfidf_idx mapping for {len(newsid_to_tfidf_idx)} news articles.")

# Optional: Save the fitted vectorizer and related objects if you plan to stop and resume.
import pickle
output_path = '/kaggle/working/model_assets/' # Define your save path
os.makedirs(output_path, exist_ok=True)
with open(os.path.join(output_path, 'tfidf_vectorizer.pkl'), 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)
with open(os.path.join(output_path, 'all_news_tfidf_matrix.pkl'), 'wb') as f: # Note: saving sparse matrices this way might be inefficient for very large ones. Consider scipy.sparse.save_npz
    pickle.dump(all_news_tfidf_matrix, f)
with open(os.path.join(output_path, 'newsid_to_tfidf_idx.pkl'), 'wb') as f:
    pickle.dump(newsid_to_tfidf_idx, f)
print(f"TF-IDF related objects are ready and optionally saved to {output_path}.")

# Key outputs of this cell (available in memory for the next cell):
# - tfidf_vectorizer: The fitted TfidfVectorizer.
# - all_news_tfidf_matrix: The sparse matrix of TF-IDF features for all unique news.
# - newsid_to_tfidf_idx: Dictionary mapping NewsID to its row index in all_news_tfidf_matrix.

Step 1: Preparing all unique news articles for TF-IDF processing.
Total unique news articles for TF-IDF processing: 65238
Sample of unique news articles dataframe:
   NewsID                                       CombinedText
0  N55528  The Brands Queen Elizabeth, Prince Charles, an...
1  N19639  50 Worst Habits For Belly Fat These seemingly ...
2  N61837  The Cost of Trump's Aid Freeze in the Trenches...
3  N53526  I Was An NBA Wife. Here's How It Affected My M...
4  N38324  How to Get Rid of Skin Tags, According to a De...

Step 2: Fitting TfidfVectorizer on training news text.
Fitting TF-IDF on 51282 training news articles' CombinedText...
TfidfVectorizer fitted successfully.

Step 3: Transforming CombinedText of all unique news articles.
Shape of all_news_tfidf_matrix (num_unique_news, num_tfidf_features): (65238, 10000)

Step 4: Creating a mapping from NewsID to its TF-IDF vector index.
Created newsid_to_tfidf_idx mapping for 65238 news articles.
TF-IDF related objects are ready an

In [8]:
# Cell 8: Feature Engineering - User History Profile & Interaction Features (Memory Optimized)

# Ensure necessary libraries/objects are available
from sklearn.metrics.pairwise import cosine_similarity # <<<< ADD THIS IMPORT
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
from tqdm import tqdm
import scipy.sparse # For type checking or operations if needed, though main use is via tfidf_matrix
# tqdm.pandas() # For df.progress_apply if used, but we iterate here.

# Objects from previous cell (ensure Cell 7 was run successfully in this session):
# - all_news_tfidf_matrix (sparse matrix of TF-IDF features for all news)
# - newsid_to_tfidf_idx (dict mapping NewsID to its row index in all_news_tfidf_matrix)
# DataFrames (ensure Cells 1-6 were run successfully in this session):
# - train_merged_df, dev_merged_df

# Helper function to get TF-IDF vector for a news_id (as a dense 1D float32 array)
def get_dense_tfidf_vector_float32(news_id, newsid_to_idx_map, tfidf_matrix, num_features):
    if news_id in newsid_to_idx_map:
        idx = newsid_to_idx_map[news_id]
        # Convert sparse row vector to dense 1D np.ndarray with float32 dtype
        return np.asarray(tfidf_matrix[idx].todense(), dtype=np.float32).flatten()
    return np.zeros(num_features, dtype=np.float32) # Return zero vector if not found


print("Step 1 & 2 Combined: Calculating Interaction Features (Cosine Similarity)...")

# Cache for user history profiles: {frozenset(history_list) -> 1D_dense_profile_vector_float32}
history_profile_cache = {}
# Ensure all_news_tfidf_matrix is defined by running Cell 7
num_tfidf_features = all_news_tfidf_matrix.shape[1]

def calculate_interaction_features_for_df(df, newsid_to_idx_map, tfidf_matrix, desc="Processing"):
    cosine_similarities_list = [] # Renamed to avoid conflict if cosine_similarity was a variable

    print(f"Calculating cosine similarities for {desc} data (this might take a while)...")

    for _, row in tqdm(df.iterrows(), total=len(df), desc=desc):
        history_list = row['History'] # List of NewsIDs
        impression_news_id = row['ImpressionNewsID']

        # --- Get/Compute User History Profile Vector ---
        current_history_key = frozenset(history_list if isinstance(history_list, list) else [])

        if current_history_key in history_profile_cache:
            user_profile_vector = history_profile_cache[current_history_key]
        else:
            history_item_vectors = []
            if isinstance(history_list, list) and len(history_list) > 0:
                for news_id_in_history in history_list:
                    vec = get_dense_tfidf_vector_float32(news_id_in_history, newsid_to_idx_map, tfidf_matrix, num_tfidf_features)
                    if np.any(vec):
                        history_item_vectors.append(vec)

            if history_item_vectors:
                user_profile_vector = np.mean(np.array(history_item_vectors, dtype=np.float32), axis=0)
            else:
                user_profile_vector = np.zeros(num_tfidf_features, dtype=np.float32)
            history_profile_cache[current_history_key] = user_profile_vector

        # --- Get Impression News Vector ---
        impression_news_vector = get_dense_tfidf_vector_float32(impression_news_id, newsid_to_idx_map, tfidf_matrix, num_tfidf_features)

        # --- Calculate Cosine Similarity ---
        u_vec_2d = user_profile_vector.reshape(1, -1)
        i_vec_2d = impression_news_vector.reshape(1, -1)

        if np.all(u_vec_2d == 0) or np.all(i_vec_2d == 0):
            similarity_value = 0.0 # Renamed to avoid conflict
        else:
            similarity_value = cosine_similarity(u_vec_2d, i_vec_2d)[0, 0] # Uses the imported function

        cosine_similarities_list.append(similarity_value)

    df['HistoryImpressionCosineSimilarity'] = cosine_similarities_list
    return df

# Process Training Data
train_merged_df = calculate_interaction_features_for_df(
    train_merged_df, newsid_to_tfidf_idx, all_news_tfidf_matrix, desc="Train Interactions"
)
print("Cosine similarity feature added to train_merged_df.")

# Process Development Data
dev_merged_df = calculate_interaction_features_for_df(
    dev_merged_df, newsid_to_tfidf_idx, all_news_tfidf_matrix, desc="Dev Interactions"
)
print("Cosine similarity feature added to dev_merged_df.")

print("\nSample of train_merged_df with new cosine similarity feature:")
if 'HistoryImpressionCosineSimilarity' in train_merged_df.columns:
    print(train_merged_df[['UserID', 'ImpressionNewsID', 'ClickLabel', 'HistoryImpressionCosineSimilarity']].head())

print("\nStep 3: Label Encoding UserID and ImpressionNewsID...")
user_encoder = LabelEncoder()
news_encoder = LabelEncoder()

print("Fitting LabelEncoders on combined training and development data identifiers...")
combined_user_ids = pd.concat([train_merged_df['UserID'].astype(str), dev_merged_df['UserID'].astype(str)]).unique()
combined_news_ids = pd.concat([train_merged_df['ImpressionNewsID'].astype(str), dev_merged_df['ImpressionNewsID'].astype(str)]).unique()

user_encoder.fit(combined_user_ids)
news_encoder.fit(combined_news_ids)

print("Transforming UserID and ImpressionNewsID...")
train_merged_df['UserID_Encoded'] = user_encoder.transform(train_merged_df['UserID'].astype(str))
train_merged_df['ImpressionNewsID_Encoded'] = news_encoder.transform(train_merged_df['ImpressionNewsID'].astype(str))
dev_merged_df['UserID_Encoded'] = user_encoder.transform(dev_merged_df['UserID'].astype(str))
dev_merged_df['ImpressionNewsID_Encoded'] = news_encoder.transform(dev_merged_df['ImpressionNewsID'].astype(str))

print("UserID and ImpressionNewsID encoded.")
print("Sample of train_merged_df with encoded IDs:")
if 'UserID_Encoded' in train_merged_df.columns and 'ImpressionNewsID_Encoded' in train_merged_df.columns:
    print(train_merged_df[['UserID', 'UserID_Encoded', 'ImpressionNewsID', 'ImpressionNewsID_Encoded', 'ClickLabel']].head())

# Optional: Saving encoders (ensure output_path is defined and os is imported if not already)
import os # Make sure os is imported
output_path = '/kaggle/working/model_assets/'
os.makedirs(output_path, exist_ok=True)
import pickle # Make sure pickle is imported
with open(os.path.join(output_path, 'user_encoder.pkl'), 'wb') as f:
    pickle.dump(user_encoder, f)
with open(os.path.join(output_path, 'news_encoder.pkl'), 'wb') as f:
    pickle.dump(news_encoder, f)
print(f"Label encoders optionally saved to {output_path}.")

print("\nFinal check of train_merged_df (relevant columns) after feature engineering:")
cols_to_check = ['UserID', 'ImpressionNewsID', 'ClickLabel', 'HistoryImpressionCosineSimilarity', 'UserID_Encoded', 'ImpressionNewsID_Encoded']
existing_cols_train = [col for col in cols_to_check if col in train_merged_df.columns]
if existing_cols_train:
    train_merged_df[existing_cols_train].info(memory_usage='deep')

print("\nFinal check of dev_merged_df (relevant columns) after feature engineering:")
existing_cols_dev = [col for col in cols_to_check if col in dev_merged_df.columns]
if existing_cols_dev:
    dev_merged_df[existing_cols_dev].info(memory_usage='deep')

Step 1 & 2 Combined: Calculating Interaction Features (Cosine Similarity)...
Calculating cosine similarities for Train Interactions data (this might take a while)...


Train Interactions: 100%|██████████| 5843444/5843444 [42:50<00:00, 2273.49it/s]


Cosine similarity feature added to train_merged_df.
Calculating cosine similarities for Dev Interactions data (this might take a while)...


Dev Interactions: 100%|██████████| 2740998/2740998 [21:02<00:00, 2170.94it/s]


Cosine similarity feature added to dev_merged_df.

Sample of train_merged_df with new cosine similarity feature:
   UserID ImpressionNewsID  ClickLabel  HistoryImpressionCosineSimilarity
0  U13740           N55689           1                           0.033206
1  U13740           N35729           0                           0.010533
2  U91836           N20678           0                           0.016130
3  U91836           N39317           0                           0.028578
4  U91836           N58114           0                           0.022575

Step 3: Label Encoding UserID and ImpressionNewsID...
Fitting LabelEncoders on combined training and development data identifiers...
Transforming UserID and ImpressionNewsID...
UserID and ImpressionNewsID encoded.
Sample of train_merged_df with encoded IDs:
   UserID  UserID_Encoded ImpressionNewsID  ImpressionNewsID_Encoded  \
0  U13740            4158           N55689                     17658   
1  U13740            4158           N357

In [9]:
! pip install tensorflow



In [10]:
# Cell 9: Prepare Data for LightGBM and Train Model

# Ensure necessary libraries are available
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report
# import pandas as pd # Should be available
# import numpy as np # Should be available

# DataFrames from previous cell: train_merged_df, dev_merged_df
# These should have:
# 'UserID_Encoded', 'ImpressionNewsID_Encoded', 'HistoryImpressionCosineSimilarity', 'ClickLabel'

print("Preparing data for LightGBM model...")

# Define features (X) and target (y)
# We can add more features here if we engineered them (e.g., encoded categories, time features, etc.)
# For now, focusing on the core ones.
feature_columns = ['UserID_Encoded', 'ImpressionNewsID_Encoded', 'HistoryImpressionCosineSimilarity']
target_column = 'ClickLabel'

# Training data
X_train = train_merged_df[feature_columns]
y_train = train_merged_df[target_column]

# Development (validation) data
X_dev = dev_merged_df[feature_columns]
y_dev = dev_merged_df[target_column]

print(f"Training data shape: X_train {X_train.shape}, y_train {y_train.shape}")
print(f"Development data shape: X_dev {X_dev.shape}, y_dev {y_dev.shape}")

print("\nTraining LightGBM model...")

# Initialize LGBMClassifier
# These are some basic parameters; extensive hyperparameter tuning can improve performance.
lgbm_model = lgb.LGBMClassifier(
    objective='binary',        # Binary classification
    metric='auc',              # Evaluation metric: Area Under ROC Curve
    n_estimators=1000,         # Number of boosting rounds (trees)
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,              # No limit on tree depth
    random_state=42,           # For reproducibility
    n_jobs=-1,                 # Use all available cores
    colsample_bytree=0.8,      # Subsample ratio of columns when constructing each tree
    subsample=0.8,             # Subsample ratio of the training instance
    reg_alpha=0.1,             # L1 regularization
    reg_lambda=0.1             # L2 regularization
    # class_weight='balanced' # Useful if classes are imbalanced, check y_train.value_counts()
)

# Check class balance for training data
print("\nTraining target class distribution:")
print(y_train.value_counts(normalize=True))
# If highly imbalanced, consider using scale_pos_weight or class_weight='balanced'

# Train the model
# Using X_dev, y_dev as an evaluation set for early stopping
lgbm_model.fit(
    X_train, y_train,
    eval_set=[(X_dev, y_dev)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(100, verbose=True)] # Updated: lgb.early_stopping
                                                      # Early stopping: stops if 'auc' on eval set doesn't improve for 100 rounds.
)

print("\nModel training completed.")

print("\nEvaluating model on the development set...")
# Make predictions (probabilities for the positive class)
y_pred_proba_dev = lgbm_model.predict_proba(X_dev)[:, 1]

# Make predictions (class labels) using a default threshold of 0.5
y_pred_class_dev = lgbm_model.predict(X_dev) # (thresholds 0.5 by default)
# Or, from probabilities: y_pred_class_dev = (y_pred_proba_dev >= 0.5).astype(int)


# Calculate AUC
auc_score = roc_auc_score(y_dev, y_pred_proba_dev)
print(f"Development Set AUC: {auc_score:.4f}")

# Calculate Accuracy
accuracy = accuracy_score(y_dev, y_pred_class_dev)
print(f"Development Set Accuracy: {accuracy:.4f}")

# Confusion Matrix
print("\nDevelopment Set Confusion Matrix:")
print(confusion_matrix(y_dev, y_pred_class_dev))

# Classification Report
print("\nDevelopment Set Classification Report:")
print(classification_report(y_dev, y_pred_class_dev))


# Feature Importance (optional, but insightful)
print("\nFeature Importances:")
feature_importances = pd.DataFrame({
    'feature': feature_columns,
    'importance': lgbm_model.feature_importances_
}).sort_values(by='importance', ascending=False)
print(feature_importances)


# Optional: Save the trained model
import pickle
output_path = '/kaggle/working/model_assets/' # Should be defined
os.makedirs(output_path, exist_ok=True) # Ensure os is imported and path exists
model_filename = os.path.join(output_path, 'lgbm_news_recommender_v1.pkl')
with open(model_filename, 'wb') as f:
    pickle.dump(lgbm_model, f)
print(f"\nTrained LightGBM model saved to {model_filename}")

Preparing data for LightGBM model...
Training data shape: X_train (5843444, 3), y_train (5843444,)
Development data shape: X_dev (2740998, 3), y_dev (2740998,)

Training LightGBM model...

Training target class distribution:
ClickLabel
0    0.959554
1    0.040446
Name: proportion, dtype: float64
[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034061 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[60]	valid_0's auc: 0.564681

Model training 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.96      1.00      0.98   2629615
           1       0.00      0.00      0.00    111383

    accuracy                           0.96   2740998
   macro avg       0.48      0.50      0.49   2740998
weighted avg       0.92      0.96      0.94   2740998


Feature Importances:
                             feature  importance
1           ImpressionNewsID_Encoded        1592
2  HistoryImpressionCosineSimilarity         208
0                     UserID_Encoded           0

Trained LightGBM model saved to /kaggle/working/model_assets/lgbm_news_recommender_v1.pkl


  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
!pip install optuna -q # -q for quiet installation

In [12]:
# Cell 10: Hyperparameter Tuning with Optuna (Conceptual)

# Ensure optuna is installed: !pip install optuna
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
# Ensure X_train, y_train, X_dev, y_dev are available from Cell 9's context

def objective(trial):
    # Define the search space for hyperparameters
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt', # Could also try 'dart' or 'goss'
        'n_estimators': trial.suggest_int('n_estimators', 200, 2000), # Broader range
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 15), # Or -1 for no limit
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0), # Bagging fraction
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0), # Feature fraction
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0), # L1 regularization
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0), # L2 regularization
        'random_state': 42,
        'n_jobs': -1,
        # 'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 10.0) # If classes imbalanced
    }

    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train,
              eval_set=[(X_dev, y_dev)],
              eval_metric='auc',
              callbacks=[lgb.early_stopping(50, verbose=False)]) # Early stopping within trial

    preds_proba = model.predict_proba(X_dev)[:, 1]
    auc = roc_auc_score(y_dev, preds_proba)
    return auc

# Create a study object and specify direction (maximize AUC)
study = optuna.create_study(direction='maximize')
print("Starting Optuna hyperparameter search...")
study.optimize(objective, n_trials=50)  # Number of trials (e.g., 50-100)

print("\nOptuna search finished.")
print(f"Best trial AUC: {study.best_value}")
print("Best hyperparameters found:")
print(study.best_params)

# You would then retrain a final model using these best_params on the full X_train,
# potentially with a refined n_estimators based on early stopping from the best trial,
# and then save this optimized model.

# Example: Retrain with best params
best_lgbm_params = study.best_params
final_lgbm_model = lgb.LGBMClassifier(**best_lgbm_params)
final_lgbm_model.fit(X_train, y_train,
                     eval_set=[(X_dev, y_dev)],
                     eval_metric='auc',
                     callbacks=[lgb.early_stopping(100, verbose=True)])
# ... (save this final_lgbm_model)
output_path = '/kaggle/working/model_assets/' # Should be defined
os.makedirs(output_path, exist_ok=True) # Ensure os is imported and path exists
model_filename = os.path.join(output_path, 'lgbm_news_recommender_best_params.pkl')
with open(model_filename, 'wb') as f:
    pickle.dump(lgbm_model, f)
print(f"\nTrained LightGBM model saved to {model_filename}")

[I 2025-05-31 04:32:10,570] A new study created in memory with name: no-name-c4d2cb88-b78d-4824-bc05-0143d7162303


Starting Optuna hyperparameter search...
[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.190844 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:32:54,093] Trial 0 finished with value: 0.5606485774818724 and parameters: {'n_estimators': 1202, 'learning_rate': 0.013606514446895583, 'num_leaves': 30, 'max_depth': 11, 'min_child_samples': 38, 'subsample': 0.7815576667070994, 'colsample_bytree': 0.9233677735522392, 'reg_alpha': 0.03544739523462459, 'reg_lambda': 0.47469757725140127}. Best is trial 0 with value: 0.5606485774818724.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.130522 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:33:21,515] Trial 1 finished with value: 0.5708285947652062 and parameters: {'n_estimators': 613, 'learning_rate': 0.010119937501293473, 'num_leaves': 142, 'max_depth': 13, 'min_child_samples': 25, 'subsample': 0.7727776069827235, 'colsample_bytree': 0.9993936099118903, 'reg_alpha': 0.18118786451205504, 'reg_lambda': 0.32601530068657425}. Best is trial 1 with value: 0.5708285947652062.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030587 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:33:43,652] Trial 2 finished with value: 0.5615991323911774 and parameters: {'n_estimators': 1256, 'learning_rate': 0.015245105780278628, 'num_leaves': 149, 'max_depth': 6, 'min_child_samples': 5, 'subsample': 0.6453037251515238, 'colsample_bytree': 0.9826566123884832, 'reg_alpha': 0.9968917396040292, 'reg_lambda': 0.7226351280555013}. Best is trial 1 with value: 0.5708285947652062.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031487 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:34:02,845] Trial 3 finished with value: 0.55215230426567 and parameters: {'n_estimators': 586, 'learning_rate': 0.01907698108748108, 'num_leaves': 101, 'max_depth': 4, 'min_child_samples': 73, 'subsample': 0.8218379023009434, 'colsample_bytree': 0.5884593496371148, 'reg_alpha': 0.11876375557503005, 'reg_lambda': 0.18773301608098403}. Best is trial 1 with value: 0.5708285947652062.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030822 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:34:54,046] Trial 4 finished with value: 0.5638156467501971 and parameters: {'n_estimators': 574, 'learning_rate': 0.06172545771744644, 'num_leaves': 148, 'max_depth': 5, 'min_child_samples': 95, 'subsample': 0.8283436424155326, 'colsample_bytree': 0.7345914733419556, 'reg_alpha': 0.8654071156376416, 'reg_lambda': 0.22752739733658767}. Best is trial 1 with value: 0.5708285947652062.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037708 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:36:09,047] Trial 5 finished with value: 0.5611365506739792 and parameters: {'n_estimators': 220, 'learning_rate': 0.012801550044155247, 'num_leaves': 22, 'max_depth': 7, 'min_child_samples': 18, 'subsample': 0.6899618816279205, 'colsample_bytree': 0.9768168425856516, 'reg_alpha': 0.06512452674246849, 'reg_lambda': 0.755741981797773}. Best is trial 1 with value: 0.5708285947652062.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.130146 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:36:40,593] Trial 6 finished with value: 0.5697405557047437 and parameters: {'n_estimators': 608, 'learning_rate': 0.03095326237295062, 'num_leaves': 120, 'max_depth': 14, 'min_child_samples': 98, 'subsample': 0.9723740182214704, 'colsample_bytree': 0.9177891547877436, 'reg_alpha': 0.5645607016192047, 'reg_lambda': 0.17462419569973786}. Best is trial 1 with value: 0.5708285947652062.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030412 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:37:20,233] Trial 7 finished with value: 0.5600620057410868 and parameters: {'n_estimators': 470, 'learning_rate': 0.011442361962990633, 'num_leaves': 100, 'max_depth': 3, 'min_child_samples': 6, 'subsample': 0.9890579692052897, 'colsample_bytree': 0.832073074623997, 'reg_alpha': 0.3063109933901197, 'reg_lambda': 0.8639201395542101}. Best is trial 1 with value: 0.5708285947652062.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031406 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:37:37,558] Trial 8 finished with value: 0.5508362550186705 and parameters: {'n_estimators': 1919, 'learning_rate': 0.04137693444593425, 'num_leaves': 97, 'max_depth': 4, 'min_child_samples': 38, 'subsample': 0.7813019569089403, 'colsample_bytree': 0.6402557460199261, 'reg_alpha': 0.7432361812003483, 'reg_lambda': 0.3364727523829222}. Best is trial 1 with value: 0.5708285947652062.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031981 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:37:53,293] Trial 9 finished with value: 0.5565302713349217 and parameters: {'n_estimators': 1876, 'learning_rate': 0.04292706824139983, 'num_leaves': 111, 'max_depth': 3, 'min_child_samples': 51, 'subsample': 0.8284292070782225, 'colsample_bytree': 0.9704029358625821, 'reg_alpha': 0.3448577801453415, 'reg_lambda': 0.5009018855729084}. Best is trial 1 with value: 0.5708285947652062.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012321 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:38:18,500] Trial 10 finished with value: 0.5781267851383756 and parameters: {'n_estimators': 924, 'learning_rate': 0.09687431790563442, 'num_leaves': 65, 'max_depth': 15, 'min_child_samples': 28, 'subsample': 0.6997828100101848, 'colsample_bytree': 0.42513478688623474, 'reg_alpha': 0.3305320605086387, 'reg_lambda': 0.516601781053087}. Best is trial 10 with value: 0.5781267851383756.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.108788 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:38:44,144] Trial 11 finished with value: 0.5780828588660789 and parameters: {'n_estimators': 914, 'learning_rate': 0.09915635960817784, 'num_leaves': 62, 'max_depth': 14, 'min_child_samples': 27, 'subsample': 0.7164093069125737, 'colsample_bytree': 0.4028904156372342, 'reg_alpha': 0.31175918404379754, 'reg_lambda': 0.5338153940420849}. Best is trial 10 with value: 0.5781267851383756.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012214 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:39:09,797] Trial 12 finished with value: 0.5789410301951485 and parameters: {'n_estimators': 957, 'learning_rate': 0.09035257446504434, 'num_leaves': 61, 'max_depth': 15, 'min_child_samples': 30, 'subsample': 0.7012974416579587, 'colsample_bytree': 0.40894546500107765, 'reg_alpha': 0.4748034675035624, 'reg_lambda': 0.580884129120939}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.113682 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:39:35,553] Trial 13 finished with value: 0.5782082446554758 and parameters: {'n_estimators': 951, 'learning_rate': 0.08650718493956142, 'num_leaves': 64, 'max_depth': 11, 'min_child_samples': 59, 'subsample': 0.6268801894851949, 'colsample_bytree': 0.41842665474984947, 'reg_alpha': 0.5391482664461406, 'reg_lambda': 0.6347494600787215}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031081 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:41:12,297] Trial 14 finished with value: 0.5650643122217078 and parameters: {'n_estimators': 1555, 'learning_rate': 0.0675578408596712, 'num_leaves': 62, 'max_depth': 11, 'min_child_samples': 64, 'subsample': 0.6116886662132359, 'colsample_bytree': 0.5050845027114972, 'reg_alpha': 0.5551610911648762, 'reg_lambda': 0.016340963868375136}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.130718 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:42:23,689] Trial 15 finished with value: 0.5668778711027129 and parameters: {'n_estimators': 959, 'learning_rate': 0.06815697417841901, 'num_leaves': 45, 'max_depth': 9, 'min_child_samples': 56, 'subsample': 0.6073940440617054, 'colsample_bytree': 0.5038630298433896, 'reg_alpha': 0.6758089966084253, 'reg_lambda': 0.9856339816617755}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.117889 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:42:49,840] Trial 16 finished with value: 0.5782052035731708 and parameters: {'n_estimators': 1411, 'learning_rate': 0.024533145286420664, 'num_leaves': 78, 'max_depth': 11, 'min_child_samples': 45, 'subsample': 0.6537346990554709, 'colsample_bytree': 0.4874097452772259, 'reg_alpha': 0.48279118008636857, 'reg_lambda': 0.6842761140198615}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029263 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:44:24,172] Trial 17 finished with value: 0.5656077065197896 and parameters: {'n_estimators': 813, 'learning_rate': 0.051300470393081894, 'num_leaves': 42, 'max_depth': 8, 'min_child_samples': 78, 'subsample': 0.9051926812648186, 'colsample_bytree': 0.6059493193997275, 'reg_alpha': 0.45402124630591134, 'reg_lambda': 0.655747802592471}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.128565 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:44:46,459] Trial 18 finished with value: 0.5654508638989793 and parameters: {'n_estimators': 1555, 'learning_rate': 0.08257100567313497, 'num_leaves': 80, 'max_depth': 12, 'min_child_samples': 64, 'subsample': 0.7502835291411981, 'colsample_bytree': 0.7004526680003816, 'reg_alpha': 0.6766343665205092, 'reg_lambda': 0.8388721677824922}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019133 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:45:09,100] Trial 19 finished with value: 0.5781225033819892 and parameters: {'n_estimators': 1092, 'learning_rate': 0.05298305094822387, 'num_leaves': 47, 'max_depth': 15, 'min_child_samples': 84, 'subsample': 0.6737137161925263, 'colsample_bytree': 0.45034942302295355, 'reg_alpha': 0.40723204717074263, 'reg_lambda': 0.5957301669529974}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.126347 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:45:31,880] Trial 20 finished with value: 0.5613511873361365 and parameters: {'n_estimators': 310, 'learning_rate': 0.03223595823440807, 'num_leaves': 71, 'max_depth': 10, 'min_child_samples': 43, 'subsample': 0.7302152057060989, 'colsample_bytree': 0.5552397412219747, 'reg_alpha': 0.19665172623571991, 'reg_lambda': 0.38957368377446494}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011643 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:45:57,402] Trial 21 finished with value: 0.5781733321093958 and parameters: {'n_estimators': 1471, 'learning_rate': 0.023304373375872406, 'num_leaves': 81, 'max_depth': 12, 'min_child_samples': 50, 'subsample': 0.643681746501295, 'colsample_bytree': 0.47481644576852444, 'reg_alpha': 0.5207960581944632, 'reg_lambda': 0.6488667950403271}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.127235 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:46:18,124] Trial 22 finished with value: 0.557763842834386 and parameters: {'n_estimators': 1314, 'learning_rate': 0.02544601193150744, 'num_leaves': 52, 'max_depth': 9, 'min_child_samples': 43, 'subsample': 0.6627380718072275, 'colsample_bytree': 0.5459799592381118, 'reg_alpha': 0.6317229872027623, 'reg_lambda': 0.7693137458646401}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012447 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:46:44,186] Trial 23 finished with value: 0.578043569124781 and parameters: {'n_estimators': 1726, 'learning_rate': 0.08165689945172121, 'num_leaves': 87, 'max_depth': 13, 'min_child_samples': 62, 'subsample': 0.600013647171038, 'colsample_bytree': 0.40579860920211547, 'reg_alpha': 0.4503522023827671, 'reg_lambda': 0.610906331644407}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.110724 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:47:11,358] Trial 24 finished with value: 0.5782144233224403 and parameters: {'n_estimators': 1067, 'learning_rate': 0.018973059957114496, 'num_leaves': 76, 'max_depth': 10, 'min_child_samples': 15, 'subsample': 0.6359854851880032, 'colsample_bytree': 0.45834550035086896, 'reg_alpha': 0.7826989903487098, 'reg_lambda': 0.8873865410525643}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012729 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:47:36,714] Trial 25 finished with value: 0.5782037742048073 and parameters: {'n_estimators': 1087, 'learning_rate': 0.08173710589586793, 'num_leaves': 56, 'max_depth': 10, 'min_child_samples': 17, 'subsample': 0.635563476476197, 'colsample_bytree': 0.4552307499243894, 'reg_alpha': 0.7836967637384498, 'reg_lambda': 0.9921628560966806}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029467 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:47:58,317] Trial 26 finished with value: 0.5554463277128462 and parameters: {'n_estimators': 751, 'learning_rate': 0.018030670168346245, 'num_leaves': 39, 'max_depth': 8, 'min_child_samples': 14, 'subsample': 0.6897800285084664, 'colsample_bytree': 0.5342729630554042, 'reg_alpha': 0.931167566898202, 'reg_lambda': 0.8766395222688899}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031094 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:48:20,620] Trial 27 finished with value: 0.5688634576793724 and parameters: {'n_estimators': 783, 'learning_rate': 0.037196799325750485, 'num_leaves': 71, 'max_depth': 13, 'min_child_samples': 35, 'subsample': 0.7418927709553869, 'colsample_bytree': 0.782160788398872, 'reg_alpha': 0.8226313256301007, 'reg_lambda': 0.9157139814116717}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.128896 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:48:44,508] Trial 28 finished with value: 0.560250131972181 and parameters: {'n_estimators': 1089, 'learning_rate': 0.05556214390013588, 'num_leaves': 89, 'max_depth': 10, 'min_child_samples': 12, 'subsample': 0.6310854053214295, 'colsample_bytree': 0.6461623900413855, 'reg_alpha': 0.6371461565601958, 'reg_lambda': 0.7986143107043973}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012310 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:49:03,751] Trial 29 finished with value: 0.5781482797925506 and parameters: {'n_estimators': 1235, 'learning_rate': 0.017540154203833282, 'num_leaves': 32, 'max_depth': 12, 'min_child_samples': 30, 'subsample': 0.7067522763631156, 'colsample_bytree': 0.43946916801357777, 'reg_alpha': 0.7275716299106498, 'reg_lambda': 0.42519498700187275}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.117034 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:49:26,640] Trial 30 finished with value: 0.5783008451705465 and parameters: {'n_estimators': 967, 'learning_rate': 0.021811549352438224, 'num_leaves': 55, 'max_depth': 8, 'min_child_samples': 22, 'subsample': 0.8669669465343977, 'colsample_bytree': 0.4013989200007565, 'reg_alpha': 0.5891641314772619, 'reg_lambda': 0.5413879305711692}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012142 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:49:48,301] Trial 31 finished with value: 0.5783028623992296 and parameters: {'n_estimators': 956, 'learning_rate': 0.014865910372783085, 'num_leaves': 55, 'max_depth': 8, 'min_child_samples': 21, 'subsample': 0.8745304347281883, 'colsample_bytree': 0.4129703702578582, 'reg_alpha': 0.6044327463096592, 'reg_lambda': 0.5385536718147418}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.109678 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:50:12,147] Trial 32 finished with value: 0.5780835795285243 and parameters: {'n_estimators': 1156, 'learning_rate': 0.015202879208700558, 'num_leaves': 56, 'max_depth': 7, 'min_child_samples': 22, 'subsample': 0.8762974004831988, 'colsample_bytree': 0.4667298457985667, 'reg_alpha': 0.6088026090524534, 'reg_lambda': 0.48308855930426586}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031054 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:50:32,434] Trial 33 finished with value: 0.5558574542345482 and parameters: {'n_estimators': 1023, 'learning_rate': 0.020902385108124987, 'num_leaves': 34, 'max_depth': 7, 'min_child_samples': 21, 'subsample': 0.8765708094731933, 'colsample_bytree': 0.5182101486906907, 'reg_alpha': 0.39124304629426954, 'reg_lambda': 0.41750170043155893}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.124033 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:50:53,019] Trial 34 finished with value: 0.5613652499074724 and parameters: {'n_estimators': 836, 'learning_rate': 0.015191256385597417, 'num_leaves': 71, 'max_depth': 6, 'min_child_samples': 10, 'subsample': 0.92672050862986, 'colsample_bytree': 0.5775927299259882, 'reg_alpha': 0.901497666613479, 'reg_lambda': 0.561672594377846}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011563 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:51:14,243] Trial 35 finished with value: 0.5783147502311933 and parameters: {'n_estimators': 1344, 'learning_rate': 0.02933726874848027, 'num_leaves': 49, 'max_depth': 8, 'min_child_samples': 33, 'subsample': 0.8573309855580636, 'colsample_bytree': 0.44736102707037606, 'reg_alpha': 0.713882509233937, 'reg_lambda': 0.28975165314476653}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.140132 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:51:35,243] Trial 36 finished with value: 0.5779690480638193 and parameters: {'n_estimators': 711, 'learning_rate': 0.02727685080381675, 'num_leaves': 23, 'max_depth': 8, 'min_child_samples': 33, 'subsample': 0.8050635349574516, 'colsample_bytree': 0.40961128969236094, 'reg_alpha': 0.7077629899506641, 'reg_lambda': 0.2335447950652214}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013155 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:51:56,316] Trial 37 finished with value: 0.5779969375055075 and parameters: {'n_estimators': 1242, 'learning_rate': 0.02113724861864472, 'num_leaves': 51, 'max_depth': 6, 'min_child_samples': 23, 'subsample': 0.8546106112463654, 'colsample_bytree': 0.4931368149386077, 'reg_alpha': 0.5910101305847074, 'reg_lambda': 0.33508541029235406}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:52:15,197] Trial 38 finished with value: 0.5777235434275148 and parameters: {'n_estimators': 1328, 'learning_rate': 0.012696373139892053, 'num_leaves': 39, 'max_depth': 5, 'min_child_samples': 33, 'subsample': 0.9287346718675895, 'colsample_bytree': 0.40038339087812425, 'reg_alpha': 0.25283404902185486, 'reg_lambda': 0.26215744675231034}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032037 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:52:36,350] Trial 39 finished with value: 0.5603540165692104 and parameters: {'n_estimators': 436, 'learning_rate': 0.010830718300924755, 'num_leaves': 56, 'max_depth': 9, 'min_child_samples': 38, 'subsample': 0.8393145066827911, 'colsample_bytree': 0.8535268311362016, 'reg_alpha': 0.48975932301797154, 'reg_lambda': 0.12323004206062499}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.120976 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:52:56,623] Trial 40 finished with value: 0.5781108110027161 and parameters: {'n_estimators': 652, 'learning_rate': 0.03047559887680618, 'num_leaves': 25, 'max_depth': 6, 'min_child_samples': 6, 'subsample': 0.799011327332592, 'colsample_bytree': 0.44016513971895344, 'reg_alpha': 0.6620690199524276, 'reg_lambda': 0.4441959290079066}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011981 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:53:19,055] Trial 41 finished with value: 0.5782099078350637 and parameters: {'n_estimators': 1159, 'learning_rate': 0.02069962190112737, 'num_leaves': 73, 'max_depth': 8, 'min_child_samples': 20, 'subsample': 0.8780287799599515, 'colsample_bytree': 0.4589348731361237, 'reg_alpha': 0.78190146373411, 'reg_lambda': 0.7159635623386258}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.111307 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:53:46,316] Trial 42 finished with value: 0.5780348262145921 and parameters: {'n_estimators': 1025, 'learning_rate': 0.016258965492767872, 'num_leaves': 138, 'max_depth': 9, 'min_child_samples': 25, 'subsample': 0.9066270569866185, 'colsample_bytree': 0.47917110360371545, 'reg_alpha': 0.8194739088823882, 'reg_lambda': 0.5791014940313891}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:54:09,082] Trial 43 finished with value: 0.5780777969394534 and parameters: {'n_estimators': 870, 'learning_rate': 0.028733492262067267, 'num_leaves': 93, 'max_depth': 7, 'min_child_samples': 15, 'subsample': 0.8504893995461952, 'colsample_bytree': 0.43574038999580933, 'reg_alpha': 0.9911990839587658, 'reg_lambda': 0.28883184773910403}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013050 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:54:29,362] Trial 44 finished with value: 0.5777211850519971 and parameters: {'n_estimators': 1326, 'learning_rate': 0.012634109246610124, 'num_leaves': 60, 'max_depth': 5, 'min_child_samples': 29, 'subsample': 0.9592274523875223, 'colsample_bytree': 0.43177991595744614, 'reg_alpha': 0.5852306805001906, 'reg_lambda': 0.5381207479661809}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031177 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:54:50,504] Trial 45 finished with value: 0.559266096705288 and parameters: {'n_estimators': 1010, 'learning_rate': 0.03351048479948853, 'num_leaves': 51, 'max_depth': 8, 'min_child_samples': 10, 'subsample': 0.7620623217328627, 'colsample_bytree': 0.5244802920941046, 'reg_alpha': 0.7448933113834484, 'reg_lambda': 0.39125866452828983}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029233 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:55:22,291] Trial 46 finished with value: 0.5651967780319128 and parameters: {'n_estimators': 1174, 'learning_rate': 0.019390123286124498, 'num_leaves': 67, 'max_depth': 14, 'min_child_samples': 18, 'subsample': 0.810244724931339, 'colsample_bytree': 0.6149753026339297, 'reg_alpha': 0.7055542295547712, 'reg_lambda': 0.09509357408974448}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.100226 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:55:45,520] Trial 47 finished with value: 0.5780649041941405 and parameters: {'n_estimators': 530, 'learning_rate': 0.02303792582899617, 'num_leaves': 111, 'max_depth': 7, 'min_child_samples': 26, 'subsample': 0.8996878371950288, 'colsample_bytree': 0.4687092711082855, 'reg_alpha': 0.5343806706094251, 'reg_lambda': 0.4717734523150097}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.111683 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:56:09,364] Trial 48 finished with value: 0.5781139969768282 and parameters: {'n_estimators': 909, 'learning_rate': 0.014115729003203068, 'num_leaves': 46, 'max_depth': 10, 'min_child_samples': 37, 'subsample': 0.7831655205577044, 'colsample_bytree': 0.4000969809406573, 'reg_alpha': 0.8482081093586721, 'reg_lambda': 0.7128203344601364}. Best is trial 12 with value: 0.5789410301951485.


[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.124377 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501


[I 2025-05-31 04:56:27,345] Trial 49 finished with value: 0.5508362550186705 and parameters: {'n_estimators': 1435, 'learning_rate': 0.04137547738297335, 'num_leaves': 76, 'max_depth': 4, 'min_child_samples': 30, 'subsample': 0.8490645266533715, 'colsample_bytree': 0.5029691709752964, 'reg_alpha': 0.7654531168494275, 'reg_lambda': 0.3712398192535}. Best is trial 12 with value: 0.5789410301951485.



Optuna search finished.
Best trial AUC: 0.5789410301951485
Best hyperparameters found:
{'n_estimators': 957, 'learning_rate': 0.09035257446504434, 'num_leaves': 61, 'max_depth': 15, 'min_child_samples': 30, 'subsample': 0.7012974416579587, 'colsample_bytree': 0.40894546500107765, 'reg_alpha': 0.4748034675035624, 'reg_lambda': 0.580884129120939}
[LightGBM] [Info] Number of positive: 236344, number of negative: 5607100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014004 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 5843444, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040446 -> initscore=-3.166501
[LightGBM] [Info] Start training from score -3.166501
Training until validation scores don't improve for 100 rounds
Early stopping, best iteratio

In [13]:
# Define path where artifacts were saved
ARTIFACTS_PATH = '/kaggle/working/model_assets/'

# Load TF-IDF vectorizer
with open(os.path.join(ARTIFACTS_PATH, 'tfidf_vectorizer.pkl'), 'rb') as f:
    tfidf_vectorizer = pickle.load(f)

# Load NewsID-to-TF-IDF-index mapping
with open(os.path.join(ARTIFACTS_PATH, 'newsid_to_tfidf_idx.pkl'), 'rb') as f:
    newsid_to_tfidf_idx = pickle.load(f)

# Load all_news_tfidf_matrix (or a way to reconstruct it/access news TF-IDF vectors)
# For inference, you might just need a way to get a vector for a news_id.
# The full matrix might be large. If all_news_for_tfidf_df was saved, you could re-transform.
# Or, more practically, news content details would be in a database.
# For this example, let's assume we have a way to get TF-IDF for new news.
# And let's assume all_news_tfidf_matrix and newsid_to_tfidf_idx cover potential history/candidate news.
with open(os.path.join(ARTIFACTS_PATH, 'all_news_tfidf_matrix.pkl'), 'rb') as f:
    all_news_tfidf_matrix = pickle.load(f) # Assuming this was saved from Cell 7

# Load Label Encoders
with open(os.path.join(ARTIFACTS_PATH, 'user_encoder.pkl'), 'rb') as f:
    user_encoder = pickle.load(f)
with open(os.path.join(ARTIFACTS_PATH, 'news_encoder.pkl'), 'rb') as f:
    news_encoder = pickle.load(f)

# Load the trained LightGBM model
with open(os.path.join(ARTIFACTS_PATH, 'lgbm_news_recommender_best_params.pkl'), 'rb') as f:
    lgbm_model = pickle.load(f)

print("All model artifacts loaded successfully.")

# --- Prepare Sample New Data for Inference ---
# This data needs to be in the same format as one row of `train_merged_df` before feature selection
# Specifically, it needs: 'UserID', 'ImpressionNewsID', 'History' (list of NewsIDs)
# And the news articles (ImpressionNewsID and those in History) need to have their content available
# (e.g., 'Title', 'Abstract') if TF-IDF vectors need to be computed for *new* news.

# For simplicity, let's assume we have a user and a candidate news article,
# and their NewsIDs are known and present in our `newsid_to_tfidf_idx`.

sample_inference_data = pd.DataFrame([{
    'UserID': 'U13009', # An existing user from the dataset for demo
    'ImpressionNewsID': 'N21256', # An existing news item for demo
    'History': ['N7074', 'N43029', 'N10307'] # Sample history list
    # If news content for ImpressionNewsID or History items is *new* and not in all_news_tfidf_matrix,
    # you'd need their 'Title' and 'Abstract' to generate TF-IDF vectors using the loaded tfidf_vectorizer.
}])

# --- Feature Engineering for the Sample Data (mimicking Cell 8 memory-optimized version) ---
num_tfidf_features_inf = all_news_tfidf_matrix.shape[1] # From loaded matrix

def get_dense_tfidf_vector_inf(news_id, nid_to_idx_map, matrix, num_features):
    if news_id in nid_to_idx_map:
        idx = nid_to_idx_map[news_id]
        return np.asarray(matrix[idx].todense(), dtype=np.float32).flatten()
    # Handle truly new news: if news_id is new, you'd fetch its text,
    # transform with tfidf_vectorizer, then convert to dense float32 array.
    # For this example, assume all IDs are known from training.
    print(f"Warning: NewsID {news_id} not found in known TF-IDF map during inference. Returning zero vector.")
    return np.zeros(num_features, dtype=np.float32)

# Calculate cosine similarity for the sample
history_list_inf = sample_inference_data.iloc[0]['History']
impression_news_id_inf = sample_inference_data.iloc[0]['ImpressionNewsID']

history_item_vectors_inf = []
if isinstance(history_list_inf, list) and len(history_list_inf) > 0:
    for nid in history_list_inf:
        vec = get_dense_tfidf_vector_inf(nid, newsid_to_tfidf_idx, all_news_tfidf_matrix, num_tfidf_features_inf)
        if np.any(vec): history_item_vectors_inf.append(vec)

if history_item_vectors_inf:
    user_profile_vec_inf = np.mean(np.array(history_item_vectors_inf, dtype=np.float32), axis=0)
else:
    user_profile_vec_inf = np.zeros(num_tfidf_features_inf, dtype=np.float32)

impression_vec_inf = get_dense_tfidf_vector_inf(impression_news_id_inf, newsid_to_tfidf_idx, all_news_tfidf_matrix, num_tfidf_features_inf)

sim = 0.0
if not (np.all(user_profile_vec_inf == 0) or np.all(impression_vec_inf == 0)):
    # Need to import cosine_similarity from sklearn.metrics.pairwise
    from sklearn.metrics.pairwise import cosine_similarity
    sim = cosine_similarity(user_profile_vec_inf.reshape(1, -1), impression_vec_inf.reshape(1, -1))[0,0]
sample_inference_data['HistoryImpressionCosineSimilarity'] = [sim]

# Encode UserID and ImpressionNewsID
# Handle unknown labels: if an ID wasn't seen during training, encoder will fail.
# Strategy: map unknown to a special category or ignore. For simplicity, assume known.
def safe_transform(encoder, value, unknown_value=-1): # -1 or some other placeholder
    try:
        return encoder.transform([str(value)])[0]
    except ValueError: # Or more specific: NotFittedError or if value not in classes_
        print(f"Warning: Value '{value}' not seen by encoder {encoder}. Assigning {unknown_value}.")
        # Check if unknown_value is a valid encoded value or if the model can handle it.
        # A robust way is to add an "unknown" category to encoders during training.
        return unknown_value # This might break model if it expects specific range

sample_inference_data['UserID_Encoded'] = safe_transform(user_encoder, sample_inference_data.iloc[0]['UserID'])
sample_inference_data['ImpressionNewsID_Encoded'] = safe_transform(news_encoder, sample_inference_data.iloc[0]['ImpressionNewsID'])

# --- Make Prediction ---
inference_features = ['UserID_Encoded', 'ImpressionNewsID_Encoded', 'HistoryImpressionCosineSimilarity']
X_inference = sample_inference_data[inference_features]

predicted_click_probability = lgbm_model.predict_proba(X_inference)[:, 1]
predicted_click_label = lgbm_model.predict(X_inference)

print(f"\nFor sample data: {sample_inference_data.iloc[0].to_dict()}")
print(f"Predicted Click Probability: {predicted_click_probability[0]:.4f}")
print(f"Predicted Click Label: {predicted_click_label[0]}")

All model artifacts loaded successfully.

For sample data: {'UserID': 'U13009', 'ImpressionNewsID': 'N21256', 'History': ['N7074', 'N43029', 'N10307'], 'HistoryImpressionCosineSimilarity': 0.01300215907394886, 'UserID_Encoded': 3346, 'ImpressionNewsID_Encoded': -1}
Predicted Click Probability: 0.0274
Predicted Click Label: 0
