In [1]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from decimal import *
import math 
from statistics import mean
import pickle
from itertools import chain

# Loading models
# NOTE(security): pickle.load can execute code embedded in the file, so the
# two .sav files must come from a trusted source.
def _load_pickled_model(path):
    """Return the object unpickled from the binary file at ``path``."""
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


print("Loading the models...")
model_phase1 = _load_pickled_model('model_phase1.sav')
model_phase2 = _load_pickled_model('model_phase2.sav')

Loading the models...


In [2]:
# Loading datasets
print("Loading datasets...")

# Reads for the official test set, currently disabled (see the note below):
#test_behaviors = pd.read_parquet('Dataset/ebnerd_testset/test/behaviors.parquet')
#test_history = pd.read_parquet('Dataset/ebnerd_testset/test/history.parquet')
#articles = pd.read_parquet('Dataset/ebnerd_testset/articles.parquet')

#test_behaviors.drop(['is_beyond_accuracy'], axis=1, inplace=True) 

##  We used the small dataset because we couldn't run the test dataset.
train_behaviors, val_behaviors = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('ebnerd_small/train/behaviors.parquet',
                         'ebnerd_small/validation/behaviors.parquet')
)
train_history, val_history = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('ebnerd_small/train/history.parquet',
                         'ebnerd_small/validation/history.parquet')
)
articles = pd.read_parquet('ebnerd_small/articles.parquet')

# The train and validation splits are stacked and used in place of the test
# inputs; the original row labels of both splits are kept as they are.
test_behaviors = pd.concat(objs=[train_behaviors, val_behaviors])
test_history = pd.concat(objs=[train_history, val_history])
####################

Loading datasets...


In [3]:
# Convert the history table to long format: every element of the list-valued
# "*_fixed" columns becomes its own row, and the "_fixed" suffix is removed
# from the column names afterwards.
history_list_columns = ['impression_time_fixed', 'scroll_percentage_fixed', 'article_id_fixed', 'read_time_fixed']
test_history_expanded = (
    test_history
    .explode(history_list_columns)
    .rename(columns={name: name[:-len('_fixed')] for name in history_list_columns})
)

# Turn off pandas' chained-assignment warning for the rest of the notebook.
pd.options.mode.chained_assignment = None  # default='warn' 

print("Datatypes structuring... ")


Datatypes structuring... 


In [4]:
# Fill missing values and collapse the list-valued columns to one float each.
#
# The columns are rebuilt with Series.apply and assigned back in one step.
# Writing element by element through articles[col][i] = ... is chained
# assignment: it only reaches `articles` while pandas hands back a view, and
# under Copy-on-Write it updates a temporary object and leaves the frame
# unchanged. It also required the index to be exactly 0..n-1.
#
# statistics.mean is kept on purpose so the produced values stay the same as
# before this rewrite.
def _mean_subcategory(values):
    """Return the mean of one article's subcategory ids; an empty list counts as [-1]."""
    if len(values) == 0:
        values = [-1]
    return float(mean(values))  # Reduce size by averaging the list


def _mean_image_id(values):
    """Return the mean of one article's image ids; a non-ndarray cell counts as [0]."""
    if type(values) is not np.ndarray:
        values = [0]  # It means there is no picture in the article
    return float(mean(values))  # Reduce size by averaging the list


articles['subcategory'] = articles['subcategory'].apply(_mean_subcategory)
articles['image_ids'] = articles['image_ids'].apply(_mean_image_id)


In [5]:
import numpy as np
import hashlib
import math
from decimal import Decimal

def string_to_float32(s):
    """Map a string to a deterministic float32 between 0 and 1.

    The string is encoded as ASCII (non-ASCII characters become backslash
    escapes), hashed with SHA-256, and the digest, read as an unsigned
    integer, is divided by the largest value a digest of that size can hold.

    Args:
        s: the text to hash.

    Returns:
        numpy.float32 holding the normalised hash value.
    """
    digest = hashlib.sha256(s.encode('ascii', errors='backslashreplace')).digest()

    # Big-endian integer value of the digest bytes (same number as parsing the
    # hexadecimal digest).
    digest_as_int = int.from_bytes(digest, 'big')

    # All bits set for a digest of this length, i.e. the hex string 'ff...f'.
    largest_digest = (1 << (8 * len(digest))) - 1

    return np.float32(digest_as_int / largest_digest)

# Collapse each list-of-strings column into one hash-derived float per article.
for list_column in ('ner_clusters', 'entity_groups', 'topics'):
    hashed_values = [
        # Every entry is prefixed with "/" before hashing, e.g. ["a", "b"] -> "/a/b".
        string_to_float32("".join("/" + token for token in tokens))
        for tokens in articles[list_column]
    ]
    # Stored as an object column of numpy.float32 scalars, aligned on the
    # existing row labels.
    articles[list_column] = pd.Series(hashed_values, index=articles[list_column].index, dtype=object)


In [6]:
import numpy as np
import hashlib
import math
from decimal import Decimal
from sklearn.preprocessing import LabelEncoder

def string_to_float32(s):
    """Map a string to a deterministic float32 between 0 and 1.

    This cell repeats the definition given in the preceding cell; both compute
    the same value. The string is encoded as ASCII (non-ASCII characters become
    backslash escapes), hashed with SHA-256, and the digest, read as an
    unsigned integer, is divided by the largest value a digest of that size
    can hold.

    Args:
        s: the text to hash.

    Returns:
        numpy.float32 holding the normalised hash value.
    """
    digest = hashlib.sha256(s.encode('ascii', errors='backslashreplace')).digest()

    # Big-endian integer value of the digest bytes (same number as parsing the
    # hexadecimal digest).
    digest_as_int = int.from_bytes(digest, 'big')

    # All bits set for a digest of this length, i.e. the hex string 'ff...f'.
    largest_digest = (1 << (8 * len(digest))) - 1

    return np.float32(digest_as_int / largest_digest)

# We can enclose conversion operations in a function
def encode_categorical(data, label_encoders=None):
    """Label-encode the 'device_type' and 'gender' columns of ``data``.

    Both columns are converted to strings first and then replaced, in place,
    by the integer codes produced by the encoder.

    Args:
        data: DataFrame containing both columns; it is modified and returned.
        label_encoders: optional dict mapping column name -> encoder. An
            encoder that is already fitted (it has a ``classes_`` attribute) is
            applied with ``transform`` so a second frame receives the same
            codes as the frame the encoder was fitted on. A missing or unfitted
            encoder is fitted on ``data``.

    Returns:
        Tuple ``(data, label_encoders)``; the dict holds the encoder used for
        each of the two columns.

    Raises:
        ValueError: propagated from ``LabelEncoder.transform`` when ``data``
            contains a value the reused encoder was not fitted on.
    """
    if label_encoders is None:
        label_encoders = {}
    for column in ['device_type', 'gender']:
        values = data[column].astype(str)
        le = label_encoders.get(column)
        if le is None:
            # Only build a new encoder when none was supplied for this column.
            le = LabelEncoder()
        if hasattr(le, 'classes_'):
            # Already fitted: re-fitting here would silently change the
            # value -> code mapping between frames.
            data[column] = le.transform(values)
        else:
            data[column] = le.fit_transform(values)
        label_encoders[column] = le
    return data, label_encoders

# We can also convert the 'title' and 'subtitle' columns into the function
def convert_to_float32(articles):
    """Replace the 'title' and 'subtitle' text with hash-derived float32 values.

    Each value is converted to ``str`` and passed through ``string_to_float32``.
    The frame is updated in place and also returned.
    """
    text_columns = ('title', 'subtitle')
    for text_column in text_columns:
        as_text = articles[text_column].astype(str)
        articles[text_column] = as_text.apply(string_to_float32)
    return articles


# Hash the article 'title' and 'subtitle' text into floats.
articles = convert_to_float32(articles)

# Encode the train split; no encoders exist yet, so a new dict is created.
train_behaviors, label_encoders = encode_categorical(train_behaviors, label_encoders=None)

# Pass the same encoder dict to the validation split.
# NOTE(review): test_behaviors was concatenated from these two frames in an
# earlier cell, so it presumably does not receive this encoding - confirm that
# this is intended.
val_behaviors, _ = encode_categorical(val_behaviors, label_encoders=label_encoders)


In [7]:

# Encoding categorical variables for the Articles dataset.
# A fresh encoder is fitted per column on the articles loaded here and is kept
# in label_encoders under the column name.
for article_column in ('article_type', 'category_str', 'sentiment_label'):
    article_encoder = LabelEncoder()
    column_as_text = articles[article_column].astype(str)
    articles[article_column] = article_encoder.fit_transform(column_as_text)
    label_encoders[article_column] = article_encoder

In [8]:
# Normalize numeric variables
# Every column is standardised on its own: the single scaler object is
# re-fitted for each column in turn, so after this cell it holds the
# statistics of the last column in the list.
scaler = StandardScaler()
columns_to_scale = [
    'sentiment_score',  # was passed without a reshape because its values are already small; the reshape below is a no-op on one column
    'total_read_time',
    'total_pageviews',
    'total_inviews',
    'title',
    'subtitle',
    'ner_clusters',
    'entity_groups',
    'topics',
    'subcategory',
    'category',
    'image_ids',  # There may be more values that could be normalized
]
for scaled_column in columns_to_scale:
    column_values = articles[[scaled_column]].astype(float).values.reshape(-1, 1)
    articles[[scaled_column]] = scaler.fit_transform(column_values)


In [9]:
# Handling date and time variables
def _add_hour_and_day(frame, time_column, hour_column, day_column):
    """Parse ``time_column`` as datetimes and add its hour and day-of-month columns to ``frame``."""
    frame[time_column] = pd.to_datetime(frame[time_column])  # Only hours and days are taken from dates
    frame[hour_column] = frame[time_column].dt.hour
    frame[day_column] = frame[time_column].dt.day


_add_hour_and_day(test_behaviors, 'impression_time', 'impression_hour', 'impression_day')
_add_hour_and_day(test_history_expanded, 'impression_time', 'impression_hour', 'impression_day')
_add_hour_and_day(articles, 'last_modified_time', 'last_modified_hour', 'last_modified_day')
_add_hour_and_day(articles, 'published_time', 'published_time_hour', 'published_time_day')


# Delete unnecessary columns.
# 'impression_id' and 'article_id' are deliberately kept here: later cells
# still read them.
test_behaviors.drop(
    ['impression_time', 'next_read_time', 'next_scroll_percentage', 'session_id'],
    axis=1,
    inplace=True,
)
#####################################################################


In [10]:
# Drop the raw timestamp columns (their hour/day parts were extracted in the
# previous cell) together with the article 'body' and 'url'.
test_history_expanded = test_history_expanded.drop(columns=['impression_time'])
articles = articles.drop(columns=['last_modified_time', 'published_time', 'body', 'url'])

# check the column names
print("-------------------------------------------") 
for frame_label, frame in (
    ("Test History Expanded columns:", test_history_expanded),
    ("Test Behaviors Expanded columns:", test_behaviors),
    ("Articles columns:", articles),
):
    print(frame_label, frame.columns)

-------------------------------------------
Test History Expanded columns: Index(['user_id', 'scroll_percentage', 'article_id', 'read_time',
       'impression_hour', 'impression_day'],
      dtype='object')
Test Behaviors Expanded columns: Index(['impression_id', 'article_id', 'read_time', 'scroll_percentage',
       'device_type', 'article_ids_inview', 'article_ids_clicked', 'user_id',
       'is_sso_user', 'gender', 'postcode', 'age', 'is_subscriber',
       'impression_hour', 'impression_day'],
      dtype='object')
Articles columns: Index(['article_id', 'title', 'subtitle', 'premium', 'image_ids',
       'article_type', 'ner_clusters', 'entity_groups', 'topics', 'category',
       'subcategory', 'category_str', 'total_inviews', 'total_pageviews',
       'total_read_time', 'sentiment_score', 'sentiment_label',
       'last_modified_hour', 'last_modified_day', 'published_time_hour',
       'published_time_day'],
      dtype='object')


In [11]:

print("Dataset Merging... ")    
# First explode: one behaviors row per article that was in view.
# Second explode: unpack the clicked-article list into a scalar column.
# NOTE(review): an impression whose clicked list has several ids yields one
# copy of every in-view row per clicked id - confirm this is wanted.
test_behaviors_expanded = (
    test_behaviors
    .explode(['article_ids_inview'])
    .astype({'article_ids_inview': int})
    .explode(['article_ids_clicked'])
    .astype({'article_ids_clicked': int})
)

# Same integer type on both sides so the ids can be compared and joined.
articles = articles.astype({'article_id': int})

# The join key must be called 'article_id' in both tables, so the existing
# 'article_id' column of behaviors is overwritten with the in-view article id.
test_behaviors_expanded["article_id"] = test_behaviors_expanded["article_ids_inview"]
test_behaviors_expanded = test_behaviors_expanded.drop(columns=['article_ids_inview'])

# Attach the article features to behaviors and to history via article_id.
test_behaviors_final = pd.merge(test_behaviors_expanded, articles, how='left', on='article_id')
test_history_final = pd.merge(test_history_expanded, articles, how='left', on='article_id')
X_history_final = test_history_final  # same object, not a copy

# Keep the identifiers of every merged behaviors row; reset_index() also adds
# the row position as an 'index' column.
final_ids = test_behaviors_final[['impression_id', 'article_id']].reset_index()

Dataset Merging... 


In [12]:
############### This part will not be available when using with the test set
# Check whether the article shown in this row is the article that was clicked.
shown_is_clicked = test_behaviors_final['article_id'] == test_behaviors_final['article_ids_clicked']
test_behaviors_final['clicked'] = shown_is_clicked.to_numpy()
test_behaviors_final = test_behaviors_final.drop(columns=['article_ids_clicked'])
######################################33


In [13]:
# The identifier columns were copied into final_ids in an earlier cell.
test_behaviors_final.drop(['impression_id', 'article_id'], axis=1, inplace=True)

# In place on purpose: X_history_final is the same object as test_history_final.
X_history_final.drop(['article_id'], axis=1, inplace=True)

# Separating X and Y values for stage one: the user/device profile columns are
# the targets, everything else is the input.
profile_columns = ['device_type','is_sso_user','gender','postcode','age','is_subscriber']
X_test_phase1 = test_behaviors_final.drop(profile_columns, axis=1)
Y_test_phase1 = test_behaviors_final.filter(profile_columns)

# The History table must be in the same order as the Behavior table, so the
# columns are selected in exactly this order.
history_feature_order = [
    'read_time',
    'scroll_percentage',
    'user_id',
    'impression_hour',
    'impression_day',
    'title',
    'subtitle',
    'premium',
    'image_ids',
    'article_type',
    'ner_clusters',
    'entity_groups',
    'topics',
    'category',
    'subcategory',
    'category_str',
    'total_inviews',
    'total_pageviews',
    'total_read_time',
    'sentiment_score',
    'sentiment_label',
    'last_modified_hour',
    'last_modified_day',
    'published_time_hour',
    'published_time_day',
]
X_history_rearranged = X_history_final[history_feature_order].copy()
# Every history row is marked as clicked.
X_history_rearranged['clicked'] = True

X_history_final = X_history_rearranged

In [14]:
# Populating NaN values
#
# Each column is re-assigned from the result of fillna(). Calling
# fillna(inplace=True) on a column selected from the frame acts on an
# intermediate object: pandas 2.x warns about it, and under Copy-on-Write the
# frame itself is left unchanged (the later astype(int) would then fail on NaN).
not_clicked = X_test_phase1["clicked"] == False
X_test_phase1.loc[not_clicked, ["read_time", "scroll_percentage"]] = 0 #Reset reading times for unclicked articles

for filled_column in ['scroll_percentage', 'total_inviews', 'total_pageviews', 'total_read_time']:
    X_test_phase1[filled_column] = X_test_phase1[filled_column].fillna(value=0)

for filled_column in ['scroll_percentage', 'total_inviews', 'total_pageviews', 'total_read_time', 'read_time']:
    X_history_final[filled_column] = X_history_final[filled_column].fillna(value=0)

# Missing age/postcode become 0 so both columns can be stored as integers.
for filled_column in ['age', 'postcode']:
    Y_test_phase1[filled_column] = Y_test_phase1[filled_column].fillna(value=0).astype(int)

print("-------------------------------------------------") 

# Debug output: prints every (column name, Series) pair of the phase-1 inputs.
for column in X_test_phase1.items():
    print(column)


-------------------------------------------------
('read_time', 0           0.0
1           0.0
2           0.0
3          13.0
4           0.0
           ... 
5567412     0.0
5567413     0.0
5567414     0.0
5567415    18.0
5567416     0.0
Name: read_time, Length: 5567417, dtype: float32)
('scroll_percentage', 0          0.0
1          0.0
2          0.0
3          0.0
4          0.0
          ... 
5567412    0.0
5567413    0.0
5567414    0.0
5567415    0.0
5567416    0.0
Name: scroll_percentage, Length: 5567417, dtype: float32)
('user_id', 0          139836
1          139836
2          139836
3          139836
4          139836
            ...  
5567412    752762
5567413    752762
5567414    752762
5567415    752762
5567416    752762
Name: user_id, Length: 5567417, dtype: uint32)
('impression_hour', 0           7
1           7
2           7
3           7
4           7
           ..
5567412    19
5567413    19
5567414    19
5567415    19
5567416    19
Name: impression_hour, Length: 556

In [15]:
print("-------------------------------------------------")   
# Sanity check: count infinite values in the hash-derived columns of both
# feature tables before they are passed to the model.
hashed_columns = ['title', 'subtitle', 'ner_clusters', 'entity_groups', 'topics']
for frame_label, checked_frame in (("Behavior", X_test_phase1), ("History", X_history_final)):
    for hashed_column in hashed_columns:
        c = np.isinf(checked_frame[hashed_column]).values.sum()
        print(frame_label + " " + hashed_column.capitalize() + " " + str(c) + " infinite values")

print("-------------------------------------------------")
print("Phase 1 model predicting... ")
y_pred = model_phase1.predict(X_history_final) #Columns that do not exist in the History set are predicted

# After transposing, row k holds the k-th predicted output for every history row.
y_pred_T = np.transpose(y_pred)
Y_history_final = pd.DataFrame({
    'device_type': y_pred_T[0],
    'is_sso_user': y_pred_T[1].astype('b'),  # 'b' is numpy's signed 8-bit integer code
    'gender': y_pred_T[2],
    'postcode': y_pred_T[3],
    'age': y_pred_T[4],
    'is_subscriber': y_pred_T[5].astype('b'),
})


-------------------------------------------------
Behavior Title 0 infinite values
Behavior Subtitle 0 infinite values
Behavior Ner_clusters 0 infinite values
Behavior Entity_groups 0 infinite values
Behavior Topics 0 infinite values
History Title 0 infinite values
History Subtitle 0 infinite values
History Ner_clusters 0 infinite values
History Entity_groups 0 infinite values
History Topics 0 infinite values
-------------------------------------------------
Phase 1 model predicting... 


In [16]:
# Append the profile columns predicted in phase 1 to the history features.
# Both frames get a fresh 0..n-1 index first so the columns line up row by row;
# reset_index() also adds an 'index' column, which is removed again below.
# (Here the train and validation sets were already combined.)
X_total = X_history_final.reset_index()
predicted_profile = Y_history_final.reset_index()

for profile_column in ['device_type', 'is_sso_user', 'gender', 'postcode', 'age', 'is_subscriber']:
    X_total[profile_column] = predicted_profile[profile_column]

# Table Y only carries ['clicked']. It has to be taken BEFORE the column is
# dropped from X_total: filtering after the drop always produced an empty frame.
Y_total = X_total.filter(['clicked'])
X_total = X_total.drop(['clicked', 'index'], axis=1)

test_history_final_to_compare = X_total

print("Empty history columns are predicted.")
print("---------------------------") 
print("Phase 2 model predicting... ") #With the model obtained from train.py, we predict whether which articles were clicked or not in which impression.


Empty history columns are predicted.
---------------------------
Phase 2 model predicting... 


In [17]:
#################### This will not happen when trying with the test set
test_behaviors_final.drop(['clicked'], axis=1, inplace=True)
####################This will not happen when trying with the test set

# To calculate the probabilities, a Behaviors table that is not merged with the
# History set is required, so the missing values of that table are filled here.
#
# Each column is re-assigned from the result of fillna(). Calling
# fillna(inplace=True) on a column selected from the frame acts on an
# intermediate object: pandas 2.x warns about it, and under Copy-on-Write the
# frame itself is left unchanged (the astype(int) below would then fail on NaN).
for filled_column in ['read_time', 'scroll_percentage', 'total_inviews', 'total_pageviews', 'total_read_time']:
    test_behaviors_final[filled_column] = test_behaviors_final[filled_column].fillna(value=0)

# Missing age/postcode become 0 so both columns can be stored as integers.
for filled_column in ['age', 'postcode']:
    test_behaviors_final[filled_column] = test_behaviors_final[filled_column].fillna(value=0).astype(int)


In [18]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

# It must be in the same order as the behavior table fitted in train.py
phase2_feature_order = [
    'read_time',
    'scroll_percentage',
    'user_id',
    'impression_hour',
    'impression_day',
    'title',
    'subtitle',
    'premium',
    'image_ids',
    'article_type',
    'ner_clusters',
    'entity_groups',
    'topics',
    'category',
    'subcategory',
    'category_str',
    'total_inviews',
    'total_pageviews',
    'total_read_time',
    'sentiment_score',
    'sentiment_label',
    'last_modified_hour',
    'last_modified_day',
    'published_time_hour',
    'published_time_day',
    'device_type',
    'is_sso_user',
    'gender',
    'postcode',
    'age',
    'is_subscriber',
]
test_behaviors_rearranged = test_behaviors_final[phase2_feature_order].copy()

# Finalize the rearranged dataframe
test_behaviors_final = test_behaviors_rearranged

# Handle NaN values: remaining gaps are replaced with the column mean.
# fit_transform returns a plain array, so X_total no longer has column names.
imputer = SimpleImputer(strategy='mean')
X_total = imputer.fit_transform(test_behaviors_final.reset_index(drop=True))

# Make predictions
y_pred = model_phase2.predict(X_total)

# Predicted click flag, one row per behaviors row
Y_final = pd.DataFrame({'clicked': y_pred})

print(Y_final)

         clicked
0           True
1           True
2           True
3           True
4           True
...          ...
5567412     True
5567413     True
5567414     True
5567415     True
5567416     True

[5567417 rows x 1 columns]


In [19]:
# Ensure the length of Y_final matches X_total
assert len(Y_final) == len(X_total), "Lengths of Y_final and X_total do not match."

# Create the final DataFrame: the imputed feature array gets its column names
# back, then the predicted click values and the identifiers saved in final_ids
# are appended (all aligned on the 0..n-1 row index).
X_final = pd.DataFrame(X_total, columns=test_behaviors_final.columns).assign(
    clicked=Y_final['clicked'],
    impression_id=final_ids['impression_id'],
    article_id=final_ids['article_id'],
)

print("Probability calculation...")

# Prepare Y_final with 'impression_id'
Y_final = Y_final.assign(impression_id=X_final['impression_id'])

# Placeholders for the current impression and the user's previous history
current_impression, user_previous_history = pd.DataFrame(), pd.DataFrame()

# Collects one probability ranking list per impression
total_probability_ranking_list = []

print(Y_final)

Probability calculation...
         clicked  impression_id
0           True         149474
1           True         149474
2           True         149474
3           True         149474
4           True         149474
...          ...            ...
5567412     True      579552453
5567413     True      579552453
5567414     True      579552453
5567415     True      579552453
5567416     True      579552453

[5567417 rows x 2 columns]


In [19]:
# This cell repeats the preceding cell line for line; running it again
# rebuilds the same frames and empties the ranking list.

# Ensure the length of Y_final matches X_total
assert len(Y_final) == len(X_total), "Lengths of Y_final and X_total do not match."

# Create the final DataFrame: the imputed feature array gets its column names
# back, then the predicted click values and the identifiers saved in final_ids
# are appended (all aligned on the 0..n-1 row index).
X_final = pd.DataFrame(X_total, columns=test_behaviors_final.columns).assign(
    clicked=Y_final['clicked'],
    impression_id=final_ids['impression_id'],
    article_id=final_ids['article_id'],
)

print("Probability calculation...")

# Prepare Y_final with 'impression_id'
Y_final = Y_final.assign(impression_id=X_final['impression_id'])

# Placeholders for the current impression and the user's previous history
current_impression, user_previous_history = pd.DataFrame(), pd.DataFrame()

# Collects one probability ranking list per impression
total_probability_ranking_list = []

print(Y_final)

Probability calculation...
         clicked  impression_id
0           True         149474
1           True         149474
2           True         149474
3           True         149474
4           True         149474
...          ...            ...
5567412     True      579552453
5567413     True      579552453
5567414     True      579552453
5567415     True      579552453
5567416     True      579552453

[5567417 rows x 2 columns]


In [None]:
import numpy as np
import pandas as pd

# Rank the in-view articles of every impression.
#
# Each row starts at 0 when the phase-2 model predicted a click and at 10
# otherwise; the mean distance between that article and the articles in the
# user's history is added on top. The smallest total gets rank 1, and the
# value stored for a row is 1 / rank.
#
# The arithmetic is the same as in the first version of this cell; only the
# bookkeeping changed:
#   * impressions come from a single groupby pass instead of a boolean scan of
#     all of X_final for every impression,
#   * the history rows of a user are found through positions computed once,
#     instead of a boolean scan of the whole history table per impression,
#   * the total row count is computed once,
#   * progress is printed every PROGRESS_EVERY impressions, not for each one.
PROGRESS_EVERY = 1000

# (column, divisor) pairs for the absolute-difference terms, in the order in
# which they are added to the score; 'category_str' is the only term that is
# scaled down.
ABS_DIFF_FEATURES = [
    ('image_ids', 1),
    ('category', 1),
    ('subcategory', 1),
    ('total_inviews', 1),
    ('total_pageviews', 1),
    ('total_read_time', 1),
    ('sentiment_score', 1),
    ('article_type', 1),
    ('category_str', 5),
    ('sentiment_label', 1),
]

print ("----------")
total_rows = X_final[X_final.columns[0]].count()

# user_id -> integer row positions of that user's rows in the history table.
history_positions_by_user = test_history_final_to_compare.groupby('user_id', sort=False).indices

rows_done = 0
# sort=False keeps the impressions in the order in which they first appear.
for impression_number, (_impression_key, impression_rows) in enumerate(
        X_final.groupby('impression_id', sort=False)):
    if impression_number % PROGRESS_EVERY == 0:
        print ("\033[A                             \033[A")
        print("Calculating probability rankings: {}/{}".format(rows_done, total_rows))

    current_impression = impression_rows.reset_index()

    # A user without any history gets an empty frame (the means below are then NaN).
    history_positions = history_positions_by_user.get(current_impression['user_id'].iloc[0], [])
    user_previous_history = test_history_final_to_compare.iloc[history_positions]

    scores = np.where(current_impression['clicked'], 0, 10).astype(float)  # Ensure 'rank' is of type float

    for j in range(current_impression.shape[0]):
        distance_score = 0
        # Share of history articles whose premium flag differs from this article's.
        distance_score += np.mean(current_impression['premium'].iloc[j] != user_previous_history['premium'])
        for feature, divisor in ABS_DIFF_FEATURES:
            distance_score += np.mean(np.abs(current_impression[feature].iloc[j] - user_previous_history[feature])) / divisor

        scores[j] = round(scores[j] + distance_score, 5)

    current_impression['rank'] = scores
    current_impression['rank'] = current_impression['rank'].rank()
    current_impression['rank'] = round(1 / current_impression['rank'], 5)

    total_probability_ranking_list.append(current_impression['rank'].tolist())

    rows_done += current_impression.shape[0]

print ("----------")
print("Probability list of each impression: ")
print(total_probability_ranking_list)


----------
[A                             [A
Calculating probability rankings: 0/5567417
[A                             [A
Calculating probability rankings: 6/5567417
[A                             [A
Calculating probability rankings: 15/5567417
[A                             [A
Calculating probability rankings: 22/5567417
[A                             [A
Calculating probability rankings: 30/5567417
[A                             [A
Calculating probability rankings: 39/5567417
[A                             [A
Calculating probability rankings: 53/5567417
[A                             [A
Calculating probability rankings: 60/5567417
[A                             [A
Calculating probability rankings: 81/5567417
[A                             [A
Calculating probability rankings: 103/5567417
[A                             [A
Calculating probability rankings: 108/5567417
[A                             [A
Calculating probability rankings: 120/5567417
[A             

In [22]:
# Show every per-impression score list collected by the ranking loop.
print("----------", "Probability list of each impression: ", sep="\n")
print(total_probability_ranking_list)


----------
Probability list of each impression: 
[[0.25, 0.33333, 1.0, 0.5, 0.16667, 0.2], [0.16667, 0.2, 0.125, 1.0, 0.5, 0.11111, 0.14286, 0.33333, 0.25], [0.5, 1.0, 0.16667, 0.2, 0.33333, 0.14286, 0.25], [0.125, 0.33333, 0.25, 0.2, 0.14286, 0.16667, 0.5, 1.0], [0.125, 1.0, 0.14286, 0.16667, 0.11111, 0.25, 0.5, 0.33333, 0.2], [1.0, 0.09091, 0.125, 0.11111, 0.33333, 0.14286, 0.08333, 0.2, 0.5, 0.07692, 0.25, 0.16667, 0.1, 0.07143], [1.0, 0.5, 0.2, 0.33333, 0.14286, 0.25, 0.16667], [0.11111, 0.08333, 0.1, 0.04762, 0.16667, 0.33333, 0.14286, 0.05882, 0.2, 0.06667, 0.125, 1.0, 0.05556, 0.5, 0.25, 0.0625, 0.07143, 0.05, 0.05263, 0.07692, 0.09091], [0.08333, 0.05263, 0.06667, 1.0, 0.04545, 0.05882, 0.11111, 0.1, 0.05556, 0.33333, 0.5, 0.07692, 0.09091, 0.14286, 0.25, 0.125, 0.04762, 0.16667, 0.0625, 0.05, 0.07143, 0.2], [0.25, 0.5, 0.33333, 0.2, 1.0], [0.14286, 0.25, 0.1, 0.16667, 0.125, 0.33333, 0.11111, 0.09091, 0.2, 0.5, 1.0, 0.08333], [0.1, 0.125, 0.2, 0.14286, 0.33333, 0.07692, 0.5, 0

In [62]:
# Map every impression id to its ranking list.
#
# total_probability_ranking_list holds ONE list per impression, in the order
# in which the impressions first appear in X_final, while X_final itself has
# one row per in-view article. Pairing the list with the row number of X_final
# therefore attached the rankings of other impressions to each id; the ids are
# de-duplicated (keeping first-appearance order) before pairing.
impression_ids_in_order = X_final['impression_id'].drop_duplicates()

# zip stops at the shorter input, so a partially filled ranking list simply
# yields predictions for the impressions that were processed.
predictions = dict(zip(impression_ids_in_order, total_probability_ranking_list))


In [63]:
# One text line per impression: the impression id followed by its scores,
# all separated by single spaces.
with open("predictions.txt", "w") as file:
    file.writelines(
        f"{impression_id} {' '.join(str(score) for score in rankings)}\n"
        for impression_id, rankings in predictions.items()
    )


In [64]:
import zipfile

# Put the predictions file into a zip archive; no compression argument is
# given, so ZipFile's default is used.
with zipfile.ZipFile(file="mlslnflyitu.zip", mode="w") as archive:
    archive.write(filename="predictions.txt")
