In [1]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from decimal import *
import math 
from statistics import mean
import pickle
from itertools import chain

In [2]:
# Load datasets
print("Loading datasets...")
train_behaviors = pd.read_parquet('ebnerd_demo/train/behaviors.parquet')
train_history = pd.read_parquet('ebnerd_demo/train/history.parquet')
val_behaviors = pd.read_parquet('ebnerd_demo/validation/behaviors.parquet')
val_history = pd.read_parquet('ebnerd_demo/validation/history.parquet')
articles = pd.read_parquet('ebnerd_demo/articles.parquet')

# Converting train and validation history data to long format
train_history_expanded = train_history.explode(['impression_time_fixed', 'scroll_percentage_fixed', 'article_id_fixed', 'read_time_fixed'])
val_history_expanded = val_history.explode(['impression_time_fixed', 'scroll_percentage_fixed', 'article_id_fixed', 'read_time_fixed'])

# Update column names
train_history_expanded.rename(columns={'article_id_fixed': 'article_id', 'impression_time_fixed': 'impression_time', 'scroll_percentage_fixed': 'scroll_percentage', 'read_time_fixed': 'read_time'}, inplace=True)
val_history_expanded.rename(columns={'article_id_fixed': 'article_id', 'impression_time_fixed': 'impression_time', 'scroll_percentage_fixed': 'scroll_percentage', 'read_time_fixed': 'read_time'}, inplace=True)

pd.options.mode.chained_assignment = None  

print("Datatypes structuring... ")


Loading datasets...
Datatypes structuring... 


In [3]:
# Fill missing values ​​and encode string values ​​to float
for i in range(len(articles['subcategory'])):
    if (len(articles['subcategory'][i])==0):
        articles['subcategory'][i] = [-1] 
    articles['subcategory'][i]=float(mean(articles['subcategory'][i])) #Reduce size by averaging the list
for i in range(len(articles['image_ids'])):
    if (type(articles['image_ids'][i]) is not np.ndarray):
        articles['image_ids'][i] = [0] #It means there is no picture in the article
    articles['image_ids'][i]=float(mean(articles['image_ids'][i])) #Reduce size by averaging the list


In [4]:
import numpy as np
import hashlib
import math
from decimal import Decimal

def string_to_float32(s):
    # Compute the hash of the string
    hash_object = hashlib.sha256(s.encode('ascii', errors='backslashreplace'))
    hash_hex = hash_object.hexdigest()
    
    # Convert hash to an integer
    hash_int = int(hash_hex, 16)
    
    # Normalize the hash to a float32 range (0, 1)
    max_hash_int = int('f' * len(hash_hex), 16)
    normalized = hash_int / max_hash_int
    
    # Convert normalized value to float32
    return np.float32(normalized)

for column in ['ner_clusters', 'entity_groups', 'topics']:
    data = articles[column]
    for i in range(len(data)):
        string_data_combined = ""
        for j in range(len(data[i])):
            string_data_combined += "/" + data[i][j]  # All strings in the list are concatenated into a single string
        
        new = string_to_float32(string_data_combined)
        data[i] = new
    articles[column] = data


In [5]:
import numpy as np
import hashlib
import math
from decimal import Decimal

def string_to_float32(s):
    # Compute the hash of the string
    hash_object = hashlib.sha256(s.encode('ascii', errors='backslashreplace'))
    hash_hex = hash_object.hexdigest()
    
    # Convert hash to an integer
    hash_int = int(hash_hex, 16)
    
    # Normalize the hash to a float32 range (0, 1)
    max_hash_int = int('f' * len(hash_hex), 16)
    normalized = hash_int / max_hash_int
    
    # Convert normalized value to float32
    return np.float32(normalized)

for column in ['title', 'subtitle']:
    data = articles[column].astype(str)
    for i in range(len(data)):
        new = data[i]
        new = string_to_float32(new)
        data[i] = new
    articles[column] = data

# Coding categorical variables for train and validation behaviors
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for column in ['device_type', 'gender']:
    le = LabelEncoder()
    train_behaviors[column] = le.fit_transform(train_behaviors[column].astype(str))
    val_behaviors[column] = le.transform(val_behaviors[column].astype(str))
    label_encoders[column] = le


In [6]:
# Coding categorical variables for the Articles dataset
for column in ['article_type', 'category_str', 'sentiment_label']:
    le = LabelEncoder()
    articles[column] = le.fit_transform(articles[column].astype(str))
    label_encoders[column] = le

# Normalize numeric variables
scaler = StandardScaler()
articles[['sentiment_score']] = scaler.fit_transform(articles[['sentiment_score']].astype(float)) #The number value for Sentimen Score is already low so there is no reshaping.
articles[['total_read_time']] = scaler.fit_transform(articles[['total_read_time']].astype(float).values.reshape(-1, 1))
articles[['total_pageviews']] = scaler.fit_transform(articles[['total_pageviews']].astype(float).values.reshape(-1, 1))
articles[['total_inviews']] = scaler.fit_transform(articles[['total_inviews']].astype(float).values.reshape(-1, 1))
articles[['title']] = scaler.fit_transform(articles[['title']].astype(float).values.reshape(-1, 1))
articles[['subtitle']] = scaler.fit_transform(articles[['subtitle']].astype(float).values.reshape(-1, 1))
articles[['ner_clusters']] = scaler.fit_transform(articles[['ner_clusters']].astype(float).values.reshape(-1, 1))
articles[['entity_groups']] = scaler.fit_transform(articles[['entity_groups']].astype(float).values.reshape(-1, 1))
articles[['topics']] = scaler.fit_transform(articles[['topics']].astype(float).values.reshape(-1, 1))
articles[['subcategory']] = scaler.fit_transform(articles[['subcategory']].astype(float).values.reshape(-1, 1))
articles[['category']] = scaler.fit_transform(articles[['category']].astype(float).values.reshape(-1, 1))
articles[['image_ids']] = scaler.fit_transform(articles[['image_ids']].astype(float).values.reshape(-1, 1)) #There may be values ​​that can be further normalized


In [7]:
# Handling date and time variables
train_behaviors['impression_time'] = pd.to_datetime(train_behaviors['impression_time']) ##Only hour and day are taken from dates
train_behaviors['impression_hour'] = train_behaviors['impression_time'].dt.hour
train_behaviors['impression_day'] = train_behaviors['impression_time'].dt.day
val_behaviors['impression_time'] = pd.to_datetime(val_behaviors['impression_time'])
val_behaviors['impression_hour'] = val_behaviors['impression_time'].dt.hour
val_behaviors['impression_day'] = val_behaviors['impression_time'].dt.day

train_history_expanded['impression_time'] = pd.to_datetime(train_history_expanded['impression_time'])
train_history_expanded['impression_hour'] = train_history_expanded['impression_time'].dt.hour
train_history_expanded['impression_day'] = train_history_expanded['impression_time'].dt.day
val_history_expanded['impression_time'] = pd.to_datetime(val_history_expanded['impression_time'])
val_history_expanded['impression_hour'] = val_history_expanded['impression_time'].dt.hour
val_history_expanded['impression_day'] = val_history_expanded['impression_time'].dt.day

articles['last_modified_time'] = pd.to_datetime(articles['last_modified_time'])
articles['last_modified_hour'] = articles['last_modified_time'].dt.hour
articles['last_modified_day'] = articles['last_modified_time'].dt.day

articles['published_time'] = pd.to_datetime(articles['published_time'])
articles['published_time_hour'] = articles['published_time'].dt.hour
articles['published_time_day'] = articles['published_time'].dt.day


In [8]:
# Delete unnecessary columns
train_behaviors.drop(['impression_time'], axis=1, inplace=True)
train_behaviors.drop(['article_id'], axis=1, inplace=True)
train_behaviors.drop(['next_read_time'], axis=1, inplace=True)
train_behaviors.drop(['next_scroll_percentage'], axis=1, inplace=True)
#train_behaviors.drop(['impression_id'], axis=1, inplace=True)
train_behaviors.drop(['session_id'], axis=1, inplace=True)

val_behaviors.drop(['impression_time'], axis=1, inplace=True)
val_behaviors.drop(['article_id'], axis=1, inplace=True)
val_behaviors.drop(['next_read_time'], axis=1, inplace=True)
val_behaviors.drop(['next_scroll_percentage'], axis=1, inplace=True)
#val_behaviors.drop(['impression_id'], axis=1, inplace=True)
val_behaviors.drop(['session_id'], axis=1, inplace=True)

train_history_expanded.drop(['impression_time'], axis=1, inplace=True)
val_history_expanded.drop(['impression_time'], axis=1, inplace=True)

articles.drop(['last_modified_time'], axis=1, inplace=True)
articles.drop(['published_time'], axis=1, inplace=True)
articles.drop(['body'], axis=1, inplace=True)
articles.drop(['url'], axis=1, inplace=True)


In [9]:

# Let's check the column names
print("-------------------------------------------") 
print("Train History Expanded columns:", train_history_expanded.columns)
print("Validation History Expanded columns:", val_history_expanded.columns)

print("Train Behaviors Expanded columns:", train_behaviors.columns)
print("Validation Behaviors Expanded columns:", val_behaviors.columns)

print("Articles columns:", articles.columns)

print("Dataset Merging... ")   

-------------------------------------------
Train History Expanded columns: Index(['user_id', 'scroll_percentage', 'article_id', 'read_time',
       'impression_hour', 'impression_day'],
      dtype='object')
Validation History Expanded columns: Index(['user_id', 'scroll_percentage', 'article_id', 'read_time',
       'impression_hour', 'impression_day'],
      dtype='object')
Train Behaviors Expanded columns: Index(['impression_id', 'read_time', 'scroll_percentage', 'device_type',
       'article_ids_inview', 'article_ids_clicked', 'user_id', 'is_sso_user',
       'gender', 'postcode', 'age', 'is_subscriber', 'impression_hour',
       'impression_day'],
      dtype='object')
Validation Behaviors Expanded columns: Index(['impression_id', 'read_time', 'scroll_percentage', 'device_type',
       'article_ids_inview', 'article_ids_clicked', 'user_id', 'is_sso_user',
       'gender', 'postcode', 'age', 'is_subscriber', 'impression_hour',
       'impression_day'],
      dtype='object')
Articl

In [10]:
train_behaviors_expanded = train_behaviors.explode(['article_ids_inview']) #A record is created in the behaviors table for each reviewed article.
val_behaviors_expanded = val_behaviors.explode(['article_ids_inview'])
train_behaviors_expanded[['article_ids_inview']] = train_behaviors_expanded[['article_ids_inview']].astype(int)
val_behaviors_expanded[['article_ids_inview']] = val_behaviors_expanded[['article_ids_inview']].astype(int)

train_behaviors_expanded = train_behaviors_expanded.explode(['article_ids_clicked']) #Univariate lists are being removed from the list
val_behaviors_expanded = val_behaviors_expanded.explode(['article_ids_clicked'])
train_behaviors_expanded[['article_ids_clicked']] = train_behaviors_expanded[['article_ids_clicked']].astype(int)
val_behaviors_expanded[['article_ids_clicked']] = val_behaviors_expanded[['article_ids_clicked']].astype(int)

articles[['article_id']] = articles[['article_id']].astype(int) #It is necessary to make comparisons

train_behaviors_expanded["article_id"] = train_behaviors_expanded["article_ids_inview"] #To combine the behaviors and articles tables, a column with the same name article_id is required in both tables.
train_behaviors_expanded.drop(['article_ids_inview'], axis=1, inplace=True)
val_behaviors_expanded["article_id"] = val_behaviors_expanded["article_ids_inview"]
val_behaviors_expanded.drop(['article_ids_inview'], axis=1, inplace=True)

train_behaviors_final = train_behaviors_expanded.merge(articles, on='article_id', how='left') 
val_behaviors_final = val_behaviors_expanded.merge(articles, on='article_id', how='left')
train_history_final = train_history_expanded.merge(articles, on='article_id', how='left') 
val_history_final = val_history_expanded.merge(articles, on='article_id', how='left')

X_history_final = pd.concat([train_history_final, val_history_final])

train_final_ids = pd.DataFrame()
train_final_ids['impression_id'] = train_behaviors_final['impression_id']
train_final_ids['article_id'] = train_behaviors_final['article_id']
train_final_ids['user_id'] = train_behaviors_final['user_id']
val_final_ids = pd.DataFrame()
val_final_ids['impression_id'] = val_behaviors_final['impression_id']
val_final_ids['article_id'] = val_behaviors_final['article_id']
val_final_ids['user_id'] = val_behaviors_final['user_id']
final_ids = pd.concat([train_final_ids, val_final_ids])
final_ids = final_ids.reset_index()
final_ids = final_ids.drop(['index'], axis=1)


In [11]:
train_behaviors_final['clicked'] = np.where((train_behaviors_final['article_id'] == train_behaviors_final['article_ids_clicked']), True, False) #Checking whether the displayed article is the same as the clicked article.
val_behaviors_final['clicked'] = np.where((val_behaviors_final['article_id'] == val_behaviors_final['article_ids_clicked']), True, False)

train_behaviors_final.drop(['impression_id'], axis=1, inplace=True)
val_behaviors_final.drop(['impression_id'], axis=1, inplace=True)

train_behaviors_final.drop(['article_id'], axis=1, inplace=True) ##There is no need for article IDs as there is now a 'clicked' column that shows the clicked articles
val_behaviors_final.drop(['article_id'], axis=1, inplace=True)
train_behaviors_final.drop(['article_ids_clicked'], axis=1, inplace=True)
val_behaviors_final.drop(['article_ids_clicked'], axis=1, inplace=True)
X_history_final.drop(['article_id'], axis=1, inplace=True)

X_train_phase1 = train_behaviors_final.drop(['device_type','is_sso_user','gender','postcode','age','is_subscriber'], axis=1) 
Y_train_phase1 = train_behaviors_final.filter(['device_type','is_sso_user','gender','postcode','age','is_subscriber']) 
X_val_phase1 = val_behaviors_final.drop(['device_type','is_sso_user','gender','postcode','age','is_subscriber'], axis=1)
Y_val_phase1 = val_behaviors_final.filter(['device_type','is_sso_user','gender','postcode','age','is_subscriber'])


In [12]:
X_history_rearranged = pd.DataFrame() #The History table must be in the same order as the Behavior table, so the same columns are added to the new table in this order.
X_history_rearranged['read_time'] = X_history_final['read_time']
X_history_rearranged['scroll_percentage'] = X_history_final['scroll_percentage']
X_history_rearranged['user_id'] = X_history_final['user_id']
X_history_rearranged['impression_hour'] = X_history_final['impression_hour']
X_history_rearranged['impression_day'] = X_history_final['impression_day']
X_history_rearranged['title'] = X_history_final['title']
X_history_rearranged['subtitle'] = X_history_final['subtitle']
X_history_rearranged['premium'] = X_history_final['premium']
X_history_rearranged['image_ids'] = X_history_final['image_ids']
X_history_rearranged['article_type'] = X_history_final['article_type']
X_history_rearranged['ner_clusters'] = X_history_final['ner_clusters']
X_history_rearranged['entity_groups'] = X_history_final['entity_groups']
X_history_rearranged['topics'] = X_history_final['topics']
X_history_rearranged['category'] = X_history_final['category']
X_history_rearranged['subcategory'] = X_history_final['subcategory']
X_history_rearranged['category_str'] = X_history_final['category_str']
X_history_rearranged['total_inviews'] = X_history_final['total_inviews']
X_history_rearranged['total_pageviews'] = X_history_final['total_pageviews']
X_history_rearranged['total_read_time'] = X_history_final['total_read_time']
X_history_rearranged['sentiment_score'] = X_history_final['sentiment_score']
X_history_rearranged['sentiment_label'] = X_history_final['sentiment_label']
X_history_rearranged['last_modified_hour'] = X_history_final['last_modified_hour']
X_history_rearranged['last_modified_day'] = X_history_final['last_modified_day']
X_history_rearranged['published_time_hour'] = X_history_final['published_time_hour']
X_history_rearranged['published_time_day'] = X_history_final['published_time_day']
X_history_rearranged['clicked'] = True

X_history_final = X_history_rearranged


In [13]:
X_train_phase1.loc[X_train_phase1["clicked"] == False, "read_time"] = 0 #Reset reading times for unclicked articles
X_train_phase1.loc[X_train_phase1["clicked"] == False, "scroll_percentage"] = 0
X_val_phase1.loc[X_val_phase1["clicked"] == False, "read_time"] = 0 
X_val_phase1.loc[X_val_phase1["clicked"] == False, "scroll_percentage"] = 0

X_train_phase1['scroll_percentage'].fillna(value=0, inplace=True) #Reset dataset with NaN values ​​to 0
X_val_phase1['scroll_percentage'].fillna(value=0, inplace=True)
X_train_phase1['total_inviews'].fillna(value=0, inplace=True)
X_val_phase1['total_inviews'].fillna(value=0, inplace=True)
X_train_phase1['total_pageviews'].fillna(value=0, inplace=True)
X_val_phase1['total_pageviews'].fillna(value=0, inplace=True)
X_train_phase1['total_read_time'].fillna(value=0, inplace=True)
X_val_phase1['total_read_time'].fillna(value=0, inplace=True)

X_history_final['scroll_percentage'].fillna(value=0, inplace=True)
X_history_final['total_inviews'].fillna(value=0, inplace=True)
X_history_final['total_pageviews'].fillna(value=0, inplace=True)
X_history_final['total_read_time'].fillna(value=0, inplace=True)
X_history_final['read_time'].fillna(value=0, inplace=True)

Y_train_phase1['age'].fillna(value=0, inplace=True)
Y_val_phase1['age'].fillna(value=0, inplace=True)
Y_train_phase1['postcode'].fillna(value=0, inplace=True)
Y_val_phase1['postcode'].fillna(value=0, inplace=True)

Y_train_phase1[['age']] = Y_train_phase1[['age']].astype(int) #Since classification cannot be done with float, it is converted to int
Y_val_phase1[['age']] = Y_val_phase1[['age']].astype(int)
Y_train_phase1[['postcode']] = Y_train_phase1[['postcode']].astype(int)
Y_val_phase1[['postcode']] = Y_val_phase1[['postcode']].astype(int)

print("-------------------------------------------------") 

for column in X_train_phase1.items():
    print(column)


-------------------------------------------------
('read_time', 0           0.0
1           0.0
2           0.0
3           0.0
4           0.0
          ...  
281725      0.0
281726      0.0
281727      0.0
281728    107.0
281729      0.0
Name: read_time, Length: 281730, dtype: float32)
('scroll_percentage', 0           0.0
1           0.0
2           0.0
3           0.0
4           0.0
          ...  
281725      0.0
281726      0.0
281727      0.0
281728    100.0
281729      0.0
Name: scroll_percentage, Length: 281730, dtype: float32)
('user_id', 0           22779
1           22779
2           22779
3           22779
4           22779
           ...   
281725    2096611
281726    2096611
281727    2096611
281728    2096611
281729    2096611
Name: user_id, Length: 281730, dtype: uint32)
('impression_hour', 0         21
1         21
2         21
3         21
4         21
          ..
281725    10
281726    10
281727    10
281728    10
281729    10
Name: impression_hour, Length: 281730

In [14]:
print("-------------------------------------------------")   
c = np.isinf(X_train_phase1['title']).values.sum() 
print("Behavior Title " + str(c) + " infinite values") 
c = np.isinf(X_train_phase1['subtitle']).values.sum() 
print("Behavior Subtitle " + str(c) + " infinite values") 
c = np.isinf(X_train_phase1['ner_clusters']).values.sum() 
print("Behavior Ner_clusters " + str(c) + " infinite values") 
c = np.isinf(X_train_phase1['entity_groups']).values.sum() 
print("Behavior Entity_groups " + str(c) + " infinite values") 
c = np.isinf(X_train_phase1['topics']).values.sum() 
print("Behavior Topics " + str(c) + " infinite values")

c = np.isinf(X_history_final['title']).values.sum() 
print("History Title " + str(c) + " infinite values") 
c = np.isinf(X_history_final['subtitle']).values.sum() 
print("History Subtitle " + str(c) + " infinite values") 
c = np.isinf(X_history_final['ner_clusters']).values.sum() 
print("History Ner_clusters " + str(c) + " infinite values") 
c = np.isinf(X_history_final['entity_groups']).values.sum() 
print("History Entity_groups " + str(c) + " infinite values") 
c = np.isinf(X_history_final['topics']).values.sum() 
print("History Topics " + str(c) + " infinite values") 

print("-------------------------------------------------")


-------------------------------------------------
Behavior Title 0 infinite values
Behavior Subtitle 0 infinite values
Behavior Ner_clusters 0 infinite values
Behavior Entity_groups 0 infinite values
Behavior Topics 0 infinite values
History Title 0 infinite values
History Subtitle 0 infinite values
History Ner_clusters 0 infinite values
History Entity_groups 0 infinite values
History Topics 0 infinite values
-------------------------------------------------


In [15]:
'''
print("Phase 1 model training... ")
model_phase1 = MultiOutputRegressor(LogisticRegression(max_iter=2000)) 
#max_iter can be increased when an insufficient warning is received, but it consumes more resources.

model_phase1.fit(X_train_phase1, Y_train_phase1) 

y_pred = model_phase1.predict(X_val_phase1)
y_val = np.array(Y_val_phase1, dtype=int)

y_pred_T = np.transpose(y_pred)

error = np.mean( y_val != y_pred )
print("Phase 1 prediction error: "+str(error)) ##accuracy_score() and roc_auc_score() methods cannot be used because the output contains more than one value

print("Phase 1 model predicting... ")
X_combined_phase1 = pd.concat([X_train_phase1, X_val_phase1]) ##When you see that the model works, train and val tables are combined so that you can learn better.
Y_combined_phase1 = pd.concat([Y_train_phase1, Y_val_phase1])

model_phase1.fit(X_combined_phase1, Y_combined_phase1)
'''
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputRegressor
import pandas as pd
import numpy as np

# Scaling data
scaler = StandardScaler()
X_train_phase1_scaled = scaler.fit_transform(X_train_phase1)
X_val_phase1_scaled = scaler.transform(X_val_phase1)
X_combined_phase1_scaled = scaler.fit_transform(pd.concat([X_train_phase1, X_val_phase1]))

# We convert it back to DataFrame to keep feature names
X_train_phase1_scaled_df = pd.DataFrame(X_train_phase1_scaled, columns=X_train_phase1.columns)
X_val_phase1_scaled_df = pd.DataFrame(X_val_phase1_scaled, columns=X_val_phase1.columns)
X_combined_phase1_scaled_df = pd.DataFrame(X_combined_phase1_scaled, columns=pd.concat([X_train_phase1, X_val_phase1]).columns)

print("Phase 1 model training... ")

model_phase1 = MultiOutputRegressor(LogisticRegression(max_iter=2000, solver='saga')) 
model_phase1.fit(X_train_phase1_scaled_df, Y_train_phase1)

y_pred = model_phase1.predict(X_val_phase1_scaled_df)
y_val = np.array(Y_val_phase1, dtype=int)

y_pred_T = np.transpose(y_pred)

error = np.mean(y_val != y_pred)
print("Phase 1 prediction error: " + str(error))

print("Phase 1 model predicting... ")



Phase 1 model training... 
Phase 1 prediction error: 0.12114341891002321
Phase 1 model predicting... 


In [16]:
X_combined_phase1 = pd.concat([X_train_phase1, X_val_phase1])
Y_combined_phase1 = pd.concat([Y_train_phase1, Y_val_phase1])

model_phase1.fit(X_combined_phase1_scaled_df, Y_combined_phase1)
y_pred = model_phase1.predict(X_history_final) 

y_pred_T = np.transpose(y_pred)
Y_history_final = pd.DataFrame() 
Y_history_final['device_type'] = y_pred_T[0]
Y_history_final['is_sso_user'] = y_pred_T[1].astype('b')
Y_history_final['gender'] = y_pred_T[2]
Y_history_final['postcode'] = y_pred_T[3]
Y_history_final['age'] = y_pred_T[4]
Y_history_final['is_subscriber'] = y_pred_T[5].astype('b')

X_total = pd.concat([X_history_final, X_combined_phase1]) # In the next prediction we will only look at whether the article was clicked or not
Y_total = pd.concat([Y_history_final, Y_combined_phase1]) # Therefore, while all columns are included in table X, only ['clicked'] should be in table Y.

X_total = X_total.reset_index(drop=True)
Y_total = Y_total.reset_index(drop=True)

X_total['device_type'] = Y_total['device_type']
X_total['is_sso_user'] = Y_total['is_sso_user']
X_total['gender'] = Y_total['gender']
X_total['postcode'] = Y_total['postcode']
X_total['age'] = Y_total['age']
X_total['is_subscriber'] = Y_total['is_subscriber']

Y_total = X_total.filter(['clicked'])
X_total = X_total.drop(['clicked'], axis=1)

In [17]:
X_train_phase2, X_val_phase2, Y_train_phase2, Y_val_phase2 = train_test_split(X_total, Y_total, test_size=0.2, random_state=42)

X_history_final = X_history_final.reset_index()
Y_history_final = Y_history_final.reset_index()

X_history_final['device_type'] = Y_history_final['device_type']
X_history_final['is_sso_user'] = Y_history_final['is_sso_user']
X_history_final['gender'] = Y_history_final['gender']
X_history_final['postcode'] = Y_history_final['postcode']
X_history_final['age'] = Y_history_final['age']
X_history_final['is_subscriber'] = Y_history_final['is_subscriber']

Y_history_final = X_history_final.filter(['clicked'])
X_history_final = X_history_final.drop(['clicked'], axis=1)

X_history_final = X_history_final.drop(['index'], axis=1)

test_history_final_to_compare = X_history_final

print("Empty history columns are predicted. ")
print("---------------------------") 
print("Phase 2 model training... ")

Empty history columns are predicted. 
---------------------------
Phase 2 model training... 


In [18]:
scaler_phase2 = StandardScaler()
X_train_phase2_scaled = scaler_phase2.fit_transform(X_train_phase2)
X_val_phase2_scaled = scaler_phase2.transform(X_val_phase2)

## Phase 2 model training
model_phase2 = LogisticRegression(max_iter=1000, solver='saga')
model_phase2.fit(X_train_phase2_scaled, Y_train_phase2.values.ravel())

y_pred = model_phase2.predict(X_val_phase2_scaled)
y_val = np.array(Y_val_phase2, dtype=int)
y_val_formatted = list(chain(*y_val))

error = np.mean(y_val_formatted != y_pred)
print("Phase 2 prediction error: " + str(error))

print("Phase 2 model predicting... ")

X_combined_phase2 = pd.concat([X_train_phase2, X_val_phase2])
Y_combined_phase2 = pd.concat([Y_train_phase2, Y_val_phase2])

X_combined_phase2_scaled = scaler_phase2.fit_transform(X_combined_phase2)

model_phase2.fit(X_combined_phase2_scaled, Y_combined_phase2.values.ravel())

# Modelleri kaydetme
print("Saving the models... ")
with open('model_phase1.sav', 'wb') as f:
    pickle.dump(model_phase1, f)
    
with open('model_phase2.sav', 'wb') as f:
    pickle.dump(model_phase2, f)

print("Model training completed. You can now predict on the test set.")
print("Predicting for final accuracy score...")

Phase 2 prediction error: 0.002136987812275961
Phase 2 model predicting... 
Saving the models... 
Model training completed. You can now predict on the test set.
Predicting for final accuracy score...


In [19]:
print("Predicting for final accuracy score...")

# Behaviors table not merged with History set is required to calculate final AUC scores
X_combined_phase1 = pd.concat([X_train_phase1, X_val_phase1])
Y_combined_phase1 = pd.concat([Y_train_phase1, Y_val_phase1])

X_combined_phase1 = X_combined_phase1.reset_index()
Y_combined_phase1 = Y_combined_phase1.reset_index()

ground_truth = pd.DataFrame()
ground_truth['clicked'] = X_combined_phase1['clicked'] # Gerçek tıklanma değerleri
X_combined_phase1 = X_combined_phase1.drop(['clicked'], axis=1)

X_combined_phase1['device_type'] = Y_combined_phase1['device_type']
X_combined_phase1['is_sso_user'] = Y_combined_phase1['is_sso_user']
X_combined_phase1['gender'] = Y_combined_phase1['gender']
X_combined_phase1['postcode'] = Y_combined_phase1['postcode']
X_combined_phase1['age'] = Y_combined_phase1['age']
X_combined_phase1['is_subscriber'] = Y_combined_phase1['is_subscriber']

X_combined_phase1 = X_combined_phase1.drop(['index'], axis=1)
Y_combined_phase1 = Y_combined_phase1.drop(['index'], axis=1)

X_combined_phase1_scaled = scaler_phase2.transform(X_combined_phase1)

y_pred = model_phase2.predict(X_combined_phase1_scaled) # Karşılaştırma yapılacağı için history ile birleştirilmemeli o yüzden phase 1 dataseti alıyoruz

# We can calculate AUC score
from sklearn.metrics import roc_auc_score

auc_score = roc_auc_score(ground_truth, y_pred)
print("AUC Score: " + str(auc_score))

Predicting for final accuracy score...
AUC Score: 0.9785555629320918


In [20]:
Y_final = pd.DataFrame() 
Y_final['clicked'] = y_pred

X_final = X_combined_phase1
X_final['clicked'] = Y_final['clicked'] 
X_final['impression_id'] = final_ids['impression_id']
X_final['article_id'] = final_ids['article_id']


print("Score calculation...") 

ground_truth['impression_id'] = X_final['impression_id']

current_impression = pd.DataFrame()
current_impression_y_truth = pd.DataFrame()
user_previous_history = pd.DataFrame()

total_roc_auc_score_list = []

print ("----------")

Score calculation...
----------


In [21]:

i = 0
while i < X_final[X_final.columns[0]].count():    
    current_impression = X_final.loc[X_final['impression_id'] == X_final['impression_id'][i]]
    current_impression_y_truth = ground_truth.loc[ground_truth['impression_id'] == ground_truth['impression_id'][i]]
    user_previous_history = test_history_final_to_compare.loc[test_history_final_to_compare['user_id'] == current_impression['user_id'].iloc[0]]
    
    current_impression = current_impression.reset_index(drop=True)
    current_impression_y_truth = current_impression_y_truth.reset_index(drop=True)
    
    current_impression_y_truth['truth'] = np.where((current_impression_y_truth['clicked'] == True), 1, 0)
    
    current_impression['rank'] = np.where((current_impression['clicked'] == True), 0, 10).astype(float)
    
    for j in range(current_impression[current_impression.columns[0]].count()):
        distance_score = 0
        distance_score += mean(current_impression['premium'].iloc[j] != user_previous_history['premium'])
        distance_score += mean(abs(current_impression['image_ids'].iloc[j] - user_previous_history['image_ids']))
        distance_score += mean(abs(current_impression['category'].iloc[j] - user_previous_history['category']))
        distance_score += mean(abs(current_impression['subcategory'].iloc[j] - user_previous_history['subcategory']))
        distance_score += mean(abs(current_impression['total_inviews'].iloc[j] - user_previous_history['total_inviews']))
        distance_score += mean(abs(current_impression['total_pageviews'].iloc[j] - user_previous_history['total_pageviews']))
        distance_score += mean(abs(current_impression['total_read_time'].iloc[j] - user_previous_history['total_read_time']))
        distance_score += mean(abs(current_impression['sentiment_score'].iloc[j] - user_previous_history['sentiment_score']))
        distance_score += mean(abs(current_impression['article_type'].iloc[j] - user_previous_history['article_type']))
        distance_score += (mean(abs(current_impression['category_str'].iloc[j] - user_previous_history['category_str'])) / 5)
        distance_score += mean(abs(current_impression['sentiment_label'].iloc[j] - user_previous_history['sentiment_label']))
        
        current_impression['rank'].iloc[j] = round(current_impression['rank'].iloc[j] + distance_score, 5)
    
    current_impression['rank'] = current_impression['rank'].rank()
    current_impression['rank'] = round(1 / current_impression['rank'], 5)
    
    y_pred = current_impression['rank'].tolist()
    y_true = current_impression_y_truth['truth'].tolist()
    
    total_roc_auc_score_list.append(roc_auc_score(y_true, y_pred))
    
    i = i + current_impression[current_impression.columns[0]].count()
    


In [22]:
print("----------")
print("Average AUC score across all impressions: " + str(mean(total_roc_auc_score_list)))

----------
Average AUC score across all impressions: 0.9698444749253172


-------------------------------------------------
