# Import Libraries

In [1]:
import mysql.connector
from datetime import datetime
import os

In [2]:
import pandas as pd
import numpy as np

In [3]:
import re
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse

In [4]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from sklearn.metrics.pairwise import cosine_similarity

In [41]:
from collections import defaultdict

In [6]:
try:
    conn = mysql.connector.connect(
        user='root',
        password='root',
        host='localhost',
        database='hbd' # Optional: connect to a specific DB directly
    )

    if conn.is_connected():
        print("Connection established successfully!")

except mysql.connector.Error as err:
    print(f"Error: {err}")

Connection established successfully!


## Fetching posts data

In [7]:
query = """
SELECT 
    p.post_id,
    p.user_id,
    u.state_id,
    u.city_id,
    u.area_id,
    p.post_type,
    p.description,
    p.location,

    -- include timestamps
    p.created_at,
    p.updated_at,

    -- total number of users who reacted
    JSON_LENGTH(p.user_reacts_clean) AS total_reactions,

    -- count likes
    (
        SELECT COUNT(*) 
        FROM JSON_TABLE(
            p.user_reacts_clean,
            '$.*' COLUMNS (
                reaction VARCHAR(50) PATH '$'
            )
        ) jt
        WHERE reaction = 'like'
    ) AS like_count,

    -- count loves
    (
        SELECT COUNT(*) 
        FROM JSON_TABLE(
            p.user_reacts_clean,
            '$.*' COLUMNS (
                reaction VARCHAR(50) PATH '$'
            )
        ) jt
        WHERE reaction = 'love'
    ) AS love_count

FROM posts p
LEFT JOIN users u 
    ON p.user_id = u.id;
"""

In [8]:
posts_df = pd.read_sql(query, conn)
posts_df.columns

  posts_df = pd.read_sql(query, conn)


Index(['post_id', 'user_id', 'state_id', 'city_id', 'area_id', 'post_type',
       'description', 'location', 'created_at', 'updated_at',
       'total_reactions', 'like_count', 'love_count'],
      dtype='object')

In [9]:
posts_df['created_at'] = pd.to_datetime(posts_df['created_at'], unit='s', errors='coerce')

  posts_df['created_at'] = pd.to_datetime(posts_df['created_at'], unit='s', errors='coerce')


In [10]:
posts_df.drop('location', axis = 1, inplace = True)

In [11]:
def normalize_url(url):
    try:
        parsed = urlparse(url)
        domain = parsed.netloc.replace('www.', '')
        domain = re.sub(r'\.(com|org|in|net|co|io)$', '', domain)

        path = parsed.path.replace('/', ' ')
        text = f"{domain} {path}"
        return text.strip()
    except Exception:
        return ""

def clean_text(text):
    if pd.isna(text):
        return ""

    text = str(text).strip()

    # If text is only a URL
    if re.match(r'^https?://', text):
        text = normalize_url(text)

    # Strip HTML
    text = BeautifulSoup(text, "html.parser").get_text(" ", strip=True)

    # Normalize
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)  # optional, keeps it clean
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [12]:
posts_df['clean_description'] = posts_df['description'].apply(clean_text)

In [13]:
len(posts_df)

420

In [14]:
model = SentenceTransformer('all-MiniLM-L6-v2')
texts = posts_df['clean_description'].fillna("").tolist()

In [15]:
embeddings = model.encode(
    texts,
    batch_size=64,
    show_progress_bar=True,
    normalize_embeddings=True  # IMPORTANT for cosine similarity
)
# embeddings shape: (num_posts, 384)
print(embeddings.shape)

Batches: 100%|██████████| 7/7 [00:03<00:00,  1.92it/s]

(420, 384)





In [16]:
assert len(posts_df) == embeddings.shape[0], \
    "Mismatch between posts_df rows and embeddings"

print("Posts:", len(posts_df))
print("Embeddings shape:", embeddings.shape)

Posts: 420
Embeddings shape: (420, 384)


#### post_id → embedding index mapping

In [17]:
post_id_to_index = {
    post_id: idx
    for idx, post_id in enumerate(posts_df['post_id'].values)
}

In [18]:
# optional reverse mapping

index_to_post_id = {
    idx: post_id
    for post_id, idx in post_id_to_index.items()
}

#### Build the frozen embedding store (in-memory)

In [19]:
post_embedding_store = {
    "embeddings": embeddings,              # numpy array (N, 384)
    "post_id_to_index": post_id_to_index,  # dict
}

# save the embedding
np.save("post_embeddings.npy", embeddings)
# and the post order - veryyy imp
posts_df[['post_id']].to_csv(
    "post_id_index.csv",
    index=False
)

#### how to reload the embeddings - later; for refr

```
import numpy as np
import pandas as pd

embeddings = np.load("post_embeddings.npy")
post_ids = pd.read_csv("post_id_index.csv")

post_id_to_index = {
    post_id: idx
    for idx, post_id in enumerate(post_ids['post_id'].values)
}

```

In [20]:
def get_post_embedding(post_id):
    idx = post_id_to_index.get(post_id)
    if idx is None:
        return None
    return embeddings[idx]


## Fetching user_interaction data

In [23]:
query = """SELECT
    user_id,
    content_id AS post_id,

    SUM(CASE WHEN event_name = 'view' THEN 1 ELSE 0 END)    AS view_count,
    SUM(CASE WHEN event_name = 'like' THEN 1 ELSE 0 END)    AS like_count,
    SUM(CASE WHEN event_name = 'comment' THEN 1 ELSE 0 END) AS comment_count,
    SUM(CASE WHEN event_name = 'share' THEN 1 ELSE 0 END)   AS share_count,

    COUNT(*) AS interaction_count,
    MAX(created_at) AS last_interaction_at

FROM user_activity_logs

WHERE content_type = 'post'

GROUP BY user_id, content_id;
"""

In [24]:
user_hist = pd.read_sql(query, conn)

  user_hist = pd.read_sql(query, conn)


In [25]:
user_hist.head(3)

Unnamed: 0,user_id,post_id,view_count,like_count,comment_count,share_count,interaction_count,last_interaction_at
0,82,82,0.0,1.0,0.0,0.0,2,2025-07-19 13:14:23
1,309,309,0.0,1.0,0.0,0.0,5,2025-07-25 00:13:06
2,594,427,0.0,0.0,2.0,0.0,2,2025-07-21 22:03:38


In [26]:
post_interactions = (
    user_hist
    .groupby('post_id')
    .agg({
        'view_count': 'sum',
        'like_count': 'sum',
        'comment_count': 'sum',
        'share_count': 'sum',
        'interaction_count': 'sum'
    })
    .reset_index()
)

post_interactions['popularity_score'] = (
    1.0 * post_interactions['view_count'] +
    3.0 * post_interactions['like_count'] +
    4.0 * post_interactions['comment_count'] +
    5.0 * post_interactions['share_count'] +
    0.3 * post_interactions['interaction_count']   # increasing the wgt more than 0.4 
                                                    # can make highly viewed post easily win over 
                                                    #  highly liked, commented posts
    )

In [27]:
# Base score = depth of engagement (explicit signals)
user_hist['base_score'] = (
    1.0 * user_hist['view_count'] +
    3.0 * user_hist['like_count'] +
    4.0 * user_hist['comment_count'] +
    5.0 * user_hist['share_count']
)

# Presence-adjusted engagement (weak implicit signal)
user_hist['engagement_score'] = (
    user_hist['base_score'] +
    0.3 * user_hist['interaction_count']
)

# Frequency dampening (prevents heavy users dominating)
user_hist['freq_score'] = np.log1p(user_hist['engagement_score'])


In [29]:
posts_df = posts_df.merge(
    post_interactions,
    on='post_id',
    how='left'
)

# Fill all interaction-related columns safely
interaction_cols = [
    'view_count',
    'like_count',
    'comment_count',
    'share_count',
    'interaction_count',
    'popularity_score'  
]

for col in interaction_cols:
    if col in posts_df.columns:
        posts_df[col] = posts_df[col].fillna(0)

In [30]:
posts_df.head(3)

Unnamed: 0,post_id,user_id,state_id,city_id,area_id,post_type,description,created_at,updated_at,total_reactions,like_count_x,love_count,clean_description,view_count,like_count_y,comment_count,share_count,interaction_count,popularity_score
0,5,1,7.0,3064.0,54678.0,profile_picture,,2024-08-26 05:49:00,1724651340,2,1,1,,0.0,,0.0,0.0,0.0,0.0
1,6,8,,,,profile_picture,,2024-08-27 06:29:07,1724740147,0,0,0,,0.0,,0.0,0.0,0.0,0.0
2,7,1,7.0,3064.0,54678.0,general,,2024-08-27 06:58:53,1724741933,2,2,0,,0.0,,0.0,0.0,0.0,0.0


### the foll code snippet can be used in future to incorporate recency decay
```
#reference time (now or max timestamp)
now = pd.Timestamp.utcnow()

user_hist_df['days_since_interaction'] = (
    now - user_hist_df['last_interaction_at']
).dt.days.clip(lower=0)

TAU = 30  # decay window in days

user_hist_df['recency_weight'] = np.exp(
    - user_hist_df['days_since_interaction'] / TAU
)

user_hist_df['interaction_score'] = (
    user_hist_df['freq_score'] *
    user_hist_df['recency_weight']
)

```


#### Build user embeddings using interaction score

In [31]:
user_hist.columns

Index(['user_id', 'post_id', 'view_count', 'like_count', 'comment_count',
       'share_count', 'interaction_count', 'last_interaction_at', 'base_score',
       'engagement_score', 'freq_score'],
      dtype='object')

In [29]:
assert 'user_id' in user_hist.columns
assert 'post_id' in user_hist.columns
assert 'freq_score' in user_hist.columns

In [32]:
user_hist_df = user_hist[
    (user_hist['interaction_count'] > 0) &
    (user_hist['post_id'].isin(post_id_to_index))
]

#### build user embeddings

In [33]:
user_embeddings = {}

for user_id, grp in user_hist_df.groupby('user_id'):

    # Minimum signal threshold
    if grp['interaction_count'].sum() < 2:
        continue

    vectors = []
    weights = []

    for _, row in grp.iterrows():
        post_id = row['post_id']
        score = row['freq_score']

        idx = post_id_to_index[post_id]
        vectors.append(embeddings[idx])
        weights.append(score)

    weights = np.array(weights)
    weights = weights / weights.sum()   # normalize

    user_embeddings[user_id] = np.average(
        np.vstack(vectors),
        axis=0,
        weights=weights
    )

In [34]:
len(user_embeddings)

8

In [35]:
sample_user = next(iter(user_embeddings))
user_embeddings[sample_user].shape

(384,)

In [36]:
np.linalg.norm(user_embeddings[sample_user])

np.float64(0.7442885335127195)

User vector = “things I like”
Post vector = “what this post is about”
Cosine similarity = “how much overlap in meaning”

In [37]:
#tells us the content level relevance
# use this only for warm users
def user_post_similarity(user_id, post_id):
    if user_id not in user_embeddings:
        return 0.0
    if post_id not in post_id_to_index:
        return 0.0

    u = user_embeddings[user_id].reshape(1, -1)
    p = embeddings[post_id_to_index[post_id]].reshape(1, -1)

    return cosine_similarity(u, p)[0][0]

In [39]:
user_hist_df.head(3)

Unnamed: 0,user_id,post_id,view_count,like_count,comment_count,share_count,interaction_count,last_interaction_at,base_score,engagement_score,freq_score
0,82,82,0.0,1.0,0.0,0.0,2,2025-07-19 13:14:23,3.0,3.6,1.526056
2,594,427,0.0,0.0,2.0,0.0,2,2025-07-21 22:03:38,8.0,8.6,2.261763
5,595,441,0.0,1.0,1.0,0.0,2,2025-07-21 23:35:17,7.0,7.6,2.151762


In [40]:
user_type_affinity = (
    user_hist_df
    .merge(
        posts_df[['post_id', 'post_type']],
        on='post_id',
        how='left'
    )
    .dropna(subset=['post_type'])
)

# Aggregate engagement by user + post_type
user_type_affinity = (
    user_type_affinity
    .groupby(['user_id', 'post_type'])['engagement_score']
    .sum()
    .reset_index()
)

# Normalize per user
user_type_affinity['type_affinity'] = (
    user_type_affinity
    .groupby('user_id')['engagement_score']
    .transform(lambda x: x / x.sum())
)

In [42]:
user_post_type_pref = {
    uid: defaultdict(float, zip(grp['post_type'], grp['type_affinity']))
    for uid, grp in user_type_affinity.groupby('user_id')
}

## Recommendation logic

### Phase A: Candidate generation

Scoring all posts for every user is inefficient and unnecessary.
We first shortlist likely-relevant posts.

The following cells: 

These functions exist to inject geographic relevance into candidate generation and fallback logic

In [43]:
def get_area_posts(area_id):
    if pd.isna(area_id):
        return []
    return posts_df[
        posts_df['area_id'] == area_id
    ]['post_id'].tolist()

In [44]:
def get_city_posts(city_id):
    if pd.isna(city_id):
        return []
    return posts_df[
        posts_df['city_id'] == city_id
    ]['post_id'].tolist()

In [45]:
def get_state_posts(state_id):
    if pd.isna(state_id):
        return []
    return posts_df[
        posts_df['state_id'] == state_id
    ]['post_id'].tolist()

In [46]:
def get_user_location(user_id):
    user_posts = user_hist_df[user_hist_df['user_id'] == user_id]

    merged = user_posts.merge(
        posts_df[['post_id', 'state_id', 'city_id', 'area_id']],
        on='post_id',
        how='left'
    )

    def most_common(series):
        s = series.dropna()
        return s.value_counts().idxmax() if len(s) > 0 else None

    return {
        'area_id': most_common(merged['area_id']),
        'city_id': most_common(merged['city_id']),
        'state_id': most_common(merged['state_id']),
    }

In [48]:
#make created_at col utc aware for next step
posts_df['created_at'] = pd.to_datetime(
    posts_df['created_at'],
    utc=True
)

In [49]:
# Recent posts (e.g., last 15 days)
RECENT_DAYS = 15
recent_cutoff = pd.Timestamp.utcnow() - pd.Timedelta(days=RECENT_DAYS)

recent_posts = posts_df.loc[
    posts_df['created_at'] >= recent_cutoff,
    'post_id'
].tolist()


# Popular posts (top-N by engagement)
TOP_N = 200
popular_posts = (
    posts_df
    .sort_values('popularity_score', ascending=False)
    .head(TOP_N)['post_id']
    .tolist()
)

Area posts → highest precision

City posts → expand recall

State posts → safety net

Prevents empty or tiny candidate sets

In [None]:
def get_candidates_for_user(user_id, min_candidates=200):
    
    candidates = set()

    # Global fallback pools (always present)
    candidates.update(recent_posts)
    candidates.update(popular_posts)

    # Location-aware expansion
    loc = get_user_location(user_id)

    if loc['area_id'] is not None:
        candidates.update(get_area_posts(loc['area_id']))

    if len(candidates) < min_candidates and loc['city_id'] is not None:
        candidates.update(get_city_posts(loc['city_id']))

    if len(candidates) < min_candidates and loc['state_id'] is not None:
        candidates.update(get_state_posts(loc['state_id']))

    return list(candidates)

In [51]:
def recency_days(post_created_at, ref_time):
    delta = ref_time - post_created_at
    return delta.total_seconds() / 86400.0

In [57]:
# Build once, outside
posts_by_id = posts_df.set_index('post_id')

In [58]:
def build_user_feature_rows(user_id, ref_time):
    """
    Builds ranking feature rows for a single user.
    Assumes ref_time is already timezone-safe.
    """

    rows = []

    candidates = get_candidates_for_user(user_id)

    user_loc = get_user_location(user_id)
    user_area  = user_loc['area_id']
    user_city  = user_loc['city_id']
    user_state = user_loc['state_id']

    has_embedding = int(user_id in user_embeddings)

    for post_id in candidates:
        if post_id not in posts_by_id.index:
            continue

        post_row = posts_by_id.loc[post_id]

        row = {
            'user_id': user_id,
            'post_id': post_id,

            # Personalization (warm users only)
            'semantic_similarity': (
                user_post_similarity(user_id, post_id)
                if has_embedding else 0.0
            ),

            # Cold / warm indicator
            'has_user_embedding': has_embedding,

            # Location signals
            'same_area': int(user_area is not None and post_row['area_id'] == user_area),
            'same_city': int(user_city is not None and post_row['city_id'] == user_city),
            'same_state': int(user_state is not None and post_row['state_id'] == user_state),

            # Freshness
            'recency_days': recency_days(
                post_row['created_at'],
                ref_time
            ),

            # Popularity
            'popularity_score': post_row['popularity_score'],

            # Content type
            'post_type': post_row['post_type'],
        }

        rows.append(row)

    return rows

In [59]:
ref_time = pd.Timestamp.utcnow()
if ref_time.tzinfo is None:
    ref_time = ref_time.tz_localize('UTC')

rows = build_user_feature_rows(user_id, ref_time)

In [60]:
ref_time = pd.Timestamp.utcnow()
if ref_time.tzinfo is None:
    ref_time = ref_time.tz_localize('UTC')

In [61]:
all_rows = []

for user_id in user_hist_df['user_id'].unique():
    all_rows.extend(
        build_user_feature_rows(user_id, ref_time)
    )

ranking_df = pd.DataFrame(all_rows)

In [62]:
ranking_df.columns

Index(['user_id', 'post_id', 'semantic_similarity', 'has_user_embedding',
       'same_area', 'same_city', 'same_state', 'recency_days',
       'popularity_score', 'post_type'],
      dtype='object')

In [65]:
ranking_df.isnull().sum()

user_id                0
post_id                0
semantic_similarity    0
has_user_embedding     0
same_area              0
same_city              0
same_state             0
recency_days           0
popularity_score       0
post_type              0
dtype: int64

In [64]:
MAX_RECENCY_DAYS = ranking_df['recency_days'].max()

ranking_df['recency_days'] = ranking_df['recency_days'].fillna(
    MAX_RECENCY_DAYS + 1
)

In [66]:
ranking_df = pd.get_dummies(
    ranking_df,
    columns=['post_type'],
    prefix='post_type',
    dummy_na=False
)

In [67]:
ranking_df.head(3)

Unnamed: 0,user_id,post_id,semantic_similarity,has_user_embedding,same_area,same_city,same_state,recency_days,popularity_score,post_type_event,post_type_general,post_type_live_streaming,post_type_profile_picture,post_type_share,post_type_shorts,post_type_video
0,82,28,1.0,1,0,0,0,449.095732,0.0,False,False,False,True,False,False,False
1,82,29,0.00987,1,0,0,0,449.091901,0.0,False,True,False,False,False,False,False
2,82,30,0.086061,1,0,0,0,448.911577,0.0,False,True,False,False,False,False,False


In [68]:
positive_pairs = (
    user_hist_df[['user_id', 'post_id']]
    .drop_duplicates()
)

ranking_df = ranking_df.merge(
    positive_pairs.assign(label=1),
    on=['user_id', 'post_id'],
    how='left'
)

In [69]:
ranking_df['label'] = ranking_df['label'].fillna(0).astype(int)

In [70]:
ranking_df['label'].value_counts()

label
0    1704
1      15
Name: count, dtype: int64

In [72]:
ranking_df[ranking_df['label'] == 1].head()

Unnamed: 0,user_id,post_id,semantic_similarity,has_user_embedding,same_area,same_city,same_state,recency_days,popularity_score,post_type_event,post_type_general,post_type_live_streaming,post_type_profile_picture,post_type_share,post_type_shorts,post_type_video,label
45,82,82,1.0,1,0,0,0,359.972538,3.6,False,False,False,True,False,False,False,1
380,594,381,0.564018,1,0,0,0,480.168093,4.3,False,False,False,False,False,True,False,1
381,594,405,0.618293,1,0,1,1,480.168093,8.6,False,False,False,False,False,False,True,1
383,594,427,0.789399,1,1,1,1,149.976959,8.6,False,False,False,True,False,False,False,1
586,595,441,1.0,1,1,1,1,149.907457,7.6,False,False,False,True,False,False,False,1


In [74]:
feature_cols = [
    'semantic_similarity',
    'has_user_embedding',
    'same_area',
    'same_city',
    'same_state',
    'recency_days',
    'popularity_score'
]
# add post_type one-hot columns
feature_cols += [c for c in ranking_df.columns if c.startswith('post_type_')]

In [75]:
ranking_df = (
    ranking_df
    .sort_values(['user_id', 'recency_days'])
    .reset_index(drop=True)
)

In [76]:
X = ranking_df[feature_cols]
y = ranking_df['label']

Every row in ranking_df belongs to exactly one user group
and no rows are missing or duplicated across groups.

In [77]:
group = ranking_df.groupby('user_id').size().to_list()
sum(group) == len(ranking_df)  # must be True

True

In [78]:
import lightgbm as lgb

ranker = lgb.LGBMRanker(
    objective='lambdarank',
    metric='ndcg',
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    min_data_in_leaf=20,
    random_state=42
)


In [79]:
ranker.fit(
    X,
    y,
    group=group
)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000506 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 491
[LightGBM] [Info] Number of data points in the train set: 1719, number of used features: 13


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,200
,subsample_for_bin,200000
,objective,'lambdarank'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [80]:
# Create directory if not exists
os.makedirs("artifacts", exist_ok=True)

# Save model
ranker.booster_.save_model("artifacts/lgbm_ranker.txt")

<lightgbm.basic.Booster at 0x15009fff430>

In [81]:
ranking_df['rank_score'] = ranker.predict(X)



In [82]:
ranking_df['rank'] = (
    ranking_df
    .groupby('user_id')['rank_score']
    .rank(method='first', ascending=False)
)

In [83]:
TOP_K = 60

top_recs = (
    ranking_df
    .sort_values(['user_id', 'rank_score'], ascending=[True, False])
    .groupby('user_id')
    .head(TOP_K)
)

In [84]:
ranking_df.groupby('label')['rank_score'].mean()

label
0   -9.260599
1    4.459705
Name: rank_score, dtype: float64

manually inspecting top posts

In [85]:
uid = ranking_df['user_id'].iloc[0]

ranking_df[
    ranking_df['user_id'] == uid
].sort_values('rank_score', ascending=False)[
    ['post_id', 'label', 'semantic_similarity', 'same_city', 'recency_days']
].head(10)

Unnamed: 0,post_id,label,semantic_similarity,same_city,recency_days
15,475,1,0.744288,1,143.108961
237,489,1,0.744289,1,480.168093
231,381,0,0.744289,0,480.168093
232,405,0,0.077259,1,480.168093
28,406,0,0.244991,1,187.125385
19,458,0,-0.022749,0,146.934019
21,446,0,0.244991,0,149.217167
22,445,0,0.244991,0,149.219285
234,450,0,0.152072,0,480.168093
177,82,0,0.244991,0,359.972538


bottom ranked posts

In [86]:
ranking_df[
    ranking_df['user_id'] == uid
].sort_values('rank_score', ascending=True)[
    ['post_id', 'label', 'semantic_similarity', 'same_city', 'recency_days']
].head(5)

Unnamed: 0,post_id,label,semantic_similarity,same_city,recency_days
170,95,0,0.073362,0,357.825026
204,54,0,0.070956,0,428.122931
53,235,0,0.071465,0,306.9602
103,175,0,0.070505,0,325.086554
69,218,0,0.05987,0,308.933012


### for cold users with lil to no interactions - we implement the foll fallback logic

In [87]:
is_cold_user = (
    user_id not in user_embeddings and
    user_id not in user_hist_df['user_id'].values
)

In [75]:
loc = get_user_location(user_id)
area_id, city_id, state_id = loc['area_id'], loc['city_id'], loc['state_id']

In [88]:
def get_cold_start_candidates(user_id, min_candidates=200):
    candidates = set()
    loc = get_user_location(user_id)

    # 1️⃣ Location-aware popular posts (expand outward)
    if loc['area_id'] is not None:
        candidates.update(
            posts_df[
                posts_df['area_id'] == loc['area_id']
            ]
            .sort_values('popularity_score', ascending=False)
            .head(100)['post_id']
        )

    if len(candidates) < min_candidates and loc['city_id'] is not None:
        candidates.update(
            posts_df[
                posts_df['city_id'] == loc['city_id']
            ]
            .sort_values('popularity_score', ascending=False)
            .head(100)['post_id']
        )

    if len(candidates) < min_candidates and loc['state_id'] is not None:
        candidates.update(
            posts_df[
                posts_df['state_id'] == loc['state_id']
            ]
            .sort_values('popularity_score', ascending=False)
            .head(100)['post_id']
        )

    # 2️⃣ Global popularity fallback
    candidates.update(popular_posts)

    # 3️⃣ Recent fallback
    candidates.update(recent_posts)

    return list(candidates)

In [89]:
def cold_start_score(post_row, ref_time):
    score = 0.0

    # Popularity (primary signal for cold users)
    score += 0.6 * post_row['popularity_score']

    # Recency (smooth decay)
    days_old = (ref_time - post_row['created_at']).total_seconds() / 86400
    score += 0.3 * np.exp(-days_old / 15)

    # Optional content bias
    if post_row['post_type'] == 'video':
        score += 0.1

    return score

#### save schema

In [90]:
import json

with open("artifacts/feature_columns.json", "w") as f:
    json.dump(list(X.columns), f)

# lets test

In [None]:
USER_ID = 594
TOP_K = 10

In [None]:
is_warm_user = USER_ID in user_embeddings
print("Warm user:", is_warm_user)

In [None]:
candidates = get_candidates_for_user(USER_ID)

In [None]:
ref_time = posts_df['created_at'].max()

rows = []

user_loc = get_user_location(USER_ID)
user_area  = user_loc['area_id']
user_city  = user_loc['city_id']
user_state = user_loc['state_id']

for post_id in candidates:
    post_row = posts_df.loc[posts_df['post_id'] == post_id].iloc[0]

    rows.append({
        'post_id': post_id,

        # personalization
        'semantic_similarity': user_post_similarity(USER_ID, post_id),

        # cold start flag
        'has_user_embedding': 1,

        # location signals
        'same_area': int(user_area is not None and post_row['area_id'] == user_area),
        'same_city': int(user_city is not None and post_row['city_id'] == user_city),
        'same_state': int(user_state is not None and post_row['state_id'] == user_state),

        # freshness
        'recency_days': recency_days(post_row['created_at'], ref_time),

        # popularity
        'total_reactions': post_row['total_reactions'],

        # content type
        'post_type': post_row['post_type'],
    })


In [None]:
user_rank_df = pd.DataFrame(rows)

In [None]:
user_rank_df = pd.get_dummies(
    user_rank_df,
    columns=['post_type'],
    prefix='post_type'
)


In [None]:
post_ids = user_rank_df['post_id'].values

In [None]:
for col in X.columns:
    if col not in user_rank_df.columns:
        user_rank_df[col] = 0

user_rank_features = user_rank_df[X.columns]


In [None]:
scores = ranker.predict(user_rank_features)


In [None]:
recommended_posts = (
    pd.DataFrame({
        'post_id': post_ids,
        'rank_score': scores
    })
    .sort_values('rank_score', ascending=False)
    .head(TOP_K)
)


In [None]:
recommended_posts = recommended_posts.merge(
    posts_df[['post_id', 'description', 'post_type', 'created_at']],
    on='post_id',
    how='left'
)


In [None]:
recommended_posts


# creating inference pipeline

In [91]:
import json
import numpy as np
import pandas as pd
import lightgbm as lgb

# Load trained LightGBM ranker
ranker = lgb.Booster(model_file="artifacts/lgbm_ranker.txt")

# Load feature columns used during training
with open("artifacts/feature_columns.json", "r") as f:
    FEATURE_COLUMNS = json.load(f)

In [92]:
posts_by_id = posts_df.set_index('post_id')

def recommend_for_warm_user(user_id, top_k=10):

    if user_id not in user_embeddings:
        raise ValueError("Warm recommender called for non-warm user")

    candidates = get_candidates_for_user(user_id)

    # Correct reference time
    ref_time = pd.Timestamp.utcnow()
    if ref_time.tzinfo is None:
        ref_time = ref_time.tz_localize('UTC')

    user_loc = get_user_location(user_id)
    rows = []

    for post_id in candidates:
        if post_id not in posts_by_id.index:
            continue

        post_row = posts_by_id.loc[post_id]

        rows.append({
            'post_id': post_id,

            # Personalization
            'semantic_similarity': user_post_similarity(user_id, post_id),
            'has_user_embedding': 1,

            # Location
            'same_area': int(user_loc['area_id'] is not None and post_row['area_id'] == user_loc['area_id']),
            'same_city': int(user_loc['city_id'] is not None and post_row['city_id'] == user_loc['city_id']),
            'same_state': int(user_loc['state_id'] is not None and post_row['state_id'] == user_loc['state_id']),

            # Freshness & popularity
            'recency_days': recency_days(post_row['created_at'], ref_time),
            'popularity_score': post_row['popularity_score'],

            # Categorical
            'post_type': post_row['post_type']
        })

    user_df = pd.DataFrame(rows)

    # One-hot encode
    user_df = pd.get_dummies(user_df, columns=['post_type'], prefix='post_type')

    # Preserve post_id
    post_ids = user_df['post_id'].values

    # Strict feature alignment (CRITICAL)
    user_df = user_df.reindex(columns=FEATURE_COLUMNS, fill_value=0)

    X_user = user_df[FEATURE_COLUMNS]

    # Predict
    scores = ranker.predict(X_user)

    ranked = (
        pd.DataFrame({
            'post_id': post_ids,
            'rank_score': scores
        })
        .sort_values('rank_score', ascending=False)
        .head(top_k)
    )

    ranked = ranked.merge(
        posts_df[['post_id', 'description', 'post_type', 'created_at']],
        on='post_id',
        how='left'
    )

    return ranked.to_dict(orient='records')


In [93]:
posts_by_id = posts_df.set_index('post_id')

posts_by_id = posts_df.set_index('post_id')

def recommend_for_cold_user(user_id, top_k=10):

    # True cold-user check (AND, not OR)
    is_cold_user = (
        user_id not in user_embeddings and
        user_id not in user_hist_df['user_id'].values
    )

    if not is_cold_user:
        return []   # or None, depending on your router design

    candidates = get_cold_start_candidates(user_id)

    # Reference time
    ref_time = pd.Timestamp.utcnow()
    if ref_time.tzinfo is None:
        ref_time = ref_time.tz_localize('UTC')

    rows = []

    for post_id in candidates:
        if post_id not in posts_by_id.index:
            continue

        post_row = posts_by_id.loc[post_id]

        score = (
            0.6 * post_row['popularity_score'] +
            0.3 * np.exp(
                -recency_days(post_row['created_at'], ref_time) / 15
            )
        )

        rows.append({
            'post_id': post_id,
            'rank_score': score
        })

    ranked = (
        pd.DataFrame(rows)
        .sort_values('rank_score', ascending=False)
        .head(top_k)
        .merge(
            posts_df[['post_id', 'description', 'post_type', 'created_at']],
            on='post_id',
            how='left'
        )
    )

    return ranked.to_dict(orient='records')


In [94]:
posts_by_id = posts_df.set_index('post_id')

def recommend_for_semi_cold_user(user_id, top_k=10):

    # Semi-cold guard:
    # no embedding BUT has history
    is_semi_cold = (
        user_id not in user_embeddings and
        user_id in user_hist_df['user_id'].values
    )

    if not is_semi_cold:
        return []

    candidates = get_candidates_for_user(user_id)

    # Correct reference time
    ref_time = pd.Timestamp.utcnow()
    if ref_time.tzinfo is None:
        ref_time = ref_time.tz_localize('UTC')

    user_loc = get_user_location(user_id)
    type_pref = user_post_type_pref.get(user_id, {})

    rows = []

    for post_id in candidates:
        if post_id not in posts_by_id.index:
            continue

        post_row = posts_by_id.loc[post_id]

        # ---------- score components ----------

        popularity = post_row['popularity_score']

        recency = np.exp(
            -recency_days(post_row['created_at'], ref_time) / 15
        )

        type_affinity = type_pref.get(post_row['post_type'], 0.0)

        # location boost
        location_boost = 0.0
        if user_loc['area_id'] is not None and post_row['area_id'] == user_loc['area_id']:
            location_boost = 1.0
        elif user_loc['city_id'] is not None and post_row['city_id'] == user_loc['city_id']:
            location_boost = 0.6
        elif user_loc['state_id'] is not None and post_row['state_id'] == user_loc['state_id']:
            location_boost = 0.3

        # ---------- final score (normalized) ----------
        score = (
            0.45 * popularity +
            0.25 * recency +
            0.20 * type_affinity +
            0.10 * location_boost
        )

        rows.append({
            'post_id': post_id,
            'rank_score': score
        })

    ranked = (
        pd.DataFrame(rows)
        .sort_values('rank_score', ascending=False)
        .head(top_k)
        .merge(
            posts_df[['post_id', 'description', 'post_type', 'created_at']],
            on='post_id',
            how='left'
        )
    )

    return ranked.to_dict(orient='records')

In [95]:
def generate_recommendations(user_id, top_k=10):

    if user_id in user_embeddings:
        # semantic personalization
        recs = recommend_for_warm_user(user_id, top_k)
        user_type = "warm"

    elif user_id in user_post_type_pref:
        # behavior-only personalization (no text)
        recs = recommend_for_semi_cold_user(user_id, top_k)
        user_type = "semi_cold"

    else:
        # ❄️ no personalization possible
        recs = recommend_for_cold_user(user_id, top_k)
        user_type = "cold"

    return {
        "user_id": user_id,
        "user_type": user_type,
        "generated_at": pd.Timestamp.utcnow().isoformat(),
        "recommendations": recs
    }

In [98]:
response = generate_recommendations(594, top_k=10)
response


{'user_id': 594,
 'user_type': 'warm',
 'generated_at': '2025-12-18T10:55:13.236853+00:00',
 'recommendations': [{'post_id': 427,
   'rank_score': 6.85103776650439,
   'description': '',
   'post_type': 'profile_picture',
   'created_at': Timestamp('2025-07-21 10:24:14+0000', tz='UTC')},
  {'post_id': 381,
   'rank_score': 3.4846203682145496,
   'description': 'car accessories',
   'post_type': 'shorts',
   'created_at': NaT},
  {'post_id': 405,
   'rank_score': 3.4799428157242454,
   'description': 'testimonial',
   'post_type': 'video',
   'created_at': NaT},
  {'post_id': 489,
   'rank_score': 3.3046315212967494,
   'description': 'Car accessories',
   'post_type': 'video',
   'created_at': NaT},
  {'post_id': 446,
   'rank_score': 0.5223601858062689,
   'description': '',
   'post_type': 'general',
   'created_at': Timestamp('2025-07-22 04:38:20+0000', tz='UTC')},
  {'post_id': 445,
   'rank_score': 0.10578252028473764,
   'description': '',
   'post_type': 'general',
   'created_a

In [103]:
response = generate_recommendations(309, top_k=10)
response

{'user_id': 309,
 'user_type': 'warm',
 'generated_at': '2025-12-18T10:56:28.449842+00:00',
 'recommendations': [{'post_id': 458,
   'rank_score': 2.8196756808255135,
   'description': 'https://www.cityhangaround.com/view/single/post/381',
   'post_type': 'share',
   'created_at': Timestamp('2025-07-24 11:26:04+0000', tz='UTC')},
  {'post_id': 446,
   'rank_score': 1.4817278683030422,
   'description': '',
   'post_type': 'general',
   'created_at': Timestamp('2025-07-22 04:38:20+0000', tz='UTC')},
  {'post_id': 445,
   'rank_score': 1.0651502027815112,
   'description': '',
   'post_type': 'general',
   'created_at': Timestamp('2025-07-22 04:35:17+0000', tz='UTC')},
  {'post_id': 457,
   'rank_score': -1.4743300409041464,
   'description': '',
   'post_type': 'general',
   'created_at': Timestamp('2025-07-24 10:46:07+0000', tz='UTC')},
  {'post_id': 406,
   'rank_score': -2.3215121044109006,
   'description': '',
   'post_type': 'general',
   'created_at': Timestamp('2025-06-14 06:50:

In [100]:
response = generate_recommendations(1, top_k=10)
response

{'user_id': 1,
 'user_type': 'warm',
 'generated_at': '2025-12-18T10:55:24.963128+00:00',
 'recommendations': [{'post_id': 489,
   'rank_score': 6.772892688103034,
   'description': 'Car accessories',
   'post_type': 'video',
   'created_at': NaT},
  {'post_id': 475,
   'rank_score': 6.769436067256824,
   'description': 'Time is money',
   'post_type': 'general',
   'created_at': Timestamp('2025-07-28 07:14:09+0000', tz='UTC')},
  {'post_id': 381,
   'rank_score': 1.1601591629933474,
   'description': 'car accessories',
   'post_type': 'shorts',
   'created_at': NaT},
  {'post_id': 405,
   'rank_score': -2.5596552361020186,
   'description': 'testimonial',
   'post_type': 'video',
   'created_at': NaT},
  {'post_id': 406,
   'rank_score': -3.261761742451788,
   'description': '',
   'post_type': 'general',
   'created_at': Timestamp('2025-06-14 06:50:30+0000', tz='UTC')},
  {'post_id': 450,
   'rank_score': -4.49145294593693,
   'description': 'cleopatra',
   'post_type': 'video',
   '

In [108]:
response = generate_recommendations(82, top_k=10)
response

{'user_id': 82,
 'user_type': 'warm',
 'generated_at': '2025-12-18T10:59:25.528586+00:00',
 'recommendations': [{'post_id': 82,
   'rank_score': 2.976343244491466,
   'description': '',
   'post_type': 'profile_picture',
   'created_at': Timestamp('2024-12-23 10:30:36+0000', tz='UTC')},
  {'post_id': 209,
   'rank_score': 1.438753184652671,
   'description': '',
   'post_type': 'general',
   'created_at': Timestamp('2025-02-12 11:07:53+0000', tz='UTC')},
  {'post_id': 446,
   'rank_score': -1.391831840916667,
   'description': '',
   'post_type': 'general',
   'created_at': Timestamp('2025-07-22 04:38:20+0000', tz='UTC')},
  {'post_id': 445,
   'rank_score': -1.8150921579106671,
   'description': '',
   'post_type': 'general',
   'created_at': Timestamp('2025-07-22 04:35:17+0000', tz='UTC')},
  {'post_id': 406,
   'rank_score': -2.852456919895885,
   'description': '',
   'post_type': 'general',
   'created_at': Timestamp('2025-06-14 06:50:30+0000', tz='UTC')},
  {'post_id': 427,
   'r

In [110]:
response = generate_recommendations(3, top_k=10)
response

{'user_id': 3,
 'user_type': 'cold',
 'generated_at': '2025-12-18T11:00:26.667590+00:00',
 'recommendations': [{'post_id': 445,
   'rank_score': 5.160014301621606,
   'description': '',
   'post_type': 'general',
   'created_at': Timestamp('2025-07-22 04:35:17+0000', tz='UTC')},
  {'post_id': 427,
   'rank_score': 5.160013597165553,
   'description': '',
   'post_type': 'profile_picture',
   'created_at': Timestamp('2025-07-21 10:24:14+0000', tz='UTC')},
  {'post_id': 457,
   'rank_score': 4.560016624449498,
   'description': '',
   'post_type': 'general',
   'created_at': Timestamp('2025-07-24 10:46:07+0000', tz='UTC')},
  {'post_id': 441,
   'rank_score': 4.560013660314038,
   'description': '',
   'post_type': 'profile_picture',
   'created_at': Timestamp('2025-07-21 12:04:19+0000', tz='UTC')},
  {'post_id': 475,
   'rank_score': 2.5800214930121963,
   'description': 'Time is money',
   'post_type': 'general',
   'created_at': Timestamp('2025-07-28 07:14:09+0000', tz='UTC')},
  {'po

In [94]:
users_with_interactions = set(user_hist_df['user_id'].unique())
users_with_embeddings = set(user_embeddings.keys())
semi_cold_users = users_with_interactions - users_with_embeddings
semi_cold_users = list(semi_cold_users)

semi_cold_users[:10]

[]

In [95]:
len(user_hist_df)

14