In [None]:
import pandas as pd
from itertools import combinations
from collections import defaultdict, Counter
from tqdm import tqdm

# ------------------ Load ------------------
df = pd.read_csv("/content/personalized_test.tsv", sep="\t")
# Normalise the header names so the lookups below ("userid", "clicknewsid")
# do not depend on the casing or padding used in the file.
df.columns = [c.strip().lower() for c in df.columns]


def _parse_click_list(raw):
    """Split a comma-separated click history into stripped, non-empty IDs.

    A missing cell (NaN/None) yields an empty list. Without this guard
    ``str(NaN)`` would produce the bogus item ID "nan".
    """
    if pd.isna(raw):
        return []
    return [tok.strip() for tok in str(raw).split(",") if tok.strip()]


df["click_list"] = df["clicknewsid"].apply(_parse_click_list)
# userid -> click list. If a userid occurs on several rows, the last row wins.
user_sequences = dict(zip(df["userid"], df["click_list"]))

print(f"Users: {len(user_sequences)}")

# ------------------ Helpers ------------------
def iou_confidence(seq_a, seq_b):
    """Return the intersection-over-union of the distinct items of two sequences.

    Duplicates inside a sequence are ignored. When both sequences are empty
    the result is 0.0 rather than a division by zero.
    """
    items_a = set(seq_a)
    items_b = set(seq_b)
    union = items_a | items_b
    if not union:
        # The union is empty only when both inputs are empty.
        return 0.0
    overlap = items_a & items_b
    return len(overlap) / len(union)

def find_anchor_indices(seq, a1, a2):
    """Locate two anchors in ``seq`` and return their positions in ascending order.

    The first occurrence of each anchor is used. Returns ``(None, None)`` when
    either anchor is absent or when both resolve to the same position.
    """
    if a1 not in seq or a2 not in seq:
        return None, None

    first = seq.index(a1)
    second = seq.index(a2)
    if first == second:
        # Both anchors resolve to the same position: not a usable pair.
        return None, None

    return min(first, second), max(first, second)

# ------------------ Inverted Indexes ------------------
# Map every clicked item to the set of users whose history contains it.
item_users = defaultdict(set)
for uid, seq in user_sequences.items():
    # De-duplicate first: a user is registered once per item regardless of
    # how many times the item occurs in the history.
    for it in set(seq):
        item_users[it].add(uid)

# An item can only anchor a cross-user swap if at least two users clicked it.
shared_items = {news_id for news_id in item_users if len(item_users[news_id]) >= 2}
print(f"Shared clicknewsIDs (appear in ≥2 users): {len(shared_items)}")

# Memo: unordered item pair -> set of users whose history contains both items.
both_anchor_users = {}
def users_with_both(a, b):
    """Return the set of users that clicked both ``a`` and ``b``.

    Results are memoised in ``both_anchor_users`` under an order-independent
    key, so ``(a, b)`` and ``(b, a)`` share one entry. The returned set is the
    cached object itself; callers should not mutate it.
    """
    if a <= b:
        key = (a, b)
    else:
        key = (b, a)

    cached = both_anchor_users.get(key)
    if cached is None:
        cached = item_users[a] & item_users[b]
        both_anchor_users[key] = cached
    return cached

# ------------------ Exhaustive Cross-Trajectory Augmentation ------------------
# For every user A: try every pair of shared items ("anchors") in A's history
# and every other user B whose history contains both anchors. A candidate
# replaces A's anchor-to-anchor slice with B's; the best-scoring candidate
# per user is kept.
Tc = -1.0  # minimum IoU for a swap; IoU is never negative, so -1.0 accepts any pair whose anchors co-exist (raise above 0 to be stricter)
augmented = {}    # uid -> augmented click list (only users that were changed)
reasons = {}      # uid -> reason if unchanged
diagnostics = []  # one dict per augmented user describing the chosen swap

for uid_a, Sa in tqdm(user_sequences.items(), desc="Cross-trajectory shuffling (exhaustive)"):
    # 1) anchors must be items that at least one other user also clicked
    usable = [x for x in Sa if x in shared_items]
    if len(usable) < 2:
        reasons[uid_a] = "no_shared_items_in_user"
        continue

    # Distinct shared items -> index of their first occurrence in Sa, in order
    # of appearance. Computing this once avoids re-scanning Sa for every pair,
    # and de-duplicating avoids evaluating the same anchor pair repeatedly
    # when an item occurs more than once in the history.
    first_pos = {}
    for idx, item in enumerate(Sa):
        if item in shared_items:
            first_pos.setdefault(item, idx)

    # Every pair (io1, io2) has io1 first appearing before io2 in Sa.
    anchors_for_user = list(combinations(first_pos, 2))
    if not anchors_for_user:
        reasons[uid_a] = "no_valid_anchor_pairs_in_user"
        continue

    best = None  # (score_tuple, io_pair, partner_uid, Sa_pseudo, meta)

    # 2) for every anchor pair, consider ALL partner users that have both anchors
    for io1, io2 in anchors_for_user:
        holders = users_with_both(io1, io2)
        partners = holders - {uid_a}
        if not partners:
            continue

        # Invariant for this anchor pair, so computed once. Note that it
        # counts uid_a itself, because uid_a holds both anchors too.
        fanout = len(holders)

        # anchor indices and slice in Sa (n1 < n2 by construction of the pairs)
        n1, n2 = first_pos[io1], first_pos[io2]
        Sa_s = Sa[n1:n2+1]

        # Sorted iteration makes tie-breaking between equally scored partners
        # reproducible; plain set order varies with string hash randomisation.
        for uid_b in sorted(partners, key=str):
            Sb = user_sequences[uid_b]
            m1, m2 = find_anchor_indices(Sb, io1, io2)
            if m1 is None:
                continue
            Sb_s = Sb[m1:m2+1]

            # Must actually change something. Because the prefix and suffix of
            # Sa are kept, a differing slice always yields a differing
            # sequence, so no further no-op check is needed.
            if Sb_s == Sa_s:
                continue

            C = iou_confidence(Sa_s, Sb_s)
            if C < Tc:
                continue

            # scoring: prefer higher IoU, then longer borrowed slice, then more fanout
            borrowed_len = m2 - m1 + 1
            score = (C, borrowed_len, fanout)

            # Strict '>' keeps the earliest candidate among equal scores.
            if (best is None) or (score > best[0]):
                # Build the swapped sequence only for a new best candidate.
                Sa_pseudo = Sa[:n1] + Sb_s + Sa[n2+1:]
                best = (score, (io1, io2), uid_b, Sa_pseudo, {
                    "iou": C,
                    "borrowed_len": borrowed_len,
                    "partner_fanout": fanout,
                })

    if best is not None:
        score, anchors, uid_b, Sa_pseudo, meta = best
        augmented[uid_a] = Sa_pseudo
        diagnostics.append({
            "userid": uid_a,
            "partner_user": uid_b,
            "anchors": anchors,
            "iou": round(meta["iou"], 4),
            "borrowed_len": meta["borrowed_len"],
            "partner_fanout": meta["partner_fanout"]
        })
    else:
        reasons[uid_a] = "no_partner_with_different_between_anchor_slice"

# ------------------ Attach & Save ------------------
def joined_or_original(u):
    """Return user ``u``'s augmented sequence as a comma-joined string.

    Falls back to the user's original (parsed) sequence when no augmentation
    was produced for that user.
    """
    return ",".join(augmented[u] if u in augmented else user_sequences[u])

df["augmented_clicknewsid"] = df["userid"].map(joined_or_original)

# Compare against the re-joined parsed list rather than the raw column, so
# that whitespace or empty tokens in the raw text are not counted as changes.
original_joined = df["click_list"].map(",".join)
changed = (original_joined != df["augmented_clicknewsid"]).sum()
share = changed / len(df) if len(df) else 0.0  # avoid ZeroDivisionError on an empty frame
print(f"Cross-trajectory augmented users: {changed}/{len(df)} ({share:.1%})")

# Explicit columns keep sort_values from raising KeyError when no user could
# be augmented and `diagnostics` is empty.
diag_columns = ["userid", "partner_user", "anchors", "iou", "borrowed_len", "partner_fanout"]
diag_df = pd.DataFrame(diagnostics, columns=diag_columns).sort_values(
    ["iou", "borrowed_len", "partner_fanout"], ascending=False
)
diag_df.to_csv("/content/cross_swap_diagnostics.tsv", sep="\t", index=False)
df.to_csv("/content/augmented_user_sequences_cross_only.tsv", sep="\t", index=False)

print("Saved:")
print("  • /content/augmented_user_sequences_cross_only.tsv")
print("  • /content/cross_swap_diagnostics.tsv")

# Optional: count the users that could not be augmented, grouped by reason
unchanged = [u for u in user_sequences if u not in augmented]
print(f"Unchanged users: {len(unchanged)}")
print("Reasons:", Counter(reasons.get(u, "unknown") for u in unchanged).most_common())

# Preview
display(df[["userid","clicknewsid","augmented_clicknewsid"]])
display(diag_df)


Users: 103
Shared clicknewsIDs (appear in ≥2 users): 2156


Cross-trajectory shuffling (exhaustive): 100%|██████████| 103/103 [00:01<00:00, 52.55it/s]

Cross-trajectory augmented users: 103/103 (100.0%)
Saved:
  • /content/augmented_user_sequences_cross_only.tsv
  • /content/cross_swap_diagnostics.tsv
Unchanged users: 0
Reasons: []





Unnamed: 0,userid,clicknewsid,augmented_clicknewsid
0,NT1,"N108480,N38238,N35068,N110487,N94904,N72378,N4...","N108480,N38238,N35068,N110487,N94904,N72378,N4..."
1,NT2,"N34682,N113236,N119039,N90826,N63278,N27346,N5...","N34682,N113236,N119039,N90826,N63278,N27346,N5..."
2,NT3,"N106204,N74279,N55583,N90083,N117690,N91663,N9...","N106204,N74279,N55583,N90083,N117690,N91663,N9..."
3,NT4,"N61892,N41396,N42145,N24440,N74099,N73577,N123...","N61892,N41396,N42145,N24440,N74099,N73577,N123..."
4,NT5,"N79801,N52642,N19270,N112075,N37402,N120660,N3...","N79801,N52642,N19270,N112075,N37402,N120660,N3..."
...,...,...,...
98,NT99,"N74855,N70285,N97607,N14984,N101784,N65808,N28...","N74855,N70285,N64432,N111819,N80833,N72876,N79..."
99,NT100,"N80527,N42741,N32568,N95477,N86762,N77781,N533...","N80527,N42741,N32568,N95477,N86762,N77781,N533..."
100,NT101,"N14290,N116936,N110697,N110669,N57257,N94449,N...","N14290,N116936,N110697,N110669,N57257,N94449,N..."
101,NT102,"N101579,N19049,N116697,N106313,N76716,N106985,...","N101579,N19049,N116697,N106313,N76716,N106985,..."


Unnamed: 0,userid,partner_user,anchors,iou,borrowed_len,partner_fanout
78,NT79,NT91,"(N112410, N86007)",1.0000,2,3
90,NT91,NT79,"(N86007, N112410)",1.0000,2,3
62,NT63,NT3,"(N46554, N62959)",0.6667,3,2
77,NT78,NT48,"(N86564, N64905)",0.6667,3,2
101,NT102,NT23,"(N22114, N15543)",0.6667,3,2
...,...,...,...,...,...,...
59,NT60,NT49,"(N87913, N72791)",0.0541,17,2
66,NT67,NT102,"(N84596, N75229)",0.0513,4,2
51,NT52,NT96,"(N110111, N49085)",0.0455,27,2
41,NT42,NT33,"(N36468, N23901)",0.0385,27,2


In [None]:
df

Unnamed: 0,userid,clicknewsid,posnewid,rewrite_titles,click_list,augmented_clicknewsid
0,NT1,"N108480,N38238,N35068,N110487,N94904,N72378,N4...","N24110,N62769,N36186,N101669,N19241,N72921,N26...",Legal battle looms over Trump EPA's rule chang...,"[N108480, N38238, N35068, N110487, N94904, N72...","N108480,N38238,N35068,N110487,N94904,N72378,N4..."
1,NT2,"N34682,N113236,N119039,N90826,N63278,N27346,N5...","N51765,N37815,N109881,N64357,N13381,N45697,N57...",What You Need to Know About GMOs;;What's Up wi...,"[N34682, N113236, N119039, N90826, N63278, N27...","N34682,N113236,N119039,N90826,N63278,N27346,N5..."
2,NT3,"N106204,N74279,N55583,N90083,N117690,N91663,N9...","N96078,N11699,N13028,N36049,N87968,N105007,N11...",Don't Know What's Popular This Summer? We've G...,"[N106204, N74279, N55583, N90083, N117690, N91...","N106204,N74279,N55583,N90083,N117690,N91663,N9..."
3,NT4,"N61892,N41396,N42145,N24440,N74099,N73577,N123...","N15817,N104663,N10362,N69465,N16287,N70636,N83...",Summer heat putting your pets at risk;;Trip Ad...,"[N61892, N41396, N42145, N24440, N74099, N7357...","N61892,N41396,N42145,N24440,N74099,N73577,N123..."
4,NT5,"N79801,N52642,N19270,N112075,N37402,N120660,N3...","N61157,N69119,N101472,N122218,N92462,N67440,N5...",Top News Stories from Texas ;;Some Simple Tips...,"[N79801, N52642, N19270, N112075, N37402, N120...","N79801,N52642,N19270,N112075,N37402,N120660,N3..."
...,...,...,...,...,...,...
98,NT99,"N74855,N70285,N97607,N14984,N101784,N65808,N28...","N55099,N48939,N85789,N32617,N10476,N23495,N747...",National News Updates ;;Fruit Tea Expansion |...,"[N74855, N70285, N97607, N14984, N101784, N658...","N74855,N70285,N64432,N111819,N80833,N72876,N79..."
99,NT100,"N80527,N42741,N32568,N95477,N86762,N77781,N533...","N28172,N64220,N108207,N112458,N108750,N51009,N...",The Blue Jays lead by Richard beat the Royals ...,"[N80527, N42741, N32568, N95477, N86762, N7778...","N80527,N42741,N32568,N95477,N86762,N77781,N533..."
100,NT101,"N14290,N116936,N110697,N110669,N57257,N94449,N...","N33068,N120666,N85039,N26146,N46240,N122884,N6...",Murdered Father A Hero By Donating Organs To S...,"[N14290, N116936, N110697, N110669, N57257, N9...","N14290,N116936,N110697,N110669,N57257,N94449,N..."
101,NT102,"N101579,N19049,N116697,N106313,N76716,N106985,...","N26153,N93627,N122237,N120408,N105451,N66158,N...",Sacramento State Captiol building to hang Prid...,"[N101579, N19049, N116697, N106313, N76716, N1...","N101579,N19049,N116697,N106313,N76716,N106985,..."
