In [17]:
import json
import re

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio

# --- User parameters: ---
# Input files.
CSV_EMB = 'embeddings_output_filtered.csv'    # path to service embeddings
COMPANY_CSV = 'merged_filtered.csv'          # path to company services

# Company groups to highlight in the plot. Each entry maps a legend name to
#   'indices': positional row numbers of the companies in COMPANY_CSV
#   'color'  : marker colour used for that group's points
groups = {
    'London': {
        'indices': [
            0, 3, 7, 8, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 28, 29, 30, 32, 34, 38, 39,
            43, 44, 45, 48, 50, 52, 53, 64, 65, 68,
            70, 71, 72, 73, 74, 79, 81, 83, 85, 86,
            90, 96, 98, 102, 103, 107, 110, 111, 114, 115,
            119, 120, 126, 128, 130, 131, 133, 138, 140, 141,
            146, 151, 157, 159, 165, 168, 169, 175, 177, 178,
            186, 263, 313, 406, 436, 473, 485, 530, 593, 625,
            725, 769, 811, 872, 945,
        ],
        'color': 'red',
    },
    'India': {
        'indices': [
            2, 4, 9, 25, 26, 36, 37, 40, 41, 51,
            55, 60, 66, 69, 75, 84, 88, 91, 95, 97,
            99, 117, 121, 124, 135, 137, 142, 145, 149, 160,
            180, 181, 183, 184, 185, 190, 192, 196, 199, 206,
            210, 212, 215, 224, 228, 234, 235, 244, 247, 248,
            254, 255, 256, 258, 260, 264, 266, 269, 274, 275,
            278, 285, 288, 289, 294, 297, 300, 308, 310, 319,
            320, 324, 326, 327, 350, 356, 358, 362, 365, 366,
            367, 370, 375, 377, 460, 474, 564, 650, 672, 730,
            777, 831, 955,
        ],
        'color': 'green',
    },
    # add more groups as needed
}
# ------------------------

# 1. Load service embeddings
def _parse_embedding(raw):
    """Turn one cell of the 'embedding' column into a NumPy array.

    String cells are decoded as JSON first; any other value is handed to
    ``np.array`` unchanged.
    """
    if isinstance(raw, str):
        return np.array(json.loads(raw))
    return np.array(raw)


services_df = pd.read_csv(CSV_EMB)
services_df['emb_vec'] = services_df['embedding'].apply(_parse_embedding)
# Stack the per-service vectors row-wise: one row per service.
emb_matrix = np.vstack(services_df['emb_vec'].to_numpy())

# 2. Define conceptual axes
# Each axis name maps to the keywords that identify services belonging to that
# concept; the insertion order of the axes fixes the x / y / z order of the plot.
axes = {
    # Advertising, public relations and brand work.
    'Ad/PR/Brand': [
        'advertisement', 'branding', 'brand strategy',
        'brand identity', 'brand equity', 'campaign',
        'creative concept', 'tagline', 'logo',
        'influencer marketing', 'social media marketing', 'content marketing',
        'guerilla marketing', 'experiential marketing', 'public relations',
        'press release', 'media buying', 'ad buying',
        'digital advertising', 'display advertising', 'programmatic',
        'out-of-home advertising', 'direct marketing', 'sponsorship',
        'storytelling', 'narrative', 'reputation management',
        'corporate communications', 'brand positioning', 'brand messaging',
    ],
    # Audio / video media and production work.
    'Media/Production': [
        'media', 'production', 'film production',
        'video production', 'audio production', 'post-production',
        'editing', 'sound design', 'color grading',
        'visual effects', 'motion graphics', 'cinematography',
        'lighting design', 'camera operation', 'studio recording',
        'live streaming', 'broadcast', 'videography',
        'podcasting', 'scriptwriting', 'storyboarding',
        'directing', 'producing', 'location scouting',
        'voice-over', 'video encoding', 'transcoding',
        'production management',
    ],
    # Graphic, UX and web design work.
    'Design/UX/Web': [
        'graphic design', 'UI design', 'UX design',
        'user experience', 'user interface', 'web design',
        'responsive design', 'mobile-first design', 'interaction design',
        'wireframing', 'prototyping', 'visual design',
        'typography', 'layout design', 'information architecture',
        'navigation design', 'accessibility', 'usability testing',
        'user research', 'design systems', 'pattern library',
        'iconography', 'illustration', 'front-end development',
        'HTML5', 'CSS3', 'JavaScript',
        'SaaS design', 'product design', 'branding design',
    ],
}

# 3. Compute axis basis
# For every conceptual axis, average the embeddings of all services whose text
# contains at least one of the axis keywords (case-insensitive match).
axis_vecs = {}
for name, kws in axes.items():
    # Escape each keyword so it is matched literally even if it contains regex
    # metacharacters (e.g. '+', '.', parentheses).
    pattern = '|'.join(re.escape(kw) for kw in kws)
    mask = services_df['text'].str.contains(pattern, case=False, na=False)
    if not mask.any():
        # The mean of an empty selection is NaN, which would silently turn the
        # whole projection into NaN; fail loudly instead.
        raise ValueError(
            f"No service text matches any keyword of axis {name!r}"
        )
    axis_vecs[name] = emb_matrix[mask.values].mean(axis=0)
A = np.vstack(list(axis_vecs.values()))  # (3, D): one centroid per axis

# Orthonormalise the centroids. QR is only unique up to the sign of each
# column of Q, so flip columns where needed to make diag(R) non-negative.
# With that convention the positive direction of each plotted axis points
# towards its centroid rather than possibly away from it.
Q, R = np.linalg.qr(A.T)                # Q: (D, 3), R: (3, 3)
signs = np.where(np.diag(R) < 0, -1.0, 1.0)
Q = Q * signs                           # scales column j of Q by signs[j]
proj_matrix = Q.T                       # (3, D)

# 4. Project services
# Coordinates of every service in the 3-D basis spanned by proj_matrix's rows.
proj = np.dot(emb_matrix, proj_matrix.T)
services_df[['x','y','z']] = proj

# 5. Plot services as base layer
fig = px.scatter_3d(services_df, x='x', y='y', z='z', hover_name='text')
fig.update_traces(marker={'size': 0.1, 'opacity': 0})

# Label the scene axes after the three conceptual axes and stretch x / y
# relative to z.
scene_settings = {
    'xaxis_title': 'Ad/PR/Brand',
    'yaxis_title': 'Media/Production',
    'zaxis_title': 'Design/UX/Web',
    'aspectmode': 'manual',
    'aspectratio': {'x': 1.5, 'y': 1.5, 'z': 1},
}
tight_margin = {'l': 0, 'r': 0, 't': 30, 'b': 0}
fig.update_layout(scene=scene_settings, margin=tight_margin)

# Applies only to traces whose mode is 'markers'; for those it replaces the
# marker size / opacity set right after the figure was created.
fig.update_traces(
    marker={'size': 1.3, 'opacity': 0.1},
    selector={'mode': 'markers'},
)

# 6. Load companies & embedding map
company_df = pd.read_csv(COMPANY_CSV)
service_cols = [f'service_{i}' for i in range(1,25)]
# Exact service text -> embedding vector.
emb_map = {
    text: vec
    for text, vec in zip(services_df['text'], services_df['emb_vec'])
}


def _mean_service_embedding(row):
    """Average the embeddings of the services listed in one company row.

    Empty (NaN) service cells and services without an entry in ``emb_map``
    are ignored. Returns ``None`` when no listed service has an embedding.
    """
    known = []
    for col in service_cols:
        service = row[col]
        if pd.isna(service):
            continue
        vec = emb_map.get(service)
        if vec is not None:
            known.append(vec)
    if not known:
        return None
    return np.mean(known, axis=0)


# 7. Process and plot each group separately
n_companies = len(company_df)
for group_name, opts in groups.items():
    group_embs, labels = [], []
    for idx in opts['indices']:
        # Indices outside the company table are skipped.
        if not 0 <= idx < n_companies:
            continue
        row = company_df.iloc[idx]
        company_vec = _mean_service_embedding(row)
        if company_vec is None:
            continue
        group_embs.append(company_vec)
        labels.append(f"{idx}: {row.get('company_name','')}" )

    # Nothing to draw for this group.
    if not group_embs:
        continue

    # Project the company vectors with the same basis as the services.
    comp_proj = np.dot(np.vstack(group_embs), proj_matrix.T)
    comp_df = pd.DataFrame(comp_proj, columns=['x','y','z'])
    comp_df['label'] = labels

    # One marker trace per group; the label is attached as the trace text.
    # (Use mode='markers+text' instead to draw the labels next to the points.)
    fig.add_scatter3d(
        x=comp_df['x'],
        y=comp_df['y'],
        z=comp_df['z'],
        mode='markers',
        text=comp_df['label'],
        marker={'size': 2, 'color': opts['color']},
        name=group_name,
    )

# Render: make the browser the default plotly renderer, re-apply the tight
# margins and open the finished figure.
pio.renderers.default = "browser"
fig.update_layout(margin={'l': 0, 'r': 0, 't': 30, 'b': 0})

fig.show()
