# In [18]:
import json
import re

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
# --- User parameters: ---
CSV_EMB = 'embeddings_output_filtered.csv'    # path to service embeddings
COMPANY_CSV = 'merged_filtered.csv'          # path to company services
# Row indices (into COMPANY_CSV) of the companies to overlay on the plot.
# Any list or range of ints works, for example:
#   company_indices = [0, 2, 5]
#   company_indices = list(range(10))  # first 10 companies
# Default: all companies. Only the first column is parsed here because just
# the row count is needed; the full table is loaded later in the script.
company_indices = list(range(len(pd.read_csv(COMPANY_CSV, usecols=[0]))))
# ------------------------

# 1. Load service embeddings
def _to_vector(raw):
    """Return *raw* as a NumPy array, decoding it from JSON first if it is a string."""
    return np.array(json.loads(raw) if isinstance(raw, str) else raw)


services_df = pd.read_csv(CSV_EMB)
services_df['emb_vec'] = services_df['embedding'].apply(_to_vector)
# Stack the per-service vectors row-wise into a single matrix.
emb_matrix = np.vstack(services_df['emb_vec'].to_list())

# 2. Define conceptual axes
# Each axis has its own keyword list; `axes` maps the axis label to that
# list, in plotting order.
_AD_PR_BRAND_KEYWORDS = [
    'advertisement', 'branding', 'brand strategy', 'brand identity',
    'brand equity', 'campaign', 'creative concept', 'tagline', 'logo',
    'influencer marketing', 'social media marketing', 'content marketing',
    'guerilla marketing', 'experiential marketing', 'public relations',
    'press release', 'media buying', 'ad buying', 'digital advertising',
    'display advertising', 'programmatic', 'out-of-home advertising',
    'direct marketing', 'sponsorship', 'storytelling', 'narrative',
    'reputation management', 'corporate communications',
    'brand positioning', 'brand messaging',
]

_MEDIA_PRODUCTION_KEYWORDS = [
    'media', 'production', 'film production', 'video production',
    'audio production', 'post-production', 'editing', 'sound design',
    'color grading', 'visual effects', 'motion graphics', 'cinematography',
    'lighting design', 'camera operation', 'studio recording',
    'live streaming', 'broadcast', 'videography', 'podcasting',
    'scriptwriting', 'storyboarding', 'directing', 'producing',
    'location scouting', 'voice-over', 'video encoding', 'transcoding',
    'production management',
]

_DESIGN_UX_WEB_KEYWORDS = [
    'graphic design', 'UI design', 'UX design', 'user experience',
    'user interface', 'web design', 'responsive design',
    'mobile-first design', 'interaction design', 'wireframing',
    'prototyping', 'visual design', 'typography', 'layout design',
    'information architecture', 'navigation design', 'accessibility',
    'usability testing', 'user research', 'design systems',
    'pattern library', 'iconography', 'illustration',
    'front-end development', 'HTML5', 'CSS3', 'JavaScript', 'SaaS design',
    'product design', 'branding design',
]

axes = {
    'Ad/PR/Brand': _AD_PR_BRAND_KEYWORDS,
    'Media/Production': _MEDIA_PRODUCTION_KEYWORDS,
    'Design/UX/Web': _DESIGN_UX_WEB_KEYWORDS,
}
# (The keyword lists above are given in full; an axis is built from the services whose text matches any of its keywords.)

# 3. Compute axis basis
# Each axis direction starts as the mean embedding of every service whose
# text contains at least one of the axis keywords (case-insensitive).
axis_vecs = {}
for name, kws in axes.items():
    # Escape the keywords so they are matched literally, never as regex syntax.
    pattern = '|'.join(re.escape(kw) for kw in kws)
    mask = services_df['text'].str.contains(pattern, case=False, na=False)
    if not mask.any():
        # The mean of zero rows is NaN and would silently poison the basis.
        raise ValueError(f"No service text matches any keyword of axis {name!r}")
    axis_vecs[name] = emb_matrix[mask.values].mean(axis=0)
A = np.vstack(list(axis_vecs.values()))
# QR orthonormalises the axis centroids in order: column 0 of Q follows the
# first centroid, and each later column is its centroid's component that is
# orthogonal to the earlier columns.
Q, R = np.linalg.qr(A.T)
# QR determines each column only up to sign. Flip columns so that diag(R) is
# non-negative, i.e. every basis vector points toward its own centroid
# instead of away from it, keeping the plot axes oriented as labelled.
signs = np.sign(np.diag(R))
signs[signs == 0] = 1.0
Q = Q * signs
proj_matrix = Q.T

# 4. Project services
# Coordinates of every service embedding in the axis basis, one row per service.
proj = emb_matrix @ proj_matrix.T
# Store the three coordinate columns on the services table for plotting.
services_df['x'], services_df['y'], services_df['z'] = proj.T

# 5. Base plot of services
# Service points are drawn small and mostly transparent (size 2.5, opacity 0.1).
service_marker = {'size': 2.5, 'opacity': 0.1}
# Axis titles plus a manually fixed aspect ratio for the 3-D scene.
scene_layout = {
    'xaxis_title': 'Ad/PR/Brand',
    'yaxis_title': 'Media/Production',
    'zaxis_title': 'Design/UX/Web',
    'aspectmode': 'manual',
    'aspectratio': {'x': 1.5, 'y': 1.5, 'z': 1},
}
plot_margin = {'l': 0, 'r': 0, 't': 30, 'b': 0}

fig = px.scatter_3d(services_df, x='x', y='y', z='z', hover_name='text')
fig.update_traces(marker=service_marker)
fig.update_layout(scene=scene_layout, margin=plot_margin)

# 6. Load companies & compute embeddings
company_df = pd.read_csv(COMPANY_CSV)
service_cols = [f'service_{i}' for i in range(1, 25)]
# Service text -> embedding vector, for looking up each company's services.
emb_map = dict(zip(services_df['text'], services_df['emb_vec']))
# Normalise the selection once: a bare int means "just that company", and a
# set makes the per-row membership test O(1) instead of a scan of the list.
if isinstance(company_indices, (int, np.integer)):
    selected_indices = {int(company_indices)}
else:
    selected_indices = set(company_indices)
company_embs, company_labels = [], []
for idx, row in company_df.iterrows():
    if idx not in selected_indices:
        continue  # skip unselected
    # Collect the embedding of every service the company lists; blank cells
    # and services without a known embedding are ignored.
    vecs = [emb_map.get(row[col]) for col in service_cols if pd.notna(row[col])]
    vecs = [v for v in vecs if v is not None]
    if vecs:
        # A company is represented by the mean of its service embeddings.
        company_embs.append(np.mean(vecs, axis=0))
        company_labels.append(f"{idx}: {row.get('company_name','')}")

# 7. Overlay selected companies
if company_embs:
    # Project the company embeddings with the same matrix used for the services.
    comp_proj = np.vstack(company_embs) @ proj_matrix.T
    comp_df = pd.DataFrame(comp_proj, columns=['x', 'y', 'z']).assign(
        label=company_labels
    )
    # Labels are attached as `text`, but only markers are drawn; the
    # alternative mode='markers+text' is left disabled.
    fig.add_scatter3d(
        x=comp_df['x'],
        y=comp_df['y'],
        z=comp_df['z'],
        mode='markers',
        text=comp_df['label'],
        marker={'size': 2, 'color': 'red'},
        name='Selected Companies',
    )
# The figure is shown even when no company was selected or matched.
fig.show()
