**Step 1
Upload the dataset from Kaggle & basic analysis**

In [None]:
#from google.colab import files
#uploaded = files.upload()

In [149]:
#Installing plotly
#!pip install plotly

In [150]:
#Basic Analysis
import pandas as pd

# Load the dataset
df = pd.read_csv('multipleChoiceResponses_2017.csv',encoding='ISO-8859-1')

# Display the first few rows
#df.head()


Columns (31,83,86,87,98,99,109,116,123,124,127,129,130,164) have mixed types. Specify dtype option on import or set low_memory=False.



**Step 2: I used rule based segmentation. However, other segmentation techniques also can be used**

In [151]:
#manual segmentation
job_clusters = {
    'DBA/Database Engineer': 'Analytics',
'Operations Research Practitioner': 'Research',
'Computer Scientist': 'Data Scientist',
'Data Scientist': 'Data Scientist',
'Software Developer/Software Engineer': 'Software Engineer',
'Business Analyst': 'Analytics',
'Engineer': 'Analytics',
'Scientist/Researcher': 'Research',
'Researcher': 'Research',
'Other': 'Other',
'Data Analyst': 'Analytics',
'Machine Learning Engineer': 'Analytics',
'Statistician': 'Data Scientist',
'Predictive Modeler': 'Data Scientist',
'Programmer': 'Software Engineer',
'Data Miner': 'Analytics'
}
df['JobCluster'] = df['CurrentJobTitleSelect'].map(job_clusters).fillna('Other')

**Group Data for Sankey**

In [152]:
#Prepare Data for Sankey Chart
df_grouped = df.groupby(['FormalEducation', 'JobCluster']).size().reset_index(name='Count')

In [153]:
#Create Label Mapping
labels = list(pd.unique(df_grouped[['FormalEducation', 'JobCluster']].values.ravel('K')))
label_to_index = {label: idx for idx, label in enumerate(labels)}

df_grouped['Source'] = df_grouped['FormalEducation'].map(label_to_index)
df_grouped['Target'] = df_grouped['JobCluster'].map(label_to_index)

In [154]:
#cleaning up the labels before coloring
df_grouped['FormalEducation'] = df_grouped['FormalEducation'].str.strip()
df_grouped['JobCluster'] = df_grouped['JobCluster'].str.strip()
df_grouped['FormalEducation'] = df_grouped['FormalEducation'].str.replace("’", "'", regex=False)

In [155]:
# defining colors
base_colors = {
    "Bachelor's degree": (70, 130, 180),
    "Master's degree": (100, 149, 237),
    "Doctoral degree": (0, 128, 128)
}

In [156]:
# Drop rows with "Other" as Education or JobCluster
unwanted_education = ['Other', 'Professional degree']
df_grouped = df_grouped[~df_grouped['FormalEducation'].isin(unwanted_education)]
df_grouped = df_grouped[df_grouped['JobCluster'] != 'Other']

df_grouped = df_grouped[
    (df_grouped['FormalEducation'] != "Some college/university study without earning a bachelor's degree")
                      ]
df_grouped = df_grouped[
    (df_grouped['FormalEducation'] != "I did not complete any formal education past high school")
                      ]
df_grouped = df_grouped[
    (df_grouped['FormalEducation'] != "I prefer not to answer")
                      ]

In [157]:
#Assign Colors Based on Source & Count
import matplotlib.pyplot as plt

def get_shade(base_rgb, intensity):
    # Adjust color brightness based on normalized intensity (0–1)
    r, g, b = base_rgb
    return f"rgba({int(r*intensity)}, {int(g*intensity)}, {int(b*intensity)}, 0.5)"

# Normalize counts between 0–1
norm = plt.Normalize(df_grouped['Count'].min(), df_grouped['Count'].max())

# Generate a color per link
link_colors = []

for _, row in df_grouped.iterrows():
    education = row['FormalEducation']
    count = row['Count']
    base_rgb = base_colors.get(education, (150, 150, 150))
    shade = get_shade(base_rgb, norm(count))
    link_colors.append(shade)

In [158]:
# Default to gray if not found
def rgb_to_rgba(rgb, alpha=0.8):
    return f'rgba({rgb[0]},{rgb[1]},{rgb[2]},{alpha})'

# Initialize node color list
node_colors = []

for label in labels:
    if label in base_colors:  # It's an Education node
        color = rgb_to_rgba(base_colors[label])
    else:  # It's a JobCluster
        # Find matching education for this cluster, if available
        matches = df_grouped[df_grouped['JobCluster'] == label]
        if not matches.empty:
            edu = matches['FormalEducation'].iloc[0]
            base_rgb = base_colors.get(edu, (150, 150, 150))
        else:
            base_rgb = (150, 150, 150)  # fallback gray
        color = rgb_to_rgba(base_rgb)

    node_colors.append(color)

In [159]:
#Plot Sankey Diagram
import plotly.graph_objects as go

df_grouped['LinkLabel'] = (
    df_grouped['FormalEducation'].astype(str) +
    " → " +
    df_grouped['JobCluster'].astype(str) +
    "<br>Count: " +
    df_grouped['Count'].astype(str)
)

fig = go.Figure(data=[go.Sankey(
    #new added
    node=dict(
    pad=15,
    thickness=20,
    label=labels,
    color=node_colors,  # ← Use our list of rgba colors
    #line=dict(color="black", width=0.5)
),
    link=dict(
        source=df_grouped['Source'],
        target=df_grouped['Target'],
        value=df_grouped['Count'],
        customdata=df_grouped['LinkLabel'],
        hovertemplate='%{customdata}<extra></extra>',
        color=link_colors  # 👈 this applies the gradient shading
    )
)])
fig.update_layout(
    width=600,  # default is ~1000–1200
    height=600,
    title_text="Education to Job Segment Sankey (Shaded by Category + Strength)",
    font_size=10
)
#fig.update_layout(title_text="Education to Job Segment Sankey (Shaded by Category + Strength)")
fig.show()