# Exploratory data analysis



In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import seaborn as sns
import re

## Read data from file:

In [96]:
# Load the survey responses. low_memory=False turns off chunked parsing so that
# each column's dtype is inferred from the whole file.
ds_survey_df = pd.read_csv(
    '../Dataset/final_data.csv',
    low_memory=False,
)

## **Ask some question:**

### **Question 01: How do factors such as residential country, current role, programming experience, current industry affect a person's salary? Is there any difference between men's and women's salaries? If yes, explain the reason.**

To answer the first question, let's see how the income is distributed by each factor.

### **1.1 Income distribution by residential country**:
Firstly, from the exploration above, we can see that over 50 percent of the values in the column `Current income` are missing. As we discussed earlier, these missing values come from survey participants who are currently students. Therefore, in this analysis, we simply drop all NaN values.
 

In [97]:
# Keep only the respondents who reported an income, then display how many
# respondents fall into each original survey bracket.
income_df = ds_survey_df.dropna(axis='index', subset=['Current income'])
income_df.loc[:, 'Current income'].value_counts()

Current income
$0-999              1112
10,000-14,999        493
30,000-39,999        464
1,000-1,999          444
40,000-49,999        421
100,000-124,999      404
5,000-7,499          391
50,000-59,999        366
7,500-9,999          362
150,000-199,999      342
20,000-24,999        337
60,000-69,999        318
15,000-19,999        299
70,000-79,999        289
25,000-29,999        277
2,000-2,999          271
125,000-149,999      269
3,000-3,999          244
4,000-4,999          234
80,000-89,999        222
90,000-99,999        197
200,000-249,999      155
250,000-299,999       78
300,000-499,999       76
$500,000-999,999      48
>$1,000,000           23
Name: count, dtype: int64

From the result above, we can see that the income is separated into too many ranges. We will group these ranges into a new, coarser set of income ranges that is easier to analyse.

In [98]:
# Coarse income brackets used for the rest of the analysis.
# Each entry maps a display label to a half-open interval [lower, upper) in USD;
# a raw survey bracket is assigned to the interval that contains its lower bound.
# The intervals are contiguous and non-overlapping, so the assignment does not
# depend on the order in which the entries are checked.
new_income_range = {
    '<$10,000': (0, 10000),
    '$10,000-50,000': (10000, 50000),
    '$50,000-100,000': (50000, 100000),
    '$100,000-300,000': (100000, 300000),
    '$300,000-500,000': (300000, 500000),
    '>$500,000': (500000, float('inf')),
}

Now, replace the current range with the new range for every sample in our dataset.

In [99]:
def _bracket_lower_bound(income_label):
    """Return the lower bound of a raw survey bracket as an int.

    The text before the first '-' is taken and stripped of ',', '$' and '>',
    e.g. '$0-999' -> 0, '10,000-14,999' -> 10000, '>$1,000,000' -> 1000000.
    """
    lower_text = income_label.split('-')[0]
    return int(lower_text.replace(',', '').replace('$', '').replace('>', ''))


def _to_new_range(income_label):
    """Return the label of the first bracket in `new_income_range` containing the label's lower bound.

    Raises:
        ValueError: if no bracket matches, instead of silently skipping the row
            (which would leave the result shorter than the dataframe).
    """
    lower = _bracket_lower_bound(income_label)
    for new_range, (lower_bound, upper_bound) in new_income_range.items():
        if lower_bound <= lower < upper_bound:
            return new_range
    raise ValueError(f'No income range covers the survey bracket {income_label!r}')


# Work on a copy so that `income_df` keeps the original survey brackets.
processed_income_df = income_df.copy()
new_range_list = [_to_new_range(income_range) for income_range in income_df['Current income']]

processed_income_df['Current income'] = new_range_list
processed_income_df['Current income'].value_counts()

Current income
<$10,000            3058
$10,000-50,000      2291
$50,000-100,000     1392
$100,000-300,000    1248
$300,000-500,000      76
>$500,000             71
Name: count, dtype: int64

We have converted the current income of all participants to the new income ranges. From now on, `processed_income_df` will be the main dataframe that we use for our analysis.     
Now, let's check how the income is distributed by residential country through these steps:
- Step 01: Group the income data by residential country then count the value for each range of current income.
- Step 02: Find the percentage of each income range in each country.
- Step 03: Use a `scatter geo plot` to present the distribution

In [100]:

# Step 01: count respondents per (country, income bracket).
# The count column is named explicitly so the result does not depend on the
# default name pandas gives to value_counts() output
# (NOTE(review): that default is believed to differ before pandas 2.0 - confirm).
income_by_countries_df = (
    processed_income_df
    .groupby('Residential country')['Current income']
    .value_counts()
    .reset_index(name='count')
)

# Step 02: share of each income bracket within its country, in percent.
country_totals = income_by_countries_df.groupby('Residential country')['count'].transform('sum')
income_by_countries_df['Percentage'] = (income_by_countries_df['count'] / country_totals * 100).round(3)

In [101]:

# Step 03: scatter-geo map with one bubble per (country, income bracket) row.
# Bubble size is driven by the 'Percentage' column, bubble colour by the bracket.
fig = px.scatter_geo(
    income_by_countries_df,
    locations='Residential country',
    locationmode='country names',
    color='Current income',
    size='Percentage',
    labels={'Current income': '<b>Income Range</b>'},
    # Legend order follows the brackets from lowest to highest income.
    category_orders={'Current income': ['<$10,000', '$10,000-50,000', '$50,000-100,000', '$100,000-300,000', '$300,000-500,000', '>$500,000']},
    opacity=0.7,
)

fig.update_geos(showcountries=True, countrycolor="black", showland=True, showocean=True, oceancolor="#E3F4F4", landcolor='#A9B388')

fig.update_layout(
    # The bold tag must be closed with </b>.
    title='<b>Percentage Distribution of Income Ranges by Country</b>',
    title_font_size=25,
    width=1000,
    height=600,
    margin=dict(l=20, r=20, t=70, b=0),
    paper_bgcolor='#ffe6cc',
)

# Horizontal legend anchored at the top-right corner of the map.
fig.update_layout(legend=dict(
    orientation="h", yanchor="bottom", y=0.97, xanchor="right", x=1))

fig.update_traces(marker=dict(sizemode='area', sizeref=0.1))
fig.update_traces(marker_autocolorscale=True, selector=dict(type='scattergeo'))
fig.update_traces(marker_line_color='black', marker_line_width=0.5, hovertemplate='%{marker.size} (%) of participants in %{location} have the income %{fullData.name}')

# Allow zooming the map with the mouse wheel.
config = {'scrollZoom': True}

fig.show(config=config)



- The countries with a high percentage of incomes in the range `<$10,000` are mostly located in `Asia`, `Africa` and `South America` (from about 40% up to over 80% in `Iran` or 90% in `Ethiopia`). Especially in the `Middle East` and `Africa`, this is the most common income range among participants.

- Unlike the range `<$10,000`, the range `$10,000-50,000` is distributed differently in most countries: its percentage is lower in `Africa` and the `Middle East` (as expected). There is also a big increase in its percentage for countries in `Europe`, while other countries keep roughly the same percentage as in the previous range. The high density of this range in `Europe` shows that it is the most popular income range on this continent.

- For the range `$50,000-100,000` there are some notable changes for all countries:
    - In `Africa`, most countries have no participant with an income in this range. This shows that it is a high-level income for most countries in `Africa`. Only `South Africa` has about 21.6% of participants who reach this income range.
    - In `Europe`, most countries have a high percentage of incomes in this range, along with the range `$10,000-50,000`.
    - Some countries in `Asia` also have a high percentage in this range: `China`, `Japan`, `South Korea`, `Taiwan`. They are all highly developed countries in `Asia`.
    - Other notable countries are `Canada`, `The USA` and `Australia`, which have very low percentages in the two lower ranges but a much higher percentage in this range.
- For the range `$100,000-300,000`, the percentage decreases slightly in most countries, except for `The USA`, `Australia` and `United Arab Emirates`. `Israel` stands out in particular, with up to 65.9% of its participants having an income in this range.
- The two remaining ranges are very high income ranges, so most countries have only a very small percentage in them.

In conclusion, we can see that there is a strong relationship between the residential country and the current income of the people living there. The regions with the most potential for a higher income are the `EU`, `North America` and `Australia`.

### **1.2 Income distribution by current role:**
In this part, we will see how the roles of participants affect their income.         
Firstly, in `Data exploration part 2`, we found that there are many roles that are not related to data jobs, so before starting our analysis, we will eliminate them.

In [102]:
# Roles treated as data-related in this section; all other roles are filtered out.
data_roles = [
    'Manager (Program, Project, Operations, Executive-level, etc)',
    'Machine Learning/ MLops Engineer',
    'Research Scientist',
    'Data Scientist',
    'Data Analyst (Business, Marketing, Financial, Quantitative, etc)',
    'Statistician',
    'Teacher / professor',
]

processed_income_df_copy = processed_income_df.copy()

# Number of respondents per (role, income bracket), restricted to the data roles.
role_income_df = (
    processed_income_df_copy
    .groupby('Current role')['Current income']
    .value_counts()
    .reset_index(name='Count')
)
is_data_role = role_income_df['Current role'].isin(data_roles)
filtered_role_income_df = role_income_df[is_data_role].copy()

# Shorten the name of roles for better visualization
short_role_names = {
    r'^Data Analyst.*': 'Data Analyst',
    r'^Manager.*': 'Manager',
    r'^Machine Learning.*': 'ML Engineer',
}
for role_pattern, short_name in short_role_names.items():
    filtered_role_income_df.loc[:, 'Current role'] = filtered_role_income_df['Current role'].replace(role_pattern, short_name, regex=True)

Then, we will find the `Percentage` of the income for each role

In [103]:
# Share of each income bracket within its role, in percent (two decimals),
# with the rows ordered from the smallest to the largest share.
role_totals = filtered_role_income_df.groupby('Current role')['Count'].transform('sum')
filtered_role_income_df['Percentage'] = (filtered_role_income_df['Count'] / role_totals * 100).round(2)
filtered_role_income_df = filtered_role_income_df.sort_values(by='Percentage', ascending=True)


Finally, make a visualization for a clear view of the distribution. In this case, I will use a stacked bar chart to show the distribution of income ranges for each data role.

In [104]:

# Horizontal stacked bars: one bar per role, one segment per income bracket.
fig = px.bar(
    filtered_role_income_df,
    x='Percentage',
    y='Current role',
    color='Current income',
    text='Percentage',
    barmode='stack',
    labels={
        'Current role': '<b>Current Role</b>',
        'Current income': '<b>Current Income</b>',
        'Percentage': '<b>Percentage</b>',
    },
    # Segments are ordered from the lowest to the highest income bracket.
    category_orders={
        'Current income': ['<$10,000', '$10,000-50,000', '$50,000-100,000', '$100,000-300,000', '$300,000-500,000', '>$500,000'],
    },
    color_discrete_sequence=['#F2F7A1', '#F05941', '#BE3144', '#973089', '#541c7b', '#141E46'],
)

# Title, size, colours and a horizontal legend placed above the plot area.
fig.update_layout(
    title="<b>The percentage of participants' current income for each role</b>",
    title_font_size=25,
    width=1100,
    height=600,
    margin={'l': 10, 'r': 10, 't': 110, 'b': 20},
    paper_bgcolor='#ffe6cc',
    xaxis={'tickfont': {'size': 14}},
    yaxis={'tickfont': {'size': 14}, 'ticksuffix': " "},
    legend={'orientation': "h", 'yanchor': "bottom", 'y': 1, 'xanchor': "right", 'x': 1},
)

fig.update_traces(
    marker_line_color='black',
    marker_line_width=1.2,
    hovertemplate='%{text} (%) of %{y}have the income %{fullData.name}',
)
fig.show()

Through the plot, we can get these conclusions:

- With `Statistician`, we can see that the majority of statisticians (59.26%) have an income `<$10,000`; also, no statistician has an income `>$500,000`, and only a small percentage (0.93%) earn `$300,000-500,000`. This indicates that the potential for a high income as a `Statistician` is quite limited.

- `Teacher` and `Data Analyst` have quite similar income distributions, with about 50% in the range `<$10,000`, followed by 25% in `$10,000-50,000` and 16% in `$100,000-300,000`, and a very low percentage in `>$500,000`; compared with `Statistician`, the overall income is slightly higher.

- With `Machine learning engineer`, `Data Scientist` and `Research Scientist`, more participants get a higher income: only about 30% have an income `<$10,000`, and the percentages in the ranges `$100,000-300,000` and `>$500,000` increase a lot, showing that these roles have more potential for a higher income.

- Among the roles, `Manager` has an overall income that outstrips the other roles, with over 30% in `$100,000-300,000` and 2.55% in `>$500,000`. This is reasonable because a `Manager` usually holds a higher position and has more skills and experience than the others.  

To sum up, there are large differences in how the income is distributed for each role. We can consider that `Manager` has the highest income potential, followed by `Machine learning engineer`, `Data Scientist` and `Research Scientist`.

###  **1.3 Income distribution by the programming experience**
We all know that, in the field of Information Technology, programming skills are widely recognized as one of the most important parts of every job in IT, and of data-related jobs as well. So in this section, we will explore how income is affected by programming experience. Do people with more programming experience have a higher potential to earn a higher income?

In [105]:
# Number of respondents per (programming experience, income bracket).
experience_income_df = (
    processed_income_df
    .groupby('Programming experience')['Current income']
    .value_counts()
    .reset_index(name='Count')
)

# Share of each bracket within its experience level, in percent (two decimals).
experience_totals = experience_income_df.groupby('Programming experience')['Count'].transform('sum')
experience_income_df['Percentage'] = (experience_income_df['Count'] / experience_totals * 100).round(2)

# Within each experience level, list the largest shares first.
experience_income_df = experience_income_df.sort_values(
    by=['Programming experience', 'Percentage'],
    ascending=[True, False],
)

In [106]:

# Vertical stacked bars: one bar per experience level, one segment per income bracket.
fig = px.bar(
    experience_income_df,
    x='Programming experience',
    y='Percentage',
    color='Current income',
    text='Percentage',
    barmode='stack',
    labels={
        'Current income': '<b>Current Income</b>',
        'Percentage': '<b>Percentage</b>',
        'Programming experience': '<b>Programming Experience</b>',
    },
    # Explicit ordering for both the income segments and the experience axis.
    category_orders={
        'Current income': ['<$10,000', '$10,000-50,000', '$50,000-100,000', '$100,000-300,000', '$300,000-500,000', '>$500,000'],
        'Programming experience': ['I have never written code', '< 1 years', '1-3 years', '3-5 years', '5-10 years', '10-20 years', '20+ years'],
    },
    color_discrete_sequence=['#F2F7A1', '#84ce69', '#309771', '#1c737b', '#14526e', '#071952'],
)

fig.update_layout(
    title_text="<b>Participants' current income with the programming experience</b>",
    title_font_size=25,
    width=1000,
    height=600,
    margin={'l': 20, 'r': 30, 't': 70, 'b': 0},
    paper_bgcolor='#ffe6cc',
    xaxis={'tickfont': {'size': 14}},
    yaxis={'tickfont': {'size': 14}, 'ticksuffix': " "},
)

fig.update_traces(
    marker_line_color='black',
    marker_line_width=1,
    hovertemplate='%{text} (%) of participants that %{x} have the income %{fullData.name}',
)

fig.show()


From this plot, we can see that the overall trend is that as individuals gain more programming experience, their income potential tends to increase. We will discuss this in detail below:
- With experience `<3 years`: This is the early stage in which participants are getting familiar with a new programming language, and they also earn a lower income: over 50% of them are in `<$10,000`.
- With experience `3-10 years`: In this range, the income of participants increases, with higher percentages in the higher income ranges.
- With experience `>10 years`: While there are still individuals with a low income, at this level of experience most participants earn a higher income, especially in the ranges `$100,000-300,000` and `$300,000-500,000`.

###  **1.4 Income distribution by the current industry**

The industry that a person is working in also affects the income a lot.

In [107]:
# Number of respondents per (industry, income bracket).
industry_income_df = (
    processed_income_df
    .groupby('Current industry')['Current income']
    .value_counts()
    .reset_index(name='Count')
)

# Share of each bracket within its industry, in percent (two decimals),
# with the rows ordered from the largest to the smallest share.
industry_totals = industry_income_df.groupby('Current industry')['Count'].transform('sum')
industry_income_df['Percentage'] = industry_income_df['Count'].div(industry_totals).mul(100).round(2)
industry_income_df = industry_income_df.sort_values(by='Percentage', ascending=False)

In [108]:

# Bar chart with one bar per industry and one coloured segment per income bracket.
fig = px.bar(
    industry_income_df,
    y='Percentage',
    x='Current industry',
    color='Current income',
    text='Percentage',
    labels={'Percentage': '<b>Percentage</b>', 'Current industry': '<b>Industry</b>', 'Current income': '<b>Current Income</b>'},
    # Segments are ordered from the lowest to the highest income bracket.
    category_orders={'Current income': ['<$10,000', '$10,000-50,000', '$50,000-100,000', '$100,000-300,000', '$300,000-500,000', '>$500,000']},
    color_discrete_sequence=['#8ADAB2', '#B5CB99', '#E48F45', '#CD6688', '#7A316F', '#461959'],
)

fig.update_layout(
    # The bold tag must be closed with </b>.
    title='<b>Percentage distribution of income ranges by Industry</b>',
    title_font_size=30,
    width=1100,
    height=700,
    margin=dict(l=20, r=20, t=70, b=20),
    paper_bgcolor='#ffe6cc',
    xaxis=dict(tickfont=dict(size=14)),
    yaxis=dict(tickfont=dict(size=14), ticksuffix=" "),
)

fig.update_traces(marker_line_color='black', marker_line_width=0.5, hovertemplate='%{text} (%) of participants that in %{x} industry have the income %{fullData.name}')

fig.show()


By this plot, we can get some conclusion below:

- With low income (in the range `<$50,000`), `Academics/ Education` and `Non-profit/ Service` stand out: we can easily see that these industries tend to pay lower incomes. As we discussed above, participants who work in `Academics/ Education` are generally `Teacher / professor`, so there is a lower potential for a higher income. 

- In the medium income range (`$50,000 - 300,000`), the highest percentages are in `Medical/Pharmaceutical`, `Insurance/ Risk Assessment` and `Online Service/ Internet-based Services`. This means that most individuals in these industries earn an income in this range. This is a good salary range for everyone, so these industries are suitable for someone who is looking for a job with a stable income.

- With high income (`$300,000` or more), `Online Service`, `Accounting/Finance` and `Computer/ Technology` have high percentages of `$300,000-500,000` and `>$500,000`, showing that these industries have the most potential to reach the highest income ranges. Another notable industry is `Non-profit/Service`: despite its very high percentage of low incomes, its percentage in the high income range is also very high (up to 1.14%), which means individuals in the `Non-profit/Service` industry can also earn a very high income.



##### **Overall:**
Okay, with all the analysis above, let's take a overall view for what we have done up to now:
<br>
Based on the overall exploration of factors such as geographical location, current role, programming experience, and industry, we can draw the following conclusions regarding their impact on a person's salary:

- `Residential country`:  Different regions or countries often have different salary levels due to factors such as cost of living, economic development, and demand for specific skills. Therefore, the income can be very different between countries.

- `Current Role`: The specific role that a person holds in the workplace can significantly affect their salary. Roles that require specialized skills and expertise and are in high demand in the industry, like `ML Engineer`, `Research Scientist` and `Data Scientist`, pay higher salaries. Additionally, managerial or leadership positions like `Manager` tend to have higher earning potential compared to entry-level or junior positions.

- `Programming Experience`: Programming experience is a valuable asset in the field of IT and can positively impact a person's salary. Generally, individuals with more programming experience tend to have higher income potential. As individuals gain more experience and skills in programming, they often become better in their roles, handle more complex projects, and can get higher salaries from employers.

- `Industry`: Industry is also a very important factor in determining a person's income. Industries with a high growth rate, or in specific conditions, tend to pay more to attract employees.  
       
Besides that, it is important to note that these factors do not affect the income independently, and their impact on salary can vary depending on various combinations and interactions. Other factors like `company size` and `skill set` can also influence a person's salary.

Overall, a combination of `Residential country`, `Current role`, `Programming experience`, and `Industry` can contribute to a person's salary. By understanding their influence, we can make better decisions about our career paths, skill development, and potential earnings.

### **1.5 Is there any difference between the income of Male and Female**?

For the gender, in this analysis, we just consider participants that is Male and Female.

In [109]:
# Number of respondents per (gender, income bracket).
gender_income_df = (
    processed_income_df
    .groupby('Gender')['Current income']
    .value_counts()
    .reset_index(name='Count')
)

# Only the 'Man' and 'Woman' groups are compared in this section.
is_man_or_woman = gender_income_df['Gender'].isin(['Man', 'Woman'])
filtered_gender_income_df = gender_income_df[is_man_or_woman].copy()

# Share of each bracket within its gender, in percent (two decimals).
gender_totals = filtered_gender_income_df.groupby('Gender')['Count'].transform('sum')
filtered_gender_income_df['Percentage'] = (filtered_gender_income_df['Count'] / gender_totals * 100).round(2)

In [110]:

# Vertical stacked bars: one bar per gender, one segment per income bracket.
fig = px.bar(
    filtered_gender_income_df,
    x='Gender',
    y='Percentage',
    color='Current income',
    text='Percentage',
    barmode='stack',
    labels={
        'Gender': '<b>Gender</b>',
        'Percentage': '<b>Percentage</b>',
        'Current income': '<b>Current Income</b>',
    },
    # Segments are ordered from the lowest to the highest income bracket.
    category_orders={
        'Current income': ['<$10,000', '$10,000-50,000', '$50,000-100,000', '$100,000-300,000', '$300,000-500,000', '>$500,000'],
    },
    color_discrete_sequence=['#D2D79F', '#86b395', '#75969b', '#646684', '#66536c', '#483838'],
)

fig.update_layout(
    title_text="<b>The percentage of participants' current income for each gender</b>",
    title_font_size=30,
    width=1000,
    height=600,
    margin={'l': 20, 'r': 30, 't': 70, 'b': 0},
    paper_bgcolor='#ffe6cc',
    # Fix the percentage axis to the 0-100 range.
    yaxis={'range': [0, 100]},
)
fig.update_traces(
    marker_line_color='black',
    marker_line_width=0.5,
    hovertemplate='%{text} (%) of %{x} have the income %{fullData.name}',
)

fig.show()


With the plot above, we can make some comparisons between the incomes of men and women and find the differences below:
- Men have higher incomes: men have a higher percentage in most income ranges (from `$10,000` upwards). This suggests that men are more likely to earn higher incomes than women.
- Women show a higher percentage in the lowest income range, `<$10,000`. This indicates that a larger percentage of women fall into the category of lower-income earners.

**TODO:** Explain the reasons for these differences.

### **Question 02: What is the tool set for each data roles?**

Tools like programming languages, IDEs, machine learning frameworks, etc. are very important for a person who works in the data industry. For each role, there are several specific tools that a person should learn and master. So, with this question, we will find out the set of tools that each data-related role needs.

In this question, we are only concerned with these roles: `Data Scientist`, `Data Analyst`, `Machine learning engineer`, `Research scientist`, `Statistician`.          
Also, the tools we will consider here are `Programming language`, `IDE`, `Data visualization libraries`, `ML frameworks`, `ML Algorithms`, `NLP methods`, `Computer visions method`, `Data products`, `Business Intelligent tools`.

In [111]:
# Roles whose tool sets are compared in this question.
data_roles = ['Machine Learning/ MLops Engineer',
            'Research Scientist', 
            'Data Scientist',
            'Data Analyst (Business, Marketing, Financial, Quantitative, etc)',
            'Statistician']

# Prefixes of the survey columns that describe a tool; a column belongs to a
# tool type when its name starts with that prefix.
skills = ['Programming language', 'IDE', 'Data visualization libraries', 'ML frameworks', 'ML Algorithms', 'NLP methods', 'Computer visions method', 'Data products', 'Business Intelligent tools']
role_df = processed_income_df[processed_income_df.loc[:, 'Current role'].isin(data_roles)].copy()

# Shorten the long role names for display.
role_df.loc[:, 'Current role'] = role_df['Current role'].replace(r'^Data Analyst.*', 'Data Analyst', regex=True)
role_df.loc[:, 'Current role'] = role_df['Current role'].replace(r'^Machine Learning.*', 'ML Engineer', regex=True)

# For every role, count the non-null answers in each tool column.
# Grouping `role_df` directly on the selected columns avoids assigning a new
# column to a slice, which triggers pandas' SettingWithCopyWarning.
tool_columns = role_df.columns[role_df.columns.str.startswith(tuple(skills))].tolist()
filtered_role_df = role_df.groupby('Current role')[tool_columns].count()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



After that, we will extract the dataframe for each tool and store them in a dictionary

In [112]:

# Split the per-role count table into one frame per tool type.
# Each frame is transposed so that rows are tool columns and columns are roles.
skill_df_dict = {}
for skill in skills:
    column_mask = filtered_role_df.columns.str.startswith(skill)
    counts_by_role = filtered_role_df.loc[:, column_mask]
    # Drop the 'Current role' index name so it does not carry over after transposing.
    counts_by_role.index = counts_by_role.index.rename(None)
    skill_df_dict[skill] = counts_by_role.transpose()
     

Then, find the percentage of participant that use each tool

In [113]:
# Convert the counts in every per-tool-type frame to percentages, in place:
# each role column is divided by its own total and multiplied by 100.
# NOTE(review): the denominator is the sum of the counts over all tools of the
# same type for that role, not the number of respondents in the role - confirm
# that this is the intended meaning of "percentage".
for tool_counts in skill_df_dict.values():
    for role in tool_counts.columns:
        tool_counts[role] = tool_counts[role] / tool_counts[role].sum() * 100


We just get the three most popular tools for each tool type

In [114]:
# For every role, collect the three highest-valued tools of each tool type.
# Each piece is a one-row frame (row label = role, columns = tool column names).
top_3_frames_by_role = {}
for tool_shares in skill_df_dict.values():
    for role in tool_shares.columns:
        top_3 = tool_shares.loc[:, role].nlargest(3).to_frame().T
        top_3_frames_by_role.setdefault(role, []).append(top_3)

# Join the pieces of each role side by side with a single concat per role
# instead of growing the frame inside the loop.
top_3_skill_dict_for_roles = {
    role: pd.concat(frames, axis=1)
    for role, frames in top_3_frames_by_role.items()
}
        

In [115]:
# Turn each role's one-row frame into {tool type: [(tool, value), ...]}.
# A column name is split by the pattern '<tool type> (<tool>)'; columns that do
# not match the pattern are ignored.
all_role_skill_dict = {}
tool_column_pattern = re.compile(r'(.+)\s+\(([^)]+)\)')

for role, top_tools in top_3_skill_dict_for_roles.items():
    tools_by_type = {}
    for column_name in top_tools.columns:
        parsed = tool_column_pattern.search(column_name)
        if parsed is None:
            continue
        tool_type, tool_name = parsed.groups()
        tools_by_type.setdefault(tool_type, []).append((tool_name, top_tools.loc[role, column_name]))
    all_role_skill_dict[role] = tools_by_type


Now, visualize and see the result for each role:

In [116]:

# One sunburst per role, laid out on a two-column grid. The number of rows is
# derived from the number of roles instead of being hard-coded.
n_cols = 2
n_rows = max(1, (len(all_role_skill_dict) + n_cols - 1) // n_cols)

fig = make_subplots(
    rows=n_rows, cols=n_cols,
    subplot_titles=[f'<b>Tool set for {role}</b>' for role in all_role_skill_dict.keys()],
    # Build a separate spec dict for every cell rather than repeating one
    # object through list multiplication.
    specs=[[{'type': 'sunburst'} for _ in range(n_cols)] for _ in range(n_rows)],
    horizontal_spacing=0,
    vertical_spacing=0.05,
)

for i, (role, skill_dict) in enumerate(all_role_skill_dict.items()):

    # Long-form table with one row per (tool type, tool). Building the rows
    # directly also works when a tool type has fewer than three entries.
    tool_records = [
        {'<b>Tool type</b>': tool_type, '<b>Tool</b>': tool, 'Percentage': percentage}
        for tool_type, tools in skill_dict.items()
        for tool, percentage in tools
    ]
    df = pd.DataFrame(tool_records)

    sunburst_fig = px.sunburst(df,
                               path=['<b>Tool type</b>', '<b>Tool</b>'],
                               values='Percentage',
                               branchvalues='total',
                               )
    # Only the trace is reused; the role name is attached for the hover text.
    sunburst_data = sunburst_fig['data'][0]
    sunburst_data['meta'] = {'role': role}
    sunburst_data['hovertemplate'] = '%{label} is used by %{value} of %{meta.role}'
    # Fill the grid row by row (both indices are 1-based).
    row_idx = i // n_cols + 1
    col_idx = i % n_cols + 1

    fig.add_trace(sunburst_data, row=row_idx, col=col_idx)

# Update the layout
fig.update_layout(
    width=1100,
    height=600 * n_rows,
    margin=dict(l=0, r=0, t=70, b=20),
    paper_bgcolor='#ffe6cc',
    showlegend=False,
)
fig.update_traces(marker_line_color='black', marker_line_width=0.5)

fig.show()


From these plots, we can see the tool set for each role. There are lots of similarities between the tool sets of the roles. Besides that, some tools are more specific to certain roles. Below are some notable points from these plots:

- With `Programming language`:  `Bash` is used by `Machine Learning Engineer`, and `Matlab` for `Research Scientist`

- With `IDE`: For `Statistician` and `Research Scientist`, `RStudio` is the most necessary because `R` is a significant language for statistical analysis.

- With `ML Algorithms`: `Convolutional Neural Network` is mostly used by `Machine Learning Engineer` and `Research Scientist`.


In conclusion, there are tools that are specific to each role. From that, we can start to learn and practice them to prepare for our future careers.