In [None]:
# Fetch the project repository; later cells read the preprocessed data archive
# and the role short-name CSV from inside this checkout.
!git clone https://github.com/MohamedKhalifa1/Stack-Overflow-Annual-Developer-Survey-Analysis

Cloning into 'Stack-Overflow-Annual-Developer-Survey-Analysis'...
remote: Enumerating objects: 75, done.[K
remote: Counting objects: 100% (28/28), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 75 (delta 6), reused 21 (delta 1), pack-reused 47[K
Receiving objects: 100% (75/75), 30.90 MiB | 14.45 MiB/s, done.
Resolving deltas: 100% (14/14), done.


In [None]:
!unzip /content/Stack-Overflow-Annual-Developer-Survey-Analysis/data/preprocessed/01_preprocessed_data.zip

In [None]:
# Locations used throughout the notebook (Colab layout).
REPO_DIR = '/content/Stack-Overflow-Annual-Developer-Survey-Analysis'

# Preprocessed survey responses extracted from the repository's zip archive.
DATA_PATH = '/content/content/01_preprocessed_data.pkl'
# Lookup table mapping role names to their short names.
ROLES_PATH = f'{REPO_DIR}/data/raw/roles_short_names.csv'
# Output directory for exported figures.
IMAGES_PATH = f'{REPO_DIR}/img'

In [None]:
# Survey column holding each respondent's role.
ROLE_COLS = ['DevType']

# Survey columns listing the technologies a respondent has worked with,
# one column per technology family.
TECH_COLS = [
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith',
    'PlatformHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'MiscTechHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
    'NEWCollabToolsHaveWorkedWith',
]

In [None]:
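# Disabled: extracts the raw 2023 survey archive into data/raw. Enable only if the raw survey files are needed.
# !unzip /content/Stack-Overflow-Annual-Developer-Survey-Analysis/data/raw/stack-overflow-developer-survey-2023.zip -d /content/Stack-Overflow-Annual-Developer-Survey-Analysis/data/raw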

In [None]:
# Make the cloned repository the working directory for the cells that follow.
%cd /content/Stack-Overflow-Annual-Developer-Survey-Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import MultiLabelBinarizer
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Read data and preprocess

In [None]:
# Load the two inputs of the analysis:
#   raw_df      - preprocessed survey responses stored as a pickle
#   roles_names - semicolon-separated lookup table of role short names
# NOTE: unpickling can execute arbitrary code; only load pickles from a
# source you trust (here, the repository cloned above).
raw_df = pd.read_pickle(DATA_PATH)
roles_names = pd.read_csv(ROLES_PATH, sep=';')

In [None]:
# Preview the first rows of the survey data.
raw_df.head()

In [None]:
# Column dtypes and non-null counts, to see which columns have missing answers.
raw_df.info()

In [None]:
# Summary statistics (pandas' default column selection for describe()).
raw_df.describe()

In [None]:
# Preview the role short-name lookup table.
roles_names.head()

In [None]:
# List the distinct DevType values present in the survey data.
print(raw_df['DevType'].unique())

In [None]:
# List the distinct short names from the lookup table.
# NOTE(review): the column label is 'Short name ' with a trailing space —
# presumably that is how the CSV header is written; confirm against the file.
print(roles_names['Short name '].unique())

In [None]:
# Column dtypes and non-null counts of the lookup table.
roles_names.info()

In [None]:
def _as_label_list(value):
    """Return ``value`` as a list of labels suitable for MultiLabelBinarizer.

    A list passes through unchanged, a missing value (NaN/None) becomes an
    empty list, and any other scalar is wrapped in a one-element list.
    """
    if isinstance(value, list):
        return value
    return [value] if pd.notnull(value) else []


# Multi-hot encode every role / technology column.
# Result: encoded_dfs maps each source column name to a 0/1 DataFrame with one
# column per distinct label, indexed like the survey data.
df = raw_df.copy()
encoded_dfs = {}
for col in ROLE_COLS + TECH_COLS:
    # Missing answers must become [] (no label at all). Do not pre-fill them
    # with '': that turns every missing answer into [''] and adds a spurious
    # '' label column to the encoding. A chained `df[col].fillna(...,
    # inplace=True)` is also unreliable: it warns on pandas 2.2 and is a no-op
    # under Copy-on-Write.
    df[col] = df[col].apply(_as_label_list)

    binarizer = MultiLabelBinarizer()
    encoded_df = pd.DataFrame(binarizer.fit_transform(df[col]),
                              columns=binarizer.classes_,
                              index=df.index)
    encoded_dfs[col] = encoded_df

In [None]:
# Join the per-column encodings side by side. Passing a dict makes its keys
# (the source column names) the outer level of the resulting column index, so
# each encoded column is addressed as (source column, label).
df = pd.concat(encoded_dfs, axis=1)

In [None]:
# Preview the combined encoded frame.
df.head()

In [None]:
# Number of respondents per role: sum each encoded DevType column, then order
# the totals from least to most frequent.
jobs_freq = (
    df['DevType']
    .sum()
    .sort_values()
    .reset_index()
)
jobs_freq

# Display Skills Frequency

In [None]:
# Number of respondents per skill: drop the role columns, sum every remaining
# encoded column and order the totals from most to least frequent.
skills_freq = (
    df.drop(columns='DevType')
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
# After reset_index the two index levels (source column, label) and the totals
# become ordinary columns; give them readable names.
skills_freq.columns = ['group', 'skill', 'freq']

In [None]:
# Show the skill frequency table (pandas truncates long frames when rendering).
skills_freq

In [None]:
!mkdir -p /content/Stack-Overflow-Annual-Developer-Survey-Analysis/img

In [None]:
# Treemap of skill usage: the outer boxes are the technology groups, the inner
# boxes the individual skills; both area and colour encode the frequency.
# NOTE(review): the title spelling 'Frequancy' is kept as-is; consider 'Frequency'.
fig = px.treemap(
    data_frame=skills_freq,
    path=['group', 'skill'],
    values='freq',
    color='freq',
    color_continuous_scale='deep',
    title='Skills Frequancy',
)
fig.update_layout(height=800, width=1700)

# Render inline, then export a standalone interactive copy.
fig.show()
fig.write_html(f'{IMAGES_PATH}/skills_freq.html')