In [1]:
# Important paths, given relative to the directory the notebook is run from.
# Forward slashes are used on purpose: they are accepted as separators on
# Windows and on POSIX systems, whereas a backslash only separates path
# components on Windows.
DATA_PATH = "../data/processed/processed_1.pkl"
EXPORT_PATH = "../data/processed/processed_2.pkl"

In [2]:
# Shared prefixes of the technology columns; both lists below are built from
# this single tuple so that they always stay aligned entry by entry.
_TECH_AREAS = ('Language', 'Database', 'Platform',
               'Webframe', 'MiscTech', 'ToolsTech')

# Columns named '<area>HaveWorkedWith'.
TECH_COLS = [f'{area}HaveWorkedWith' for area in _TECH_AREAS]

# Columns named '<area>WantToWorkWith', in the same order as TECH_COLS.
TECH_NEXT_COLS = [f'{area}WantToWorkWith' for area in _TECH_AREAS]

# Column holding the role information.
ROLE_COLS = ['DevType']

In [3]:
# used libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
import warnings


In [4]:
warnings.filterwarnings("ignore")


### Reading & One-Hot Encoding

In [5]:
# Load the pickled dataset stored at DATA_PATH. `processed_data` keeps the
# frame exactly as loaded, while `df` is an independent deep copy to work on.
# NOTE(review): unpickling can execute arbitrary code, so DATA_PATH must
# point to a file produced by a trusted step of this project.
processed_data = pd.read_pickle(DATA_PATH)
df = processed_data.copy(deep=True)

In [6]:
# One-hot encode every column listed in TECH_COLS.
# MultiLabelBinarizer consumes one iterable of labels per row and yields one
# 0/1 column per distinct label (exposed afterwards as `mlb.classes_`).
#
# The encoder reads from `processed_data` instead of `df`. `df` starts as a
# copy of `processed_data`, so the values are the same, but this cell rebinds
# `df` at the end; reading from `df` would make a second run of the cell fail
# with a KeyError because the TECH_COLS columns would no longer exist.
encoded_dfs = {}
for col in TECH_COLS:
    mlb = MultiLabelBinarizer()
    one_hot_encoded = pd.DataFrame(
        mlb.fit_transform(processed_data[col]),
        columns=mlb.classes_,
        index=processed_data.index,
    )
    encoded_dfs[col] = one_hot_encoded

# Concatenating a dict puts its keys on the outer column level, so the result
# has two-level columns of the form (source column, label).
df = pd.concat(encoded_dfs, axis=1)


In [7]:
# One-hot encode the DevType column and nest the resulting dummy columns
# under a 'DevType' outer level, matching the two-level columns of `df`.
# NOTE(review): running this cell twice appends the DevType columns to `df`
# a second time, because `df` is both an input and the output here.
role_col = processed_data['DevType']
one_hot_single_col = pd.get_dummies(role_col)

# Pair every dummy label with the fixed outer key 'DevType'.
multi_level_columns = pd.MultiIndex.from_tuples(
    [('DevType', label) for label in one_hot_single_col.columns]
)
one_hot_single_col.columns = multi_level_columns

df = pd.concat(objs=[df, one_hot_single_col], axis='columns')


### Apply T-SNE 

In [8]:
# Keep every column except those grouped under 'DevType', then standardise
# the remaining columns with StandardScaler.
skills_ohe = df.drop(columns='DevType').copy()
scaler = StandardScaler()
std_skills = scaler.fit_transform(skills_ohe)

In [None]:
# Embed the columns of `skills_ohe` in 2-D with t-SNE. `std_skills` is
# transposed so that each column (not each row) becomes one point.
#
# Optimiser settings are bounded on purpose. The earlier combination of
# learning_rate=0.01 and n_iter=10**10 requested ten billion very small
# steps, which makes the cell effectively non-terminating.
# scikit-learn documents the usual learning-rate range as [10.0, 1000.0];
# 50.0 is the floor its 'auto' heuristic uses for small inputs.
# NOTE(review): scikit-learn 1.5 renamed `n_iter` to `max_iter`; switch the
# keyword below when the environment is upgraded.
TSNE_PERPLEXITY = 3
TSNE_LEARNING_RATE = 50.0
TSNE_N_ITER = 5000

tsne_projection = TSNE(n_components=2,
                       perplexity=TSNE_PERPLEXITY,
                       learning_rate=TSNE_LEARNING_RATE,
                       init='pca',
                       n_jobs=-1,
                       n_iter=TSNE_N_ITER,
                       random_state=0).fit_transform(std_skills.T)

In [None]:
# Wrap the t-SNE output in a DataFrame indexed by the columns of `skills_ohe`;
# the coordinate columns keep their default integer labels 0 and 1.
tsne_projection = pd.DataFrame(
    data=tsne_projection,
    index=skills_ohe.columns,
)

In [None]:
# Scatter plot of the 2-D embedding. droplevel(0) removes the outer index
# level so that only the inner label is drawn as the text of each point.
fig = px.scatter(x=tsne_projection[0],
                 y=tsne_projection[1],
                 text=tsne_projection.droplevel(0).index)
fig.update_traces(textposition='top center')
# Title corrected from 'TNSE'; explicit axis titles let the figure stand alone.
fig.update_layout(height=800, width=800,
                  title_text='t-SNE projection of technologies',
                  xaxis_title='t-SNE dimension 1',
                  yaxis_title='t-SNE dimension 2')
fig.show()