<span style="color:#b30000;font-size:25px;"><strong>The emphasis of this Notebook lies in analyzing the columns that will serve as features during the Modeling phase</strong></span>

<span style="color:#2929a3;font-size:20px;">Import Libraries</span>

In [55]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import MultiLabelBinarizer
import plotly.graph_objects as go

<span style="color:#2929a3;font-size:20px;">Read Dataset </span>

In [56]:
original_data = pd.read_csv('Data/Features.csv')

<span style="color:#2929a3;font-size:20px;">Show Sample of The Dataset </span>

In [57]:
original_data.sample(5)

Unnamed: 0,DevType,LanguageHaveWorkedWith,DatabaseHaveWorkedWith,PlatformHaveWorkedWith,WebframeHaveWorkedWith,MiscTechHaveWorkedWith,ToolsTechHaveWorkedWith,NEWCollabToolsHaveWorkedWith
40146,"Developer, mobile",Swift,MySQL;SQLite,Amazon Web Services (AWS);Firebase,,SwiftUI,,Xcode
46753,"Developer, full-stack",HTML/CSS;JavaScript;SQL,Elasticsearch,Amazon Web Services (AWS);Google Cloud,Express;Node.js;React,RabbitMQ;React Native,Docker;Homebrew;npm;Webpack,WebStorm
54860,"Developer, desktop or enterprise applications",C#;C++;PowerShell;Python,,,,.NET (5+) ;.NET Framework (1.0 - 4.8);MFC;Qt,Chocolatey;MSBuild;MSVC;NuGet;Pacman;Pip;Visua...,Eclipse;Nano;Notepad++;Qt Creator;Rad Studio (...
81722,"Developer, game or graphics",Groovy;HTML/CSS;Java;JavaScript;Kotlin;TypeScript,Dynamodb;PostgreSQL;Supabase,Amazon Web Services (AWS);Vercel,Express;Next.js;Node.js;React,Electron;Ktor;OpenGL,Ant;Chocolatey;Gradle;npm;Vite;Webpack;Yarn,Eclipse;IntelliJ IDEA;Nano;Visual Studio Code;...
58274,,HTML/CSS;Java;JavaScript;TypeScript,Oracle;PostgreSQL,Amazon Web Services (AWS),NestJS;Node.js;Vue.js,Apache Kafka,npm;Vite,IntelliJ IDEA;WebStorm


<span style="color:#2929a3;font-size:20px;">Show Shape of Dataset </span>

In [58]:
original_data.shape

(89184, 8)

<span style="color:#2929a3;font-size:20px;">Check Duplication </span>

In [59]:
original_data.duplicated().sum()

1961

In [60]:
original_data.drop_duplicates(inplace=True)

In [61]:
original_data.reset_index(drop=True, inplace=True)

<span style="color:#2929a3;font-size:20px;">Check Null Values</span>

In [62]:
original_data.isnull().sum()

DevType                         10812
LanguageHaveWorkedWith            409
DatabaseHaveWorkedWith          13850
PlatformHaveWorkedWith          23620
WebframeHaveWorkedWith          20329
MiscTechHaveWorkedWith          30255
ToolsTechHaveWorkedWith          9406
NEWCollabToolsHaveWorkedWith     1639
dtype: int64

<span style="color:#2929a3;font-size:20px;">Read Info of The Dataset</span>

In [63]:
original_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87223 entries, 0 to 87222
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   DevType                       76411 non-null  object
 1   LanguageHaveWorkedWith        86814 non-null  object
 2   DatabaseHaveWorkedWith        73373 non-null  object
 3   PlatformHaveWorkedWith        63603 non-null  object
 4   WebframeHaveWorkedWith        66894 non-null  object
 5   MiscTechHaveWorkedWith        56968 non-null  object
 6   ToolsTechHaveWorkedWith       77817 non-null  object
 7   NEWCollabToolsHaveWorkedWith  85584 non-null  object
dtypes: object(8)
memory usage: 5.3+ MB


<span style="color:#2929a3;font-size:20px;">Drop Rows that has Null values in DevType(Target)</span>

In [64]:
target_drop_idx = original_data[original_data['DevType'].isnull() == 1].index

In [65]:
original_data.drop(target_drop_idx, axis=0, inplace=True)

In [66]:
original_data.reset_index(drop=True, inplace=True)

<span style="color:#2929a3;font-size:20px;">Apply Split_Answer function</span>

In [67]:
## Create Split_Answers Funcion
def Split_Answers(answer):
    if isinstance(answer, str):
        return answer.split(';')
    else:
        ## if answer is null function returns empty list
        return []

In [68]:
## Apply Split_Answer Funcion to each column in the dataset
data = pd.DataFrame()
for col in original_data.drop('DevType', axis=1).columns:
    data[col] = original_data[col].apply(Split_Answers)
data['DevType'] = original_data['DevType']

In [69]:
data.sample(5)

Unnamed: 0,LanguageHaveWorkedWith,DatabaseHaveWorkedWith,PlatformHaveWorkedWith,WebframeHaveWorkedWith,MiscTechHaveWorkedWith,ToolsTechHaveWorkedWith,NEWCollabToolsHaveWorkedWith,DevType
59488,[C#],[MySQL],"[Amazon Web Services (AWS), Google Cloud]",[WordPress],[],"[Unity 3D, Visual Studio Solution]","[Notepad++, Visual Studio, Visual Studio Code]","Developer, game or graphics"
51019,"[C#, F#, Rust]","[Microsoft SQL Server, PostgreSQL]","[Digital Ocean, Microsoft Azure, Render]","[Angular, ASP.NET CORE, Svelte]",[.NET (5+) ],"[Cargo, NuGet]","[CLion, Goland, Visual Studio, Visual Studio C...",DevOps specialist
46366,"[Bash/Shell (all shells), HTML/CSS, Java, Java...","[MongoDB, PostgreSQL]",[Amazon Web Services (AWS)],"[Express, jQuery, Node.js, React, Spring Boot,...",[Spring Framework],"[Composer, Docker, Maven (build tool), npm, We...","[IntelliJ IDEA, Notepad++, Vim, Visual Studio ...","Developer, full-stack"
47792,"[HTML/CSS, Java, Python, SQL]","[MariaDB, Oracle, SQLite]",[],[Flask],[],"[Docker, Maven (build tool), Pip]","[IntelliJ IDEA, Sublime Text, Vim]","Developer, full-stack"
8221,"[HTML/CSS, JavaScript, PHP]",[],[],[jQuery],[Xamarin],[],"[Atom, Visual Studio Code]","Developer, full-stack"


<span style="color:#2929a3;font-size:20px;">Apply MultiLabel Binarizer to each column</span>

In [70]:
encoded_dfs = {}
for col in data.drop('DevType', axis=1).columns:
    mlb = MultiLabelBinarizer()
    mlb_df = pd.DataFrame(mlb.fit_transform(data[col]), columns=mlb.classes_)
    encoded_dfs[col] = mlb_df

In [71]:
## Merge encoded_dfs in one dataframe
df = pd.concat(encoded_dfs, axis=1)
df['DevType'] = data['DevType']

In [72]:
df.sample(5) 

Unnamed: 0_level_0,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,...,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,DevType
Unnamed: 0_level_1,APL,Ada,Apex,Assembly,Bash/Shell (all shells),C,C#,C++,Clojure,Cobol,...,Sublime Text,TextMate,VSCodium,Vim,Visual Studio,Visual Studio Code,WebStorm,Xcode,condo,Unnamed: 21_level_1
72503,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,"Developer, front-end"
20141,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"Developer, full-stack"
14496,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,"Developer, full-stack"
43081,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,Security professional
33822,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,1,0,0,0,"Developer, desktop or enterprise applications"


In [73]:
## Display DevType Frequency
df['DevType'].value_counts().reset_index()

Unnamed: 0,DevType,count
0,"Developer, full-stack",25689
1,"Developer, back-end",13705
2,"Developer, front-end",5018
3,"Developer, desktop or enterprise applications",3848
4,Other (please specify):,3025
5,"Developer, mobile",2520
6,Engineering manager,2025
7,Student,1966
8,"Developer, embedded applications or devices",1837
9,Data scientist or machine learning specialist,1585


<span style="color:#2929a3;font-size:20px;">Display tree map for all skills  <span> 

In [74]:
skills_freq = df.drop('DevType', axis=1).sum().reset_index()
skills_freq.columns = ['Group', 'Skill', 'Freq']
#skills_freq

In [75]:
fig_tree = px.treemap(skills_freq, path=['Group', 'Skill'],
                      values='Freq',
                      color='Freq', 
                      color_continuous_scale='deep')
fig_tree.show()

<span style="color:#2929a3;font-size:20px;">Create Jobs & Skills Heatmap<span> 

In [76]:
sorted_jobs = df['DevType'].value_counts(ascending=True).index.tolist()

In [77]:
sorted_skills = df.drop('DevType', axis=1).sum().sort_values(ascending=False).droplevel(level=0).index.tolist()

In [78]:
# dict for percentage of each skill exist in each skill column for each job
skills_jobs = {}

# iterate for each job
for job in sorted_jobs:
    skills_job = {} # dict for percentage of each skill exist in each skill column for one job
    
    # iterate for each skill column 
    for skill in data.drop('DevType', axis=1).columns:
        skill_job = df[df['DevType'] == job][skill].mean() * 100 
        skills_job[skill] = skill_job
        
    # concat dict of each skill column percentage for one job
    skills_job = pd.concat(skills_job)
    skills_jobs[job] = skills_job
    
# concat dict of each skill percentage of each job
skills_jobs = pd.concat(skills_jobs, axis=1)

In [79]:
skills_jobs = skills_jobs.reset_index(level=0, drop=True)
skills_jobs = skills_jobs.loc[sorted_skills]
skills_jobs = skills_jobs.T

In [80]:
skills_jobs

Unnamed: 0,Visual Studio Code,JavaScript,HTML/CSS,SQL,Docker,Python,npm,PostgreSQL,Amazon Web Services (AWS),TypeScript,...,TiDB,build2,Raku,condo,tunit,snitch,lest,CUTE,ELFspy,liblittletest
Marketing or sales professional,57.718121,56.375839,55.704698,40.939597,28.187919,45.637584,32.214765,30.872483,32.214765,16.778523,...,2.684564,2.013423,0.671141,1.342282,1.342282,0.671141,0.671141,1.342282,0.671141,0.671141
Developer Advocate,73.584906,63.679245,56.603774,46.698113,52.830189,42.45283,43.867925,44.339623,50.943396,34.433962,...,0.0,0.0,0.0,0.0,0.471698,0.0,0.471698,0.0,0.0,0.0
Database administrator,51.764706,29.803922,31.764706,82.745098,25.882353,42.745098,11.372549,30.588235,29.803922,6.666667,...,0.784314,0.392157,0.392157,0.392157,0.392157,0.392157,0.0,0.0,0.0,0.0
Designer,59.70696,63.003663,64.468864,30.769231,21.978022,31.135531,31.501832,17.216117,21.978022,18.681319,...,0.0,0.3663,0.732601,0.3663,0.0,0.0,0.0,0.0,0.0,0.0
Hardware Engineer,59.363958,25.795053,27.915194,22.261484,26.855124,70.671378,15.90106,13.074205,14.840989,7.067138,...,0.0,0.0,1.060071,0.353357,0.353357,0.0,0.0,0.353357,0.0,0.353357
Blockchain,68.322981,62.42236,38.509317,30.434783,52.795031,41.925466,50.621118,50.310559,50.931677,52.484472,...,0.621118,0.931677,0.310559,0.621118,0.621118,0.310559,0.621118,0.310559,0.310559,0.310559
Developer Experience,73.312883,55.214724,38.343558,36.503067,55.521472,49.693252,41.411043,36.503067,51.226994,40.184049,...,0.613497,0.920245,0.0,1.226994,0.306748,0.306748,0.306748,0.613497,0.306748,0.0
Scientist,52.161383,27.95389,25.360231,28.242075,29.682997,78.097983,12.391931,20.172911,21.613833,7.204611,...,0.288184,0.288184,0.288184,0.576369,0.288184,0.288184,0.288184,0.0,0.0,0.288184
Educator,65.859564,58.11138,56.416465,46.004843,28.087167,63.438257,31.476998,29.782082,25.181598,20.096852,...,0.0,0.0,0.242131,0.242131,0.0,0.0,0.0,0.0,0.0,0.0
"Engineer, site reliability",71.896956,49.882904,40.046838,44.496487,72.131148,71.896956,38.875878,55.503513,64.40281,31.850117,...,0.234192,0.0,0.234192,0.234192,0.0,0.0,0.234192,0.0,0.0,0.0


In [81]:
fig = go.Figure(data=go.Heatmap(z=skills_jobs, x = skills_jobs.columns , y = skills_jobs.index))
fig.update_layout(width=5000, height=800)
fig.show()
fig.write_html('Skills_Jobs.html')

<span style="color:#2929a3;font-size:20px;">Create Scaled Jobs & Skills Heatmap<span> 

In [82]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_skills_jobs = pd.DataFrame(scaler.fit_transform(skills_jobs), columns=skills_jobs.columns, index=skills_jobs.index)

In [83]:
fig = go.Figure(data=go.Heatmap(z=scaled_skills_jobs, x = scaled_skills_jobs.columns , y = scaled_skills_jobs.index))
fig.update_layout(width=5000, height=800)
fig.show()
fig.write_html('Scaled_Skills_Jobs.html')

## From the previous visualizations we noticed that scaling the data makes the value of each skill more expressive to each job

 
- For example before scaling the percentage of Keras in Data Science is 34.3 out of 100 but after scaling the value becomes 5.02 out of 6 which is more expressive

- Another example , before scaling Unity 3D in Game/Graphics Developing is 44.7 out of 100 but after scaling the value becomes 5.4 out of 6 which is also more expressive

- Another example , before scaling Visual Studio Code in Database Administration is 51.7 out of 100 but after scaling the value becomes -2.1 out of 6 which is also more expressive .. etc

<span style="color:#2929a3;font-size:20px;">Jop Profiles<span> 

In [84]:
#job = np.random.choice(sorted_jobs)
job = 'Data scientist or machine learning specialist'
#job = 'Marketing or sales professional'
#job = 'Developer, back-end'
#job = 'Developer, front-end'
#job = 'Developer, full-stack'
#job = 'Data or business analyst'
#job = 'Database administrator'

In [85]:
job

'Data scientist or machine learning specialist'

In [86]:
single_job_skills = pd.concat([skills_jobs.loc[job], scaled_skills_jobs.loc[job]], axis=1)
single_job_skills.columns = ['Percentage', 'Scaled_Value']
single_job_skills = single_job_skills.sort_values('Percentage')

In [87]:
single_job_skills.head()

Unnamed: 0,Percentage,Scaled_Value
liblittletest,0.0,-0.606721
CUTE,0.0,-0.537147
lest,0.0,-0.855274
snitch,0.0,-0.788875
tunit,0.0,-0.662249


In [88]:
## Comparing skills before and after scaling for random job
threshold = 25

single_job_skills = single_job_skills[single_job_skills['Percentage'] > threshold]

fig = px.bar(x=single_job_skills['Percentage'],
             y=single_job_skills.index,
             color=single_job_skills['Scaled_Value'],
             color_continuous_scale='orrd',
            range_color=[scaled_skills_jobs.min().min(), scaled_skills_jobs.max().max()])
fig.update_layout(title=job)
fig.show()

<span style="color:#2929a3;font-size:20px;">Display Dendogram for this Data<span> 

In [89]:
## Dict for replace long columns names
dict_Devtype = {'Senior Executive (C-Suite, VP, etc.)':'Senior Executive',
                'Developer, back-end':'Back-End',
                'Developer, front-end':'Front-End',
                'Developer, full-stack':'Full-Stack',
                'Developer, desktop or enterprise applications':'Desktop Developer',
                'Developer, QA or test':'Tester',
                'Data scientist or machine learning specialist':'Data Scientist',
                'Data or business analyst':'Data Analyst',
                'Developer, embedded applications or devices':'Embedded Developer',
                'Marketing or sales professional':'Marketing/Sales',  
}

short_labels = scaled_skills_jobs.index.to_series().replace(dict_Devtype).tolist()

import scipy.cluster.hierarchy as sch
import plotly.figure_factory as ff

fig_dendo = ff.create_dendrogram(scaled_skills_jobs, labels=short_labels, orientation='left')
fig_dendo.update_layout(width=1000, height=600)
fig_dendo.show()

<span style="color:#2929a3;font-size:20px;">Drop rows that has illogical Number of skills from original dataframe (Data) <span> 

In [90]:
data.head()

Unnamed: 0,LanguageHaveWorkedWith,DatabaseHaveWorkedWith,PlatformHaveWorkedWith,WebframeHaveWorkedWith,MiscTechHaveWorkedWith,ToolsTechHaveWorkedWith,NEWCollabToolsHaveWorkedWith,DevType
0,"[HTML/CSS, JavaScript, Python]",[Supabase],"[Amazon Web Services (AWS), Netlify, Vercel]","[Next.js, React, Remix, Vue.js]","[Electron, React Native, Tauri]","[Docker, Kubernetes, npm, Pip, Vite, Webpack, ...","[Vim, Visual Studio Code]","Senior Executive (C-Suite, VP, etc.)"
1,"[Bash/Shell (all shells), Go]",[],"[Amazon Web Services (AWS), Google Cloud, Open...",[],[],"[Cargo, Docker, Kubernetes, Make, Nix]","[Emacs, Helix]","Developer, back-end"
2,"[Bash/Shell (all shells), HTML/CSS, JavaScript...","[PostgreSQL, Redis]","[Cloudflare, Heroku]","[Node.js, React, Ruby on Rails, Vue.js, WordPr...",[],"[Homebrew, npm, Vite, Webpack, Yarn]","[IntelliJ IDEA, Vim, Visual Studio Code, WebSt...","Developer, front-end"
3,"[HTML/CSS, JavaScript, TypeScript]","[BigQuery, Elasticsearch, MongoDB, PostgreSQL]","[Amazon Web Services (AWS), Firebase, Heroku, ...","[Express, Gatsby, NestJS, Next.js, Node.js, Re...",[],"[Docker, npm, Webpack, Yarn]","[Vim, Visual Studio Code]","Developer, full-stack"
4,"[Bash/Shell (all shells), HTML/CSS, JavaScript...","[BigQuery, Cloud Firestore, PostgreSQL, Redis]","[Amazon Web Services (AWS), Cloudflare, Google...","[Angular, Express, NestJS, Node.js]",[],"[Docker, Homebrew, Kubernetes, npm, pnpm, Terr...","[Helix, Neovim]","Developer, back-end"


In [91]:
def Count_Total_Skills(row):
    return sum(len(List) for List in row)

In [92]:
Total_Skills_Count = data[data.columns[1:]].apply(Count_Total_Skills, axis=1)

In [93]:
px.histogram(Total_Skills_Count, title='Total Skills Count')

In [94]:
## Drop rows that has No of Skills more thar 40 or less than 4
drop_skills_idx = Total_Skills_Count[(Total_Skills_Count > 40) | (Total_Skills_Count < 4)].index

In [95]:
## Drop from both data and original data 
data.drop(drop_skills_idx, axis=0, inplace=True)
original_data.drop(drop_skills_idx, axis=0, inplace=True)

# reset index of both data and original data
data.reset_index(drop=True, inplace=True)
original_data.reset_index(drop=True, inplace=True)

<span style="color:#2929a3;font-size:20px;">Drop rows that has general jobs or not important jobs<span> 

In [96]:
excluded_jobs = [
    'Other (please specify):',
    'Student',
    'Designer',
    'Educator',
    'Marketing or sales professional',
    'Engineering manager',
    'Senior Executive (C-Suite, VP, etc.)',
    'Product manager',
    'Project manager',
    'Developer Advocate',
    'Developer Experience',
    'Scientist',
    'Engineer, site reliability',
    'Academic researcher',
    'Research & Development role']

In [97]:
drop_DevType_idx = data[data['DevType'].isin(excluded_jobs)].index

In [98]:
## Drop from both data and original data
data.drop(drop_DevType_idx, axis=0, inplace=True)
original_data.drop(drop_DevType_idx, axis=0, inplace=True)

## Reset index of both data and original data
data.reset_index(drop=True, inplace=True)
original_data.reset_index(drop=True, inplace=True)

<span style="color:#2929a3;font-size:20px;">Merge Close Jobs<span> 

In [99]:
## Merge Embedded System Developer With Hardware Developer
close_job1_idx = data[data['DevType'].isin(['Developer, embedded applications or devices', 'Hardware Engineer'])].index

In [100]:
data['DevType'].loc[close_job1_idx] = "Developer, Hardware/Embedded Systems"
original_data['DevType'].loc[close_job1_idx] = "Developer, Hardware/Embedded Systems"

In [101]:
## Merge System Adminstration with Database Adminstration
close_job2_idx = data[data['DevType'].isin(['Database administrator','System administrator'])].index

In [102]:
data['DevType'].loc[close_job2_idx] = "Database/System Admininstrator"
original_data['DevType'].loc[close_job2_idx] = "Database/System Admininstrator"

In [103]:
## Merge Front-End , Back-End and Full-Stack as Web Developer
close_job3_idx = data[data['DevType'].isin(['Developer, back-end','Developer, front-end', 'Developer, full-stack'])].index

In [104]:
data['DevType'].loc[close_job3_idx] = "Web Developer"
original_data['DevType'].loc[close_job3_idx] = "Web Developer"

In [105]:
original_data['DevType'].loc[close_job1_idx]

40       Developer, Hardware/Embedded Systems
519      Developer, Hardware/Embedded Systems
655      Developer, Hardware/Embedded Systems
732      Developer, Hardware/Embedded Systems
1008     Developer, Hardware/Embedded Systems
                         ...                 
38036    Developer, Hardware/Embedded Systems
38076    Developer, Hardware/Embedded Systems
38132    Developer, Hardware/Embedded Systems
38228    Developer, Hardware/Embedded Systems
38316    Developer, Hardware/Embedded Systems
Name: DevType, Length: 263, dtype: object

In [106]:
## Check Final Shape
data.shape

(38545, 8)

In [109]:
data

Unnamed: 0,LanguageHaveWorkedWith,DatabaseHaveWorkedWith,PlatformHaveWorkedWith,WebframeHaveWorkedWith,MiscTechHaveWorkedWith,ToolsTechHaveWorkedWith,NEWCollabToolsHaveWorkedWith,DevType
0,"[Bash/Shell (all shells), Go]",[],"[Amazon Web Services (AWS), Google Cloud, Open...",[],[],"[Cargo, Docker, Kubernetes, Make, Nix]","[Emacs, Helix]",Web Developer
1,"[Bash/Shell (all shells), HTML/CSS, JavaScript...","[PostgreSQL, Redis]","[Cloudflare, Heroku]","[Node.js, React, Ruby on Rails, Vue.js, WordPr...",[],"[Homebrew, npm, Vite, Webpack, Yarn]","[IntelliJ IDEA, Vim, Visual Studio Code, WebSt...",Web Developer
2,"[Bash/Shell (all shells), HTML/CSS, JavaScript...","[BigQuery, Cloud Firestore, PostgreSQL, Redis]","[Amazon Web Services (AWS), Cloudflare, Google...","[Angular, Express, NestJS, Node.js]",[],"[Docker, Homebrew, Kubernetes, npm, pnpm, Terr...","[Helix, Neovim]",Web Developer
3,"[C, C++, Python, Rust]",[Redis],[],[],[],"[Cargo, CMake, Docker, GNU GCC, Make]","[Code::Blocks, Sublime Text, Vim, Xcode]","Developer, QA or test"
4,"[Java, Perl, TypeScript]",[MySQL],[],"[Fastify, Node.js, React]",[Spring Framework],"[Kubernetes, Yarn]",[Visual Studio Code],Web Developer
...,...,...,...,...,...,...,...,...
38540,"[Bash/Shell (all shells), HTML/CSS, JavaScript...","[Oracle, PostgreSQL, SQLite]",[VMware],"[Express, jQuery, Node.js]",[],"[APT, GNU GCC, Make, npm, Pip, Webpack]",[Vim],Web Developer
38541,"[C#, HTML/CSS, JavaScript, R, SQL]","[Microsoft SQL Server, MySQL]",[],"[ASP.NET CORE, WordPress]",[Cordova],[npm],"[RStudio, Visual Studio, Visual Studio Code]",Web Developer
38542,"[Java, Kotlin]","[H2, MongoDB, MySQL, SQLite]",[VMware],[Spring Boot],"[Opencv, OpenGL, Spring Framework]","[Docker, Gradle, Maven (build tool)]","[Android Studio, Atom, Eclipse, IntelliJ IDEA,...","Developer, mobile"
38543,"[JavaScript, TypeScript]",[Dynamodb],[Amazon Web Services (AWS)],"[Express, Node.js, Spring Boot]",[Apache Kafka],"[Docker, Homebrew, Yarn]","[IntelliJ IDEA, Nano, Visual Studio Code]",Web Developer


In [107]:
original_data.shape

(38545, 8)

In [108]:
## Dump Final Features 
data.to_pickle('Data/Cleaned_Features.pkl')
original_data.to_pickle('Data/Cleaned_original_data.pkl')

In [110]:
original_data

Unnamed: 0,DevType,LanguageHaveWorkedWith,DatabaseHaveWorkedWith,PlatformHaveWorkedWith,WebframeHaveWorkedWith,MiscTechHaveWorkedWith,ToolsTechHaveWorkedWith,NEWCollabToolsHaveWorkedWith
0,Web Developer,Bash/Shell (all shells);Go,,Amazon Web Services (AWS);Google Cloud;OpenSta...,,,Cargo;Docker;Kubernetes;Make;Nix,Emacs;Helix
1,Web Developer,Bash/Shell (all shells);HTML/CSS;JavaScript;PH...,PostgreSQL;Redis,Cloudflare;Heroku,Node.js;React;Ruby on Rails;Vue.js;WordPress,,Homebrew;npm;Vite;Webpack;Yarn,IntelliJ IDEA;Vim;Visual Studio Code;WebStorm
2,Web Developer,Bash/Shell (all shells);HTML/CSS;JavaScript;Ru...,BigQuery;Cloud Firestore;PostgreSQL;Redis,Amazon Web Services (AWS);Cloudflare;Google Cloud,Angular;Express;NestJS;Node.js,,Docker;Homebrew;Kubernetes;npm;pnpm;Terraform,Helix;Neovim
3,"Developer, QA or test",C;C++;Python;Rust,Redis,,,,Cargo;CMake;Docker;GNU GCC;Make,Code::Blocks;Sublime Text;Vim;Xcode
4,Web Developer,Java;Perl;TypeScript,MySQL,,Fastify;Node.js;React,Spring Framework,Kubernetes;Yarn,Visual Studio Code
...,...,...,...,...,...,...,...,...
38540,Web Developer,Bash/Shell (all shells);HTML/CSS;JavaScript;Pe...,Oracle;PostgreSQL;SQLite,VMware,Express;jQuery;Node.js,,APT;GNU GCC;Make;npm;Pip;Webpack,Vim
38541,Web Developer,C#;HTML/CSS;JavaScript;R;SQL,Microsoft SQL Server;MySQL,,ASP.NET CORE;WordPress,Cordova,npm,RStudio;Visual Studio;Visual Studio Code
38542,"Developer, mobile",Java;Kotlin,H2;MongoDB;MySQL;SQLite,VMware,Spring Boot,Opencv;OpenGL;Spring Framework,Docker;Gradle;Maven (build tool),Android Studio;Atom;Eclipse;IntelliJ IDEA;Netb...
38543,Web Developer,JavaScript;TypeScript,Dynamodb,Amazon Web Services (AWS),Express;Node.js;Spring Boot,Apache Kafka,Docker;Homebrew;Yarn,IntelliJ IDEA;Nano;Visual Studio Code
