In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load Data

In [2]:
RAW_DATA_PATH = "../data/raw/survey_results_public.csv"

In [3]:
raw_df = pd.read_csv(RAW_DATA_PATH)
raw_df

Unnamed: 0,ResponseId,MainBranch,Age,Employment,RemoteWork,Check,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,...,JobSatPoints_6,JobSatPoints_7,JobSatPoints_8,JobSatPoints_9,JobSatPoints_10,JobSatPoints_11,SurveyLength,SurveyEase,ConvertedCompYearly,JobSat
0,1,I am a developer by profession,Under 18 years old,"Employed, full-time",Remote,Apples,Hobby,Primary/elementary school,Books / Physical media,,...,,,,,,,,,,
1,2,I am a developer by profession,35-44 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Other...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,
2,3,I am a developer by profession,45-54 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Other...,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,...,,,,,,,Appropriate in length,Easy,,
3,4,I am learning to code,18-24 years old,"Student, full-time",,Apples,,Some college/university study without earning ...,"Other online resources (e.g., videos, blogs, f...",Stack Overflow;How-to videos;Interactive tutorial,...,,,,,,,Too long,Easy,,
4,5,I am a developer by profession,18-24 years old,"Student, full-time",,Apples,,"Secondary school (e.g. American high school, G...","Other online resources (e.g., videos, blogs, f...",Technical documentation;Blogs;Written Tutorial...,...,,,,,,,Too short,Easy,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65432,65433,I am a developer by profession,18-24 years old,"Employed, full-time",Remote,Apples,Hobby;School or academic work,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","On the job training;School (i.e., University, ...",,...,,,,,,,,,,
65433,65434,I am a developer by profession,25-34 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects,,,,...,,,,,,,,,,
65434,65435,I am a developer by profession,25-34 years old,"Employed, full-time",In-person,Apples,Hobby,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Other online resources (e.g., videos, blogs, f...",Technical documentation;Stack Overflow;Social ...,...,,,,,,,,,,
65435,65436,I am a developer by profession,18-24 years old,"Employed, full-time","Hybrid (some remote, some in-person)",Apples,Hobby;Contribute to open-source projects;Profe...,"Secondary school (e.g. American high school, G...",On the job training;Other online resources (e....,Technical documentation;Blogs;Written Tutorial...,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,


In [4]:
# Get the data from a random Respondent

raw_df.sample(1).T

Unnamed: 0,55370
ResponseId,55371
MainBranch,I am a developer by profession
Age,35-44 years old
Employment,"Independent contractor, freelancer, or self-em..."
RemoteWork,Remote
...,...
JobSatPoints_11,0.0
SurveyLength,Too long
SurveyEase,Easy
ConvertedCompYearly,


In [5]:
raw_df.shape

(65437, 114)

- Change the column names: JobsatPoints, Knowledge

In [16]:
# TODO : Understand and fix why the info does show all the columns and their null_values count
# We have tried the following fixes but they do not work in this env: 
#   - change the max_cols argument to a small number (example: max_cols=5)
#   - change the show_count argument to true 
#   - change the verbose argument to true: this shows all the columns without their null_values count

raw_df.info() 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65437 entries, 0 to 65436
Columns: 114 entries, ResponseId to JobSat
dtypes: float64(13), int64(1), object(100)
memory usage: 56.9+ MB


In [17]:
# This is an alternative of the previous cell. 
# The aim: calculate the number of NULL values
raw_df.isnull().sum().sort_values(ascending=False)

AINextMuch less integrated    64289
AINextLess integrated         63082
AINextNo change               52939
AINextMuch more integrated    51999
EmbeddedAdmired               48704
                              ...  
MainBranch                        0
Age                               0
Employment                        0
Check                             0
ResponseId                        0
Length: 114, dtype: int64

In 65437 entries, we have some columns that reaches +60000 null values. We can delete these features for two reasons: 
- They are hard to oversample
- They are not crutial features for our business case 

In [19]:
# Calculate the portion of null values
raw_df.isnull().sum().sort_values(ascending=True) / raw_df.shape[0]

ResponseId                    0.000000
MainBranch                    0.000000
Age                           0.000000
Employment                    0.000000
Check                         0.000000
                                ...   
EmbeddedAdmired               0.744288
AINextMuch more integrated    0.794642
AINextNo change               0.809007
AINextLess integrated         0.964011
AINextMuch less integrated    0.982456
Length: 114, dtype: float64

In [20]:
# Check the datatypes 
raw_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65437 entries, 0 to 65436
Data columns (total 114 columns):
 #    Column                          Dtype  
---   ------                          -----  
 0    ResponseId                      int64  
 1    MainBranch                      object 
 2    Age                             object 
 3    Employment                      object 
 4    RemoteWork                      object 
 5    Check                           object 
 6    CodingActivities                object 
 7    EdLevel                         object 
 8    LearnCode                       object 
 9    LearnCodeOnline                 object 
 10   TechDoc                         object 
 11   YearsCode                       object 
 12   YearsCodePro                    object 
 13   DevType                         object 
 14   OrgSize                         object 
 15   PurchaseInfluence               object 
 16   BuyNewTool                      object 
 17   BuildvsBuy

In [31]:
NUM_FEATS = ["Age", "YearsCode", "YearsCodePro", "OrgSize", "TimeSearching", "TimeAnswering"]
raw_df[NUM_FEATS].sample().iloc[0]

Age                       25-34 years old
YearsCode                              19
YearsCodePro                            5
OrgSize          10,000 or more employees
TimeSearching         30-60 minutes a day
TimeAnswering        60-120 minutes a day
Name: 39862, dtype: object

In [32]:
INV_FEATS = ['YearsCode', 'YearsCodePro'] # Features to check (why they datatype is object. They are supposed to be numerical)
for inv_feat in INV_FEATS:
    unique_values = raw_df[inv_feat].unique()
    print(f".. {inv_feat}: {unique_values}")
    print(f"--------------------------------")


.. YearsCode: [nan '20' '37' '4' '9' '10' '7' '1' '15' '30' '31' '6' '12' '22' '5' '36'
 '25' '44' '24' '18' '3' '8' 'More than 50 years' '11' '29' '40' '39' '2'
 '42' '34' '19' '35' '16' '33' '13' '23' '14' '28' '17' '21' '43' '46'
 '26' '32' '41' '45' '27' '38' '50' '48' '47' 'Less than 1 year' '49']
--------------------------------
.. YearsCodePro: [nan '17' '27' '7' '11' '25' '12' '10' '3' 'Less than 1 year' '18' '37'
 '15' '20' '6' '2' '16' '8' '14' '4' '45' '1' '24' '29' '5' '30' '26' '9'
 '33' '13' '35' '23' '22' '31' '19' '21' '28' '34' '32' '40' '50' '39'
 '44' '42' '41' '36' '38' 'More than 50 years' '43' '47' '48' '46' '49']
--------------------------------


In [33]:
for feat in NUM_FEATS:
    unique_values = raw_df[feat].unique()
    print(f".. {feat}: {unique_values}")
    print(f"--------------------------------")

.. Age: ['Under 18 years old' '35-44 years old' '45-54 years old'
 '18-24 years old' '25-34 years old' '55-64 years old' 'Prefer not to say'
 '65 years or older']
--------------------------------
.. YearsCode: [nan '20' '37' '4' '9' '10' '7' '1' '15' '30' '31' '6' '12' '22' '5' '36'
 '25' '44' '24' '18' '3' '8' 'More than 50 years' '11' '29' '40' '39' '2'
 '42' '34' '19' '35' '16' '33' '13' '23' '14' '28' '17' '21' '43' '46'
 '26' '32' '41' '45' '27' '38' '50' '48' '47' 'Less than 1 year' '49']
--------------------------------
.. YearsCodePro: [nan '17' '27' '7' '11' '25' '12' '10' '3' 'Less than 1 year' '18' '37'
 '15' '20' '6' '2' '16' '8' '14' '4' '45' '1' '24' '29' '5' '30' '26' '9'
 '33' '13' '35' '23' '22' '31' '19' '21' '28' '34' '32' '40' '50' '39'
 '44' '42' '41' '36' '38' 'More than 50 years' '43' '47' '48' '46' '49']
--------------------------------
.. OrgSize: [nan '100 to 499 employees' '2 to 9 employees'
 'Just me - I am a freelancer, sole proprietor, etc.' '10 to 19 empl

In [41]:
FEATS = ["LanguageHaveWorkedWith", 
         "DatabaseHaveWorkedWith", 
         "PlatformHaveWorkedWith", 
         "WebframeHaveWorkedWith", 
         "EmbeddedHaveWorkedWith", 
         "MiscTechHaveWorkedWith", 
         "ToolsTechHaveWorkedWith", 
         "NEWCollabToolsHaveWorkedWith", 
         "OpSysPersonal use", 
         "OpSysProfessional use", 
         "OfficeStackAsyncHaveWorkedWith", 
         "AISearchDevHaveWorkedWith"]

LABEL = ["DevType"]

raw_df[FEATS + LABEL].sample(1).iloc[0]

LanguageHaveWorkedWith            C;C#;Dart;HTML/CSS;Java;JavaScript;Kotlin;PHP;...
DatabaseHaveWorkedWith                    MariaDB;Microsoft SQL Server;MySQL;SQLite
PlatformHaveWorkedWith                                              Firebase;VMware
WebframeHaveWorkedWith                                        Spring Boot;WordPress
EmbeddedHaveWorkedWith                                          Arduino;Rasberry Pi
MiscTechHaveWorkedWith            .NET (5+) ;.NET Framework (1.0 - 4.8);Flutter;...
ToolsTechHaveWorkedWith           Dagger;Gradle;Homebrew;Maven (build tool);npm;...
NEWCollabToolsHaveWorkedWith      Android Studio;Eclipse;IntelliJ IDEA;Notepad++...
OpSysPersonal use                           Android;iOS;iPadOS;MacOS;Ubuntu;Windows
OpSysProfessional use                                      Android;iOS;iPadOS;MacOS
OfficeStackAsyncHaveWorkedWith    Confluence;GitHub Discussions;Jira;Markdown Fi...
AISearchDevHaveWorkedWith                                     ChatGPT;Google

In [38]:
raw_df['DevType'].unique()

array([nan, 'Developer, full-stack', 'Developer Experience', 'Student',
       'Academic researcher', 'Project manager', 'Developer Advocate',
       'Developer, back-end', 'Other (please specify):',
       'Developer, front-end', 'Database administrator',
       'Developer, desktop or enterprise applications',
       'Cloud infrastructure engineer',
       'Data scientist or machine learning specialist',
       'Research & Development role',
       'Developer, embedded applications or devices',
       'System administrator', 'DevOps specialist', 'Engineering manager',
       'Designer', 'Security professional',
       'Senior Executive (C-Suite, VP, etc.)', 'Developer, mobile',
       'Developer, game or graphics', 'Data or business analyst',
       'Educator', 'Developer, QA or test', 'Product manager',
       'Developer, AI', 'Scientist', 'Engineer, site reliability',
       'Blockchain', 'Marketing or sales professional',
       'Hardware Engineer', 'Data engineer'], dtype=object)

In [40]:
# The portion of students in the dataset 
# Student is not a job. We will delete this DevType 

students_df = raw_df[raw_df['DevType'] == 'Student']

print("Portion of students in the dataset: ")
print(f".. {students_df.shape[0] / raw_df.shape[0] * 100:.2f}%")

Portion of students in the dataset: 
.. 7.80%


In [47]:
FEATS = ["LanguageHaveWorkedWith", 
         "DatabaseHaveWorkedWith", 
         "PlatformHaveWorkedWith", 
         "WebframeHaveWorkedWith", 
         "EmbeddedHaveWorkedWith", 
         "MiscTechHaveWorkedWith", 
         "ToolsTechHaveWorkedWith", 
         "NEWCollabToolsHaveWorkedWith", 
         "OpSysPersonal use", 
         "OpSysProfessional use", 
         "OfficeStackAsyncHaveWorkedWith", 
         "AISearchDevHaveWorkedWith"]

LABEL = ["DevType"]

OTHER_FEATS = []
for feature in raw_df.columns:
    if feature not in FEATS + LABEL:
        OTHER_FEATS.append(feature)
        
raw_df[OTHER_FEATS].sample(1).iloc[0]


ResponseId                                      25642
MainBranch             I am a developer by profession
Age                                   25-34 years old
Employment                        Employed, full-time
RemoteWork                                  In-person
                                    ...              
JobSatPoints_11                                   NaN
SurveyLength                                 Too long
SurveyEase                                       Easy
ConvertedCompYearly                           24037.0
JobSat                                            NaN
Name: 25641, Length: 101, dtype: object

In [48]:
unique_main_branch = raw_df['MainBranch'].unique()
print(f"unique_main_branch: {unique_main_branch}")

unique_main_branch: ['I am a developer by profession' 'I am learning to code'
 'I code primarily as a hobby'
 'I am not primarily a developer, but I write code sometimes as part of my work/studies'
 'I used to be a developer by profession, but no longer am']


In [59]:
PRO_MAIN_BRANCH = ["I am a developer by profession", 
                   "I am not primarily a developer, but I write code sometimes as part of my work/studies", 
                   "I used to be a developer by profession, but no longer am"
                   ]

portion_sum = 0
for branch in PRO_MAIN_BRANCH:
    print(f"Portion of {branch} in the dataset")
    print(f".. {branch}: {raw_df[raw_df['MainBranch'] == branch].shape[0] / raw_df.shape[0] * 100:.2f}%")
    portion_sum += raw_df[raw_df['MainBranch'] == branch].shape[0] / raw_df.shape[0] * 100
    
print(f"------------------------------------------")
print(f"The sum of these portions is: {portion_sum:.2f}%")
print(f"The portion of people who code as a hobby: {(100 - portion_sum):.2f}%")

Portion of I am a developer by profession in the dataset
.. I am a developer by profession: 76.73%
Portion of I am not primarily a developer, but I write code sometimes as part of my work/studies in the dataset
.. I am not primarily a developer, but I write code sometimes as part of my work/studies: 9.95%
Portion of I used to be a developer by profession, but no longer am in the dataset
.. I used to be a developer by profession, but no longer am: 2.31%
------------------------------------------
The sum of these portions is: 88.98%
The portion of people who code as a hobby: 11.02%



These values insure as that the dataset could be used in our business case. 
- We have only 7% of students in the dataset
- Among professionals, we have 76% of developers that write code as part of their job and only 11% of programmers that code as a hobby
These values indicates that the tech stacks will be correlated with the DevType. In addition, they reflect a real relationship.

END OF NOTEBOOK