# DATA PREPARATION AND EXPORT

### 1. Import pandas library and read CSV file

In [13]:
import pandas as pd
import numpy as np

# Load the raw survey export; the first line of the file supplies the column names.
df = pd.read_csv('data/survey_results_public.csv', header=0, sep=",")
row_count = len(df)
print(f'Number of rows: {row_count}')
# Preview the first five rows as the cell's rich output.
df.head(n=5)

Number of rows: 89184


Unnamed: 0,ResponseId,Q120,MainBranch,Age,Employment,RemoteWork,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,...,Frequency_1,Frequency_2,Frequency_3,TimeSearching,TimeAnswering,ProfessionalTech,Industry,SurveyLength,SurveyEase,ConvertedCompYearly
0,1,I agree,None of these,18-24 years old,,,,,,,...,,,,,,,,,,
1,2,I agree,I am a developer by profession,25-34 years old,"Employed, full-time",Remote,Hobby;Contribute to open-source projects;Boots...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Colleague;Friend or fam...,Formal documentation provided by the owner of ...,...,1-2 times a week,10+ times a week,Never,15-30 minutes a day,15-30 minutes a day,DevOps function;Microservices;Automated testin...,"Information Services, IT, Software Development...",Appropriate in length,Easy,285000.0
2,3,I agree,I am a developer by profession,45-54 years old,"Employed, full-time","Hybrid (some remote, some in-person)",Hobby;Professional development or self-paced l...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Colleague;On the job tr...,Formal documentation provided by the owner of ...,...,6-10 times a week,6-10 times a week,3-5 times a week,30-60 minutes a day,30-60 minutes a day,DevOps function;Microservices;Automated testin...,"Information Services, IT, Software Development...",Appropriate in length,Easy,250000.0
3,4,I agree,I am a developer by profession,25-34 years old,"Employed, full-time","Hybrid (some remote, some in-person)",Hobby,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Colleague;Friend or family member;Other online...,Formal documentation provided by the owner of ...,...,1-2 times a week,10+ times a week,1-2 times a week,15-30 minutes a day,30-60 minutes a day,Automated testing;Continuous integration (CI) ...,,Appropriate in length,Easy,156000.0
4,5,I agree,I am a developer by profession,25-34 years old,"Employed, full-time;Independent contractor, fr...",Remote,Hobby;Contribute to open-source projects;Profe...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Online Courses or Certi...,Formal documentation provided by the owner of ...,...,1-2 times a week,1-2 times a week,3-5 times a week,60-120 minutes a day,30-60 minutes a day,Microservices;Automated testing;Observability ...,Other,Appropriate in length,Neither easy nor difficult,23456.0


### 2. Data Cleaning

#### I. Drop unnecessary columns

In [14]:
# Q120 has only one unique value, so it carries no information. List every column
# with fewer than 5 distinct values (NaN is counted as a value) to check whether
# there are more columns like it.
for name, values in df.items():
    distinct = values.unique()
    if len(distinct) < 5:
        print(f'Column name: {name:20} | unique values: {distinct}')

# Columns removed before further analysis:
#   Q120                     - a single unique value
#   SurveyLength, SurveyEase - irrelevant information for further analysis
#   SOAI                     - long strings with user opinions about AI; great for NLP,
#                              not exactly for this project
#   YearsCodePro             - NOTE(review): no reason is recorded for this drop; confirm it is intended
cols = ["Q120", "SurveyLength", "SurveyEase", 'SOAI', 'YearsCodePro']
df.drop(columns=cols, inplace=True)

Column name: Q120                 | unique values: ['I agree']
Column name: RemoteWork           | unique values: [nan 'Remote' 'Hybrid (some remote, some in-person)' 'In-person']
Column name: PurchaseInfluence    | unique values: [nan 'I have a great deal of influence' 'I have some influence'
 'I have little or no influence']
Column name: TechList             | unique values: [nan 'Investigate' 'Given a list' 'Other']
Column name: SOAccount            | unique values: [nan 'Yes' 'No' "Not sure/can't remember"]
Column name: AISelect             | unique values: [nan 'Yes' "No, and I don't plan to" 'No, but I plan to soon']
Column name: TBranch              | unique values: [nan 'Yes' 'No']
Column name: ICorPM               | unique values: [nan 'People manager' 'Individual contributor']
Column name: SurveyLength         | unique values: [nan 'Appropriate in length' 'Too long' 'Too short']
Column name: SurveyEase           | unique values: [nan 'Easy' 'Neither easy nor difficult' 'Diffi

#### II. Remove Duplicate and Incomplete Cases

In [15]:
# Remove rows in which more than 60% of the columns are null.
null_share = df.isnull().sum(axis=1) / len(df.columns)
sparse_rows = null_share[null_share > 0.6]
df.drop(index=sparse_rows.index, inplace=True)

print(f'Removed {len(sparse_rows)}')

Removed 5975


In [16]:
# Look for rows that are identical in every column except the first one
# (the first column is skipped so that the id does not mask repeated answers).
has_duplicate_answers = df.iloc[:, 1:].duplicated().any()
print(has_duplicate_answers)
# The ResponseId values themselves should all be unique.
has_duplicate_ids = df['ResponseId'].duplicated().any()
print(has_duplicate_ids)

False
False


#### III. Handle data types

In [17]:
# Show the dtype of every column; max_rows=None stops pandas from truncating the listing.
column_types = df.dtypes
with pd.option_context('display.max_rows', None):
    display(column_types)

ResponseId                               int64
MainBranch                              object
Age                                     object
Employment                              object
RemoteWork                              object
CodingActivities                        object
EdLevel                                 object
LearnCode                               object
LearnCodeOnline                         object
LearnCodeCoursesCert                    object
YearsCode                               object
DevType                                 object
OrgSize                                 object
PurchaseInfluence                       object
TechList                                object
BuyNewTool                              object
Country                                 object
Currency                                object
CompTotal                              float64
LanguageHaveWorkedWith                  object
LanguageWantToWorkWith                  object
DatabaseHaveW

#### IV. Treat null values

In [18]:
# Replace nulls in text (object-dtype) columns with the placeholder 'N/A';
# columns of any other dtype are left as they are.
cols = [
    name
    for name, values in df.items()
    if values.dtype == 'object' and values.isnull().any()
]
df = df.fillna({name: 'N/A' for name in cols})

# Show the remaining null count of every column, untruncated.
null_counts = df.isnull().sum()
with pd.option_context('display.max_rows', None):
    display(null_counts)

ResponseId                                 0
MainBranch                                 0
Age                                        0
Employment                                 0
RemoteWork                                 0
CodingActivities                           0
EdLevel                                    0
LearnCode                                  0
LearnCodeOnline                            0
LearnCodeCoursesCert                       0
YearsCode                                  0
DevType                                    0
OrgSize                                    0
PurchaseInfluence                          0
TechList                                   0
BuyNewTool                                 0
Country                                    0
Currency                                   0
CompTotal                              35164
LanguageHaveWorkedWith                     0
LanguageWantToWorkWith                     0
DatabaseHaveWorkedWith                     0
DatabaseWa

In [19]:
# Drop rows where the years of coding are not specified (YearsCode equals 'N/A').
unspecified_years = df.index[df['YearsCode'] == 'N/A']
df.drop(index=unspecified_years, inplace=True)

In [20]:
# Convert text values to numeric at first, later categorize data by specified ranges.
# The two textual answers are mapped to sentinel numbers just outside the numeric range.
df.loc[df['YearsCode'] == 'Less than 1 year', 'YearsCode'] = '0'
df.loc[df['YearsCode'] == 'More than 50 years', 'YearsCode'] = '51'

years_coded = df['YearsCode'].astype(np.int32)

# pd.cut builds right-closed intervals by default, so every edge is the LAST value
# of its bin: (-inf, 0] -> '<1', (0, 3] -> '1-3', (3, 7] -> '4-7', ..., (50, inf) -> '50<'.
# The first edge has to be 0 (not 1); otherwise respondents with exactly one year
# of coding would be labelled '<1' instead of '1-3'.
bins = [-np.inf, 0, 3, 7, 13, 20, 30, 40, 50, np.inf]
names = ['<1', '1-3', '4-7', '8-13', '14-20', '21-30', '31-40', '41-50', '50<']

df['YearsCode'] = pd.cut(years_coded, bins, labels=names)

#### V. Check and remove nonsense answers and unreadable data

In [21]:
# Each of these columns stores the selected answers as one ';'-separated string.
# Split every column and list its distinct answers so that placeholder or
# nonsense options can be spotted.
answer_columns = [
    # (column to inspect, heading printed above its answers)
    ('CodingActivities', 'CodingActivities answers: '),
    ('LearnCode', 'LearnCode answers: '),
    # LearnCodeOnline contains the nonsense answer 'Click to write Choice 20' (to delete).
    ('LearnCodeOnline', 'LearnCodeOnline answers:'),
    ('LearnCodeCoursesCert', 'LearnCodeCoursesCert answers:'),
    ('DevType', 'DevType answers:'),
    ('TechList', 'TechList answers:'),
    ('BuyNewTool', 'BuyNewTool answers:'),
    ('LanguageHaveWorkedWith', 'Language answers:'),
]

# option_context restores the previous display settings when the block exits,
# even if one of the columns raises, so the wider limits never leak into later cells.
with pd.option_context('display.max_rows', 100, 'display.max_colwidth', 200):
    for column_name, heading in answer_columns:
        print(heading)
        display(df[column_name].str.split(';').explode().unique())

# It seems that multiple-choice questions that also accept free user input do not
# contain such placeholder data, so further checking of this type of column is omitted.

CodingActivities answers: 


array(['Hobby', 'Contribute to open-source projects',
       'Bootstrapping a business',
       'Professional development or self-paced learning from online courses',
       'N/A', 'I don’t code outside of work', 'Freelance/contract work',
       'School or academic work', 'Other (please specify):'], dtype=object)

LearnCode answers: 


array(['Books / Physical media', 'Colleague', 'Friend or family member',
       'Hackathons (virtual or in-person)',
       'Online Courses or Certification', 'On the job training',
       'Other online resources (e.g., videos, blogs, forum)',
       'School (i.e., University, College, etc)', 'Coding Bootcamp',
       'Other (please specify):', 'N/A'], dtype=object)

LearnCodeOnline answers:


array(['Formal documentation provided by the owner of the tech',
       'Blogs with tips and tricks', 'Books', 'Recorded coding sessions',
       'How-to videos', 'Video-based Online Courses',
       'Written-based Online Courses',
       'Auditory material (e.g., podcasts)',
       'Online challenges (e.g., daily or weekly coding challenges)',
       'Written Tutorials', 'Click to write Choice 20', 'Stack Overflow',
       'Interactive tutorial', 'Certification videos', 'N/A',
       'Programming Games', 'Other (Please specify):'], dtype=object)

LearnCodeCoursesCert answers:


array(['Other', 'N/A', 'Codecademy', 'edX', 'Udemy', 'Pluralsight',
       'Coursera', 'Udacity', 'Skillsoft'], dtype=object)

DevType answers:


array(['Senior Executive (C-Suite, VP, etc.)', 'Developer, back-end',
       'Developer, front-end', 'Developer, full-stack',
       'System administrator',
       'Developer, desktop or enterprise applications',
       'Developer, QA or test', 'Designer',
       'Data scientist or machine learning specialist',
       'Data or business analyst', 'Security professional',
       'Research & Development role', 'N/A', 'Other (please specify):',
       'Developer, mobile', 'Database administrator',
       'Developer, embedded applications or devices', 'Student',
       'Engineer, data', 'Product manager', 'Academic researcher',
       'Developer, game or graphics', 'Cloud infrastructure engineer',
       'Engineering manager', 'Developer Experience', 'Project manager',
       'DevOps specialist', 'Engineer, site reliability', 'Blockchain',
       'Developer Advocate', 'Educator', 'Scientist', 'Hardware Engineer',
       'Marketing or sales professional'], dtype=object)

TechList answers:


array(['Investigate', 'Given a list', 'N/A', 'Other'], dtype=object)

BuyNewTool answers:


array(['Start a free trial', 'Ask developers I know/work with',
       'Visit developer communities like Stack Overflow',
       'Other (please specify):',
       'Research companies that have advertised on sites I visit',
       'Read ratings or reviews on third party sites like G2 Crowd',
       'Ask a generative AI tool',
       'Research companies that have emailed me', 'N/A'], dtype=object)

Language answers:


array(['HTML/CSS', 'JavaScript', 'Python', 'Bash/Shell (all shells)',
       'Go', 'PHP', 'Ruby', 'SQL', 'TypeScript', 'Ada', 'Clojure',
       'Elixir', 'Java', 'Lisp', 'OCaml', 'Raku', 'Scala', 'Swift', 'Zig',
       'Rust', 'C#', 'PowerShell', 'C++', 'Kotlin', 'Solidity', 'C',
       'Perl', 'Dart', 'Haskell', 'Assembly', 'Delphi', 'R', 'Lua', 'VBA',
       'Visual Basic (.Net)', 'Julia', 'MATLAB', 'F#', 'Groovy', 'APL',
       'Objective-C', 'GDScript', 'Crystal', 'Erlang', 'Cobol', 'Fortran',
       'Prolog', 'Apex', 'N/A', 'SAS', 'Nim', 'Flow'], dtype=object)

In [22]:
# List the column labels currently present in the frame.
remaining_columns = df.columns
print(remaining_columns)

Index(['ResponseId', 'MainBranch', 'Age', 'Employment', 'RemoteWork',
       'CodingActivities', 'EdLevel', 'LearnCode', 'LearnCodeOnline',
       'LearnCodeCoursesCert', 'YearsCode', 'DevType', 'OrgSize',
       'PurchaseInfluence', 'TechList', 'BuyNewTool', 'Country', 'Currency',
       'CompTotal', 'LanguageHaveWorkedWith', 'LanguageWantToWorkWith',
       'DatabaseHaveWorkedWith', 'DatabaseWantToWorkWith',
       'PlatformHaveWorkedWith', 'PlatformWantToWorkWith',
       'WebframeHaveWorkedWith', 'WebframeWantToWorkWith',
       'MiscTechHaveWorkedWith', 'MiscTechWantToWorkWith',
       'ToolsTechHaveWorkedWith', 'ToolsTechWantToWorkWith',
       'NEWCollabToolsHaveWorkedWith', 'NEWCollabToolsWantToWorkWith',
       'OpSysPersonal use', 'OpSysProfessional use',
       'OfficeStackAsyncHaveWorkedWith', 'OfficeStackAsyncWantToWorkWith',
       'OfficeStackSyncHaveWorkedWith', 'OfficeStackSyncWantToWorkWith',
       'AISearchHaveWorkedWith', 'AISearchWantToWorkWith',
       'AIDevHave

In [23]:
# Drop every row whose ConvertedCompYearly is above 1e6 (one million).
# NOTE(review): presumably such values are treated as implausible answers; confirm the threshold.
above_limit = df.index[df["ConvertedCompYearly"] > 1e6]
print(f'Removed {len(above_limit)}')
df.drop(index=above_limit, inplace=True)

Removed 60


### Save File

In [24]:
# Export the cleaned frame as CSV: column names on the first line, no index column.
# `header` and `index` are boolean flags, so real booleans are passed instead of
# relying on the truthiness of 1 / None.
df.to_csv('data/pandas_processed.csv', sep=',', header=True, index=False)

### Testing section

In [22]:
# For every column that is neither float64 nor int64, print the length of its
# longest string value.
for name, values in df.items():
    if values.dtype == 'float64' or values.dtype == 'int64':
        continue
    longest = values.str.len().max()
    print(f'{name}: {longest}')

MainBranch: 85
Age: 18
Employment: 212
RemoteWork: 36
CodingActivities: 205
EdLevel: 82
LearnCode: 274
LearnCodeOnline: 419
LearnCodeCoursesCert: 65
YearsCode: 11
DevType: 45
OrgSize: 50
PurchaseInfluence: 32
TechList: 12
BuyNewTool: 303
Country: 52
Currency: 43
LanguageHaveWorkedWith: 340
LanguageWantToWorkWith: 340
DatabaseHaveWorkedWith: 314
DatabaseWantToWorkWith: 314
PlatformHaveWorkedWith: 278
PlatformWantToWorkWith: 278
WebframeHaveWorkedWith: 274
WebframeWantToWorkWith: 274
MiscTechHaveWorkedWith: 345
MiscTechWantToWorkWith: 345
ToolsTechHaveWorkedWith: 421
ToolsTechWantToWorkWith: 421
NEWCollabToolsHaveWorkedWith: 370
NEWCollabToolsWantToWorkWith: 370
OpSysPersonal use: 180
OpSysProfessional use: 180
OfficeStackAsyncHaveWorkedWith: 386
OfficeStackAsyncWantToWorkWith: 386
OfficeStackSyncHaveWorkedWith: 212
OfficeStackSyncWantToWorkWith: 212
AISearchHaveWorkedWith: 104
AISearchWantToWorkWith: 104
AIDevHaveWorkedWith: 122
AIDevWantToWorkWith: 122
NEWSOSites: 151
SOVisitFreq: 35
S