In [2]:
import pandas as pd 

In [3]:
# Location of the raw survey export: file name first, then the directory
# (relative to this notebook) that contains it.
RAW_DATA = "survey_results_public.csv"
DATA_PATH = "../data/raw/"

In [5]:
# Load the raw survey CSV into a DataFrame; the path is the configured
# directory followed directly by the configured file name.
df = pd.read_csv(f"{DATA_PATH}{RAW_DATA}")

In [6]:
# Report the dataset dimensions as a (rows, columns) tuple.

print("Dataset size : {}".format(df.shape))

Dataset size : (83439, 48)


In [12]:
# For every column, report how many entries are missing (NaN) and what
# share of the dataset's rows that represents.

for column in df.columns:
    # Count the missing entries once and reuse the value for both lines.
    nan_count = df[column].isna().sum()
    missing_percentage = (nan_count / df.shape[0]) * 100
    print(f"Number of NaN in {column} : {nan_count}")
    print(f"Which is {missing_percentage:.2f}% of the dataset.")
    print("---------------------------------------------------------")

Number of NaN in ResponseId : 0
Which is 0.00% of the dataset.
---------------------------------------------------------
Number of NaN in MainBranch : 0
Which is 0.00% of the dataset.
---------------------------------------------------------
Number of NaN in Employment : 116
Which is 0.14% of the dataset.
---------------------------------------------------------
Number of NaN in Country : 0
Which is 0.00% of the dataset.
---------------------------------------------------------
Number of NaN in US_State : 68519
Which is 82.12% of the dataset.
---------------------------------------------------------
Number of NaN in UK_Country : 79021
Which is 94.71% of the dataset.
---------------------------------------------------------
Number of NaN in EdLevel : 313
Which is 0.38% of the dataset.
---------------------------------------------------------
Number of NaN in Age1stCode : 196
Which is 0.23% of the dataset.
---------------------------------------------------------
Number of NaN in LearnCo

In [26]:
# Save the names of the columns whose missing percentage is greater than
# the threshold below (50%), printing each one as it is found.
# The threshold is a named constant so the comment and the comparison
# cannot drift apart.

MISSING_PERCENTAGE_THRESHOLD = 50

high_missing_percentage_columns = []

for column in df.columns:
    # Share of rows (in percent) where this column is NaN.
    missing_percentage = (df[column].isna().sum() / df.shape[0]) * 100
    if missing_percentage > MISSING_PERCENTAGE_THRESHOLD:
        high_missing_percentage_columns.append(column)
        print(f"- {column}")
        print("---------------------------------------------------------")

- US_State
---------------------------------------------------------
- UK_Country
---------------------------------------------------------
- PlatformWantToWorkWith
---------------------------------------------------------
- MiscTechWantToWorkWith
---------------------------------------------------------


In [22]:
# Display one randomly sampled row to see what the columns with many
# missing values (e.g. US_State and UK_Country) look like in context.

# `.sample(1).index[0]` yields an index *label*, so the row is looked up
# with label-based `.loc`. Position-based `.iloc` only returns the same
# row while the frame still has its default 0..n-1 index.
index = df[high_missing_percentage_columns].sample(1).index[0]
df.loc[index]

ResponseId                                                                  20884
MainBranch                                         I am a developer by profession
Employment                      Independent contractor, freelancer, or self-em...
Country                                                                    France
US_State                                                                      NaN
UK_Country                                                                    NaN
EdLevel                         Some college/university study without earning ...
Age1stCode                                                          11 - 17 years
LearnCode                       Other online resources (ex: videos, blogs, etc...
YearsCode                                                                      24
YearsCodePro                                                                   24
DevType                         Developer, front-end;Developer, desktop or ent...
OrgSize         

In [40]:
# Display one randomly sampled respondent whose Country is
# 'United States of America'.
# The sampled value is an index *label* of the filtered frame, so the row
# is fetched from `df` with label-based `.loc` rather than position-based
# `.iloc` (the two only coincide under the default 0..n-1 index).
index = df[df['Country'] == 'United States of America'].sample(1).index[0]
df.loc[index]

ResponseId                                                                  52731
MainBranch                                         I am a developer by profession
Employment                                                     Employed full-time
Country                                                  United States of America
US_State                                                                    Idaho
UK_Country                                                                    NaN
EdLevel                              Bachelor’s degree (B.A., B.S., B.Eng., etc.)
Age1stCode                                                          11 - 17 years
LearnCode                                          School;Other (please specify):
YearsCode                                                                       8
YearsCodePro                                                                    7
DevType                                                     Developer, full-stack
OrgSize         

In [27]:
# Summarise every column's non-null count and dtype. The Dtype column of
# the report shows which features pandas parsed as numerical (e.g. int64)
# and which were left as object.

df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83439 entries, 0 to 83438
Data columns (total 48 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   ResponseId                    83439 non-null  int64  
 1   MainBranch                    83439 non-null  object 
 2   Employment                    83323 non-null  object 
 3   Country                       83439 non-null  object 
 4   US_State                      14920 non-null  object 
 5   UK_Country                    4418 non-null   object 
 6   EdLevel                       83126 non-null  object 
 7   Age1stCode                    83243 non-null  object 
 8   LearnCode                     82963 non-null  object 
 9   YearsCode                     81641 non-null  object 
 10  YearsCodePro                  61216 non-null  object 
 11  DevType                       66484 non-null  object 
 12  OrgSize                       60726 non-null  object 
 13  C

In [39]:
# Count how many columns pandas parsed as numerical vs. non-numerical.
#
# The dtype test goes through pandas' own type helper, reachable from the
# already-imported `pd`. The previous check referenced
# `numpy.dtypes.ObjectDType`, but `numpy` is not imported by the import
# cell, so it raised a NameError on a fresh kernel.

column_type_counts = {
    'non_numerical_columns': 0,
    'numerical_columns': 0
}

for column in df.columns:
    if pd.api.types.is_numeric_dtype(df[column]):
        column_type_counts['numerical_columns'] += 1
    else:
        column_type_counts['non_numerical_columns'] += 1

# NOTE: this count is not fully accurate. The recorded run reports
# 3 numerical columns, but there are 2 more numerical features that
# pandas loaded with the object dtype.
column_type_counts

{'non_numerical_columns': 45, 'numerical_columns': 3}

In [42]:
# Inspect the distinct raw values of the two year-count features to see
# why they were loaded with the object dtype.

num_feats = ['YearsCode', 'YearsCodePro']

for feat in num_feats:
    # One print call per feature: the values, then the separator line.
    print(
        f"Unique values: \n{df[feat].unique()}",
        "-------------------------------------",
        sep="\n",
    )

Unique values: 
[nan '7' '17' '3' '4' '6' '16' '12' '15' '10' '40' '9' '26' '14' '39' '20'
 '8' '19' '5' 'Less than 1 year' '22' '2' '1' '34' '21' '13' '25' '24'
 '30' '31' '18' '38' 'More than 50 years' '27' '41' '42' '35' '23' '28'
 '11' '37' '44' '43' '36' '33' '45' '29' '50' '46' '32' '47' '49' '48']
-------------------------------------
Unique values: 
[nan '10' '4' '5' '6' '2' '30' '9' '18' '12' '21' '1' '16'
 'Less than 1 year' '15' '3' '35' '7' '8' '17' '14' '26' '25' '20' '50'
 '34' '11' '24' '22' '13' '31' '23' '39' '41' '27' '28' '19' '33'
 'More than 50 years' '37' '29' '32' '43' '40' '38' '45' '42' '46' '36'
 '44' '47' '48' '49']
-------------------------------------


In [43]:
# These two features contain string values (e.g. 'Less than 1 year'),
# so pandas cannot interpret the columns as numerical.
# We should handle this issue in the pre-processing step.

In [48]:
# Inspect the distinct raw values of two more features that might be
# usable as numerical inputs.

other_pnum_feats = ['Age1stCode', 'CompTotal']
for feat in other_pnum_feats:
    # One print call per feature: the values, then the separator line.
    print(
        f"Unique values: \n{df[feat].unique()}",
        "-------------------------------------",
        sep="\n",
    )

Unique values: 
['18 - 24 years' '11 - 17 years' '5 - 10 years' '25 - 34 years'
 '35 - 44 years' 'Younger than 5 years' '45 - 54 years' '55 - 64 years'
 nan 'Older than 64 years']
-------------------------------------
Unique values: 
[4.800e+03       nan 4.200e+04 ... 8.880e+04 6.500e+01 1.605e+05]
-------------------------------------


In [47]:
# Show the distinct values recorded in the Age column.
df.loc[:, 'Age'].unique()

array(['25-34 years old', '18-24 years old', '35-44 years old',
       'Prefer not to say', '45-54 years old', 'Under 18 years old',
       '65 years or older', '55-64 years old', nan], dtype=object)

In [None]:
# We also have the SOAccount feature, which behaves like a boolean
# feature with three possible values:
# Yes, No, and Not sure, which could be translated to 1, -1, and 0.

# ---------------------------------------------------------------------

# SOComm is another feature that takes the form of a slider
# with 5 or 6 possible values.