### Loading data

In [2]:
import pandas as pd

In [3]:
# Read the public survey results CSV into a DataFrame.
survey_csv = '../../data-2019/survey_results_public.csv'
df = pd.read_csv(survey_csv)

In [4]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
df.shape

(88883, 85)

In [5]:
# Per-column summary: non-null count and dtype of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88883 entries, 0 to 88882
Data columns (total 85 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Respondent              88883 non-null  int64  
 1   MainBranch              88331 non-null  object 
 2   Hobbyist                88883 non-null  object 
 3   OpenSourcer             88883 non-null  object 
 4   OpenSource              86842 non-null  object 
 5   Employment              87181 non-null  object 
 6   Country                 88751 non-null  object 
 7   Student                 87014 non-null  object 
 8   EdLevel                 86390 non-null  object 
 9   UndergradMajor          75614 non-null  object 
 10  EduOther                84260 non-null  object 
 11  OrgSize                 71791 non-null  object 
 12  DevType                 81335 non-null  object 
 13  YearsCode               87938 non-null  object 
 14  Age1stCode              87634 non-null

In [6]:
# Read the schema CSV, which pairs each survey column name with its question text.
schema_csv = '../../data-2019/survey_results_schema.csv'
schema_df = pd.read_csv(schema_csv)

In [7]:
# Show the schema table (Column, QuestionText); pandas elides the middle rows in the display.
schema_df

Unnamed: 0,Column,QuestionText
0,Respondent,Randomized respondent ID number (not in order ...
1,MainBranch,Which of the following options best describes ...
2,Hobbyist,Do you code as a hobby?
3,OpenSourcer,How often do you contribute to open source?
4,OpenSource,How do you feel about the quality of open sour...
...,...,...
80,Sexuality,Which of the following do you currently identi...
81,Ethnicity,Which of the following do you identify as? Ple...
82,Dependents,"Do you have any dependents (e.g., children, el..."
83,SurveyLength,How do you feel about the length of the survey...


In [8]:
# Optional display settings (uncomment to apply):
# pd.set_option('display.max_columns', 85)  # show up to 85 columns before the display is truncated
# pd.set_option('display.max_rows', 85)     # show up to 85 rows before the display is truncated

Display a certain number of rows from the top (`head`) or the bottom (`tail`) of the DataFrame

In [9]:
# First 2 rows of the survey data.
df.head(2)

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
0,1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
1,2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult


In [10]:
# Last 2 rows of the survey data.
df.tail(2)

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
88881,88816,,No,Never,"OSS is, on average, of HIGHER quality than pro...","Independent contractor, freelancer, or self-em...",,,,,...,,,,,,,,,,
88882,88863,,Yes,Less than once per year,"OSS is, on average, of HIGHER quality than pro...","Not employed, and not looking for work",Spain,"Yes, full-time","Professional degree (JD, MD, etc.)","Computer science, computer engineering, or sof...",...,Somewhat less welcome now than last year,Tech articles written by other developers;Indu...,18.0,Man,No,Straight / Heterosexual,Hispanic or Latino/Latina;White or of European...,No,Appropriate in length,Easy


### DataFrame basics

In [11]:
# Column-oriented sample data, to be turned into a DataFrame:
# each key is a column name and each list holds that column's values.
people = dict(
    first_name=['Corey', 'Jane', 'Lee'],
    last_name=['Chou', 'Kalvin', 'Leo'],
    email=['aaa', 'bbb', 'ccc'],
)

In [14]:
# Build a DataFrame from the dict: keys become column labels, list items become the rows.
df_people = pd.DataFrame(data=people)
df_people

Unnamed: 0,first_name,last_name,email
0,Corey,Chou,aaa
1,Jane,Kalvin,bbb
2,Lee,Leo,ccc


In [16]:
# Selecting a single column returns a Series: a 1-D labeled array holding that column's values.
# Print the column and then its type, one per line.
print(df_people['last_name'], type(df_people['last_name']), sep='\n')

0      Chou
1    Kalvin
2       Leo
Name: last_name, dtype: object
<class 'pandas.core.series.Series'>


In [17]:
# Accessing multiple columns: pass a list of column names; the result is a DataFrame.
df_people[['last_name','email']]

Unnamed: 0,last_name,email
0,Chou,aaa
1,Kalvin,bbb
2,Leo,ccc


In [19]:
# Column labels of the DataFrame.
df_people.columns

Index(['first_name', 'last_name', 'email'], dtype='object')

In [24]:
# Accessing rows by integer position with iloc ("integer location").
# print() is used for the first lookup because only the cell's last expression is displayed automatically.
print(df_people.iloc[0]) # a single position returns the first row as a Series
df_people.iloc[[0,1]] # a list of positions returns the first and second rows as a DataFrame

first_name    Corey
last_name      Chou
email           aaa
Name: 0, dtype: object


Unnamed: 0,first_name,last_name,email
0,Corey,Chou,aaa
1,Jane,Kalvin,bbb


In [25]:
# iloc with a row selector and a column selector:
# rows at positions 0 and 1, column at position 2 (the email column).
df_people.iloc[[0,1], 2]

0    aaa
1    bbb
Name: email, dtype: object

In [28]:
# Accessing rows by label with loc.
df_people.loc[[0, 1]] # the default row labels are the integers 0, 1, 2, ..., so here they coincide with positions

Unnamed: 0,first_name,last_name,email
0,Corey,Chou,aaa
1,Jane,Kalvin,bbb


In [31]:
# Column labels are the names we supplied (the dict keys), so columns are selected by name.
# The result follows the order given in the list (email before last_name).
df_people.loc[[0,1],['email', 'last_name']]

Unnamed: 0,email,last_name
0,aaa,Chou
1,bbb,Kalvin


Back to our survey data

In [32]:
# Dimensions of the survey DataFrame as (rows, columns).
df.shape

(88883, 85)

In [33]:
# Column labels of the survey DataFrame.
df.columns

Index(['Respondent', 'MainBranch', 'Hobbyist', 'OpenSourcer', 'OpenSource',
       'Employment', 'Country', 'Student', 'EdLevel', 'UndergradMajor',
       'EduOther', 'OrgSize', 'DevType', 'YearsCode', 'Age1stCode',
       'YearsCodePro', 'CareerSat', 'JobSat', 'MgrIdiot', 'MgrMoney',
       'MgrWant', 'JobSeek', 'LastHireDate', 'LastInt', 'FizzBuzz',
       'JobFactors', 'ResumeUpdate', 'CurrencySymbol', 'CurrencyDesc',
       'CompTotal', 'CompFreq', 'ConvertedComp', 'WorkWeekHrs', 'WorkPlan',
       'WorkChallenge', 'WorkRemote', 'WorkLoc', 'ImpSyn', 'CodeRev',
       'CodeRevHrs', 'UnitTests', 'PurchaseHow', 'PurchaseWhat',
       'LanguageWorkedWith', 'LanguageDesireNextYear', 'DatabaseWorkedWith',
       'DatabaseDesireNextYear', 'PlatformWorkedWith',
       'PlatformDesireNextYear', 'WebFrameWorkedWith',
       'WebFrameDesireNextYear', 'MiscTechWorkedWith',
       'MiscTechDesireNextYear', 'DevEnviron', 'OpSys', 'Containers',
       'BlockchainOrg', 'BlockchainIs', 'BetterLife'

In [35]:
# Count how many times each distinct value occurs in the column.
df['Hobbyist'].value_counts()

Yes    71257
No     17626
Name: Hobbyist, dtype: int64

In [40]:
# Rows labeled 0, 1 and 2 of the Hobbyist column.
# Wrapping the column name in a list keeps the result a DataFrame.
df.loc[[0,1,2],['Hobbyist']]

Unnamed: 0,Hobbyist
0,Yes
1,No
2,Yes


In [43]:
# Listing labels such as [0,1,2] is cumbersome, so use slices instead.
# loc slices are label-based and include BOTH endpoints:
# rows 0 through 4, columns 'Hobbyist' through 'Country'.
df.loc[:4, 'Hobbyist':'Country']

Unnamed: 0,Hobbyist,OpenSourcer,OpenSource,Employment,Country
0,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom
1,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina
2,Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand
3,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States
4,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Ukraine
