In [32]:
import pandas as pd
# Toy data set used throughout the notebook: each key is a column name and
# each list holds that column's values, aligned by row position.
people = {
    'first_name': ['Corey', 'Jane', 'Lee', 'Lee'],
    'last_name':  ['Chou', 'Kalvin', 'Leo', 'Dan'],
    'email':      ['aaa', 'bbb', 'ccc', 'ddd'],
}

# One row per person; pandas assigns the default integer index 0..3.
df_people = pd.DataFrame(data=people)

In [33]:
# Selecting a single column by its label returns that column as a Series.
df_people['email']

0    aaa
1    bbb
2    ccc
3    ddd
Name: email, dtype: object

### Setting index
The default index is the integer sequence 0, 1, 2, ... shown as the unnamed left-most column.

In [34]:
# Inspect the row index of the frame (no index was specified when it was built).
df_people.index

RangeIndex(start=0, stop=4, step=1)

In [35]:
# set_index returns a new DataFrame and leaves the original untouched, so the
# result has to be assigned back to keep 'email' as the index.
df_people = df_people.set_index('email')

In [36]:
# The index now holds the values of the former 'email' column.
df_people.index

Index(['aaa', 'bbb', 'ccc', 'ddd'], dtype='object', name='email')

In [37]:
# Label-based lookup: fetch the row whose index label is 'aaa'.
df_people.loc['aaa']

first_name    Corey
last_name      Chou
Name: aaa, dtype: object

In [38]:
# Once the default index has been replaced, the label 0 no longer exists,
# so the label-based lookup below would raise an error:
# df_people.loc[0]
df_people.iloc[0] # iloc is position-based, so it still works

first_name    Corey
last_name      Chou
Name: aaa, dtype: object

In [39]:
# Restore the default integer index; the previous index ('email') is moved
# back into the frame as a regular column. reset_index returns a new
# DataFrame, so the result is assigned back.
df_people = df_people.reset_index()

In [40]:
# Confirm that the index is the default integer range again.
df_people.index

RangeIndex(start=0, stop=4, step=1)

In [41]:
# Load the survey responses and the companion schema sheet.
# NOTE(review): both paths are relative, so they resolve against the
# notebook's working directory.
df = pd.read_csv('../../data-2019/survey_results_public.csv')
schema_df = pd.read_csv('../../data-2019/survey_results_schema.csv')

In [42]:
# Preview the first two rows of the survey data.
df.head(2)

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
0,1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
1,2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult


In [43]:
# Use the 'Respondent' column as the index. set_index returns a new
# DataFrame, so the result is assigned back to df.
df = df.set_index('Respondent')

In [44]:
# Preview again: 'Respondent' is now the row index instead of a column.
df.head(2)

Unnamed: 0_level_0,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,EduOther,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,"Taught yourself a new language, framework, or ...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult


In [45]:
# survey_results_schema.csv explains what each survey column (feature) means.
# Indexing the sheet by its 'Column' field lets us look up a question text
# directly by feature name with schema_df.loc[...].
schema_df = schema_df.set_index('Column')

In [46]:
# Displaying the whole row (schema_df.loc['MgrIdiot']) truncates long text,
# so select the single cell by row label and column label and print it,
# which shows the full question.
print(schema_df.loc['MgrIdiot', 'QuestionText'])

How confident are you that your manager knows what they’re doing?


In [47]:
# View the schema with its index labels (the feature names) in sorted order.
# The result is only displayed; it is not assigned back to schema_df.
schema_df.sort_index()

Unnamed: 0_level_0,QuestionText
Column,Unnamed: 1_level_1
Age,What is your age (in years)? If you prefer not...
Age1stCode,At what age did you write your first line of c...
BetterLife,Do you think people born today will have a bet...
BlockchainIs,Blockchain / cryptocurrency technology is prim...
BlockchainOrg,How is your organization thinking about or imp...
...,...
WorkPlan,How structured or planned is your work?
WorkRemote,How often do you work remotely?
WorkWeekHrs,"On average, how many hours per week do you work?"
YearsCode,"Including any education, how many years have y..."


### Filter

In [48]:
# Show the whole (small) example frame before filtering it.
df_people

Unnamed: 0,email,first_name,last_name
0,aaa,Corey,Chou
1,bbb,Jane,Kalvin
2,ccc,Lee,Leo
3,ddd,Lee,Dan


In [49]:
# Comparing a column with a scalar yields one boolean per row.
# Build that mask once, then show both its values and its type.
filt = (df_people['first_name'] == 'Lee')
print(filt)
print(type(filt))

0    False
1    False
2     True
3     True
Name: first_name, dtype: bool
<class 'pandas.core.series.Series'>


In [50]:
# Indexing the frame with the boolean mask keeps only the rows where it is True.
print(df_people[filt])

  email first_name last_name
2   ccc        Lee       Leo
3   ddd        Lee       Dan


In [51]:
# Combine the mask with .loc: the mask selects the rows and 'email' selects
# the column, so only the matching e-mail values are returned.
df_people.loc[filt, 'email']

2    ccc
3    ddd
Name: email, dtype: object

Logical operators: `&` means AND, `|` means OR.

In [52]:
# Row-wise AND of two conditions. Each comparison sits in its own parentheses
# because & binds more tightly than ==.
filt = (
    (df_people['first_name'] == 'Lee')
    & (df_people['last_name'] == 'Dan')
)

In [54]:
# ~ inverts the boolean mask, so this selects every row that does NOT match filt.
df_people.loc[~filt] # NOT operator

Unnamed: 0,email,first_name,last_name
0,aaa,Corey,Chou
1,bbb,Jane,Kalvin
2,ccc,Lee,Leo


In [None]:
# Remove the limit on how many rows pandas displays, so that every row of the
# sorted schema is shown instead of a shortened view.
# NOTE(review): set_option changes the setting for the rest of the session,
# not just for this cell.
pd.set_option('display.max_rows', None)
schema_df.sort_index()

Find all rows where the salary (`ConvertedComp`) is more than 70000.

In [67]:
# Boolean mask: True where the converted compensation exceeds 70000.
high_salary = df['ConvertedComp'].gt(70000)

In [72]:
# Keep only the high-salary rows and two columns of interest, then report
# the size of the selection as (rows, columns).
df.loc[high_salary, ['Country', 'LanguageWorkedWith']].shape

(22289, 2)

Filter on multiple values

In [73]:
# Countries to keep; isin() marks a row True when its 'Country' value is any
# one of them.
countries = [
    'United States',
    'India',
    'United Kingdom',
    'Germany',
    'Canada',
]
filt_country = df['Country'].isin(countries)

In [74]:
# Size (rows, columns) of the data restricted to the selected countries.
df.loc[filt_country].shape

(45008, 84)

Filter on the text content of a cell

In [75]:
# Peek at the raw values: each cell is a ';'-separated list of languages.
df['LanguageWorkedWith'].head(5)

Respondent
1                HTML/CSS;Java;JavaScript;Python
2                            C++;HTML/CSS;Python
3                                       HTML/CSS
4                            C;C++;C#;Python;SQL
5    C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA
Name: LanguageWorkedWith, dtype: object

In [76]:
# Mark respondents whose language list mentions 'Python'.
# regex=False treats the pattern as a literal substring, so the same cell also
# works for names that contain regex metacharacters (e.g. 'C++').
# na=False maps missing answers to False so the result is a pure boolean mask.
filt_language = df['LanguageWorkedWith'].str.contains('Python', na=False, regex=False)
df.loc[filt_language, 'LanguageWorkedWith'].shape

(36443,)