### Update data

In [22]:
import pandas as pd
# Toy data set: one list of values per column; the lists share the same row order.
people = dict(
    first_name=['Corey', 'Jane', 'Lee', 'Lee'],
    last_name=['Chou', 'Kalvin', 'Leo', 'Dan'],
    email=['aaa', 'bbb', 'ccc', 'ddd'],
)
# Build the working DataFrame used by the examples that follow.
df_people = pd.DataFrame(data=people)

In [2]:
# Preview the first two rows to see the column headers before renaming them.
df_people.head(2)

Unnamed: 0,first_name,last_name,email
0,Corey,Chou,aaa
1,Jane,Kalvin,bbb


In [26]:
# Assigning to .columns relabels every column at once, so the list must
# contain one name per existing column, in the current column order.
new_headers = ['first_n', 'last_n', 'email']
df_people.columns = new_headers

In [4]:
# Preview the first two rows to confirm the new column headers.
df_people.head(2)

Unnamed: 0,first_n,last_n,email
0,Corey,Chou,aaa
1,Jane,Kalvin,bbb


In [27]:
# Upper-case every column label via the vectorised string accessor on the column Index.
df_people.columns = df_people.columns.str.upper()

In [6]:
# Preview the first two rows to confirm the headers are now upper case.
df_people.head(2)

Unnamed: 0,FIRST_N,LAST_N,EMAIL
0,Corey,Chou,aaa
1,Jane,Kalvin,bbb


In [28]:
# Lower-case every column label via the vectorised string accessor on the column Index.
df_people.columns = df_people.columns.str.lower()

Rename only specific columns

In [29]:
# rename() only touches the columns listed in the mapping; all other columns keep
# their names. Reassigning the result (instead of inplace=True) makes the data
# flow explicit and keeps the cell safe to re-run.
df_people = df_people.rename(columns={'first_n': 'first', 'last_n': 'last'})

In [30]:
# Preview the first two rows to confirm that only the mapped columns were renamed.
df_people.head(2)

Unnamed: 0,first,last,email
0,Corey,Chou,aaa
1,Jane,Kalvin,bbb


Change the values

In [31]:
# Overwrite a whole row: the list supplies one value per column, in column order.
df_people.loc[1] = ['John', 'Smith', 'johnsmith@gmail.com']
# Overwrite selected columns of one row: row label first, then the list of column labels.
target_columns = ['first', 'last', 'email']
df_people.loc[3, target_columns] = ['Doe', 'Reeves', 'doereeves@email.com']

In [32]:
# Display the whole (four-row) frame to check the updated rows 1 and 3.
df_people

Unnamed: 0,first,last,email
0,Corey,Chou,aaa
1,John,Smith,johnsmith@gmail.com
2,Lee,Leo,ccc
3,Doe,Reeves,doereeves@email.com


In [35]:
# .at reads or writes exactly one cell, addressed by a (row label, column label)
# pair; it is the single-value counterpart of .loc.
row_label, column_label = 2, 'email'
df_people.at[row_label, column_label] = 'leeleo@gmail.com'
df_people

Unnamed: 0,first,last,email
0,Corey,Chou,aaa
1,John,Smith,johnsmith@gmail.com
2,Lee,Leo,leeleo@gmail.com
3,Doe,Reeves,doereeves@email.com


A boolean filter used on its own (`df[filt]`) is for reading values, not for updating them.  
Use `.loc` or `.at` when you need to update values.

In [40]:
# Boolean mask: True for the rows whose email equals 'leeleo@gmail.com'.
filt = (df_people['email']=='leeleo@gmail.com')
# Anti-pattern shown on purpose (chained indexing): pandas warns that the value is
# being set on a copy of a slice, so this must not be relied on to update df_people.
# The warning recommends the .loc[row_indexer, col_indexer] = value form instead,
# i.e. df_people.loc[filt, 'last'] = 'Smith'.
df_people[filt]['last'] = 'Smith'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_people[filt]['last'] = 'Smith'


Updating multiple rows

apply, map, applymap, replace

In [41]:
# Series.apply calls the given function once per element;
# with len this yields the length of each email string.
email_column = df_people['email']
email_column.apply(len)

0     3
1    19
2    16
3    19
Name: email, dtype: int64

In [42]:
def update_email(email):
    """Return ``email`` with every character converted to upper case."""
    uppercased = email.upper()
    return uppercased

In [45]:
# Run update_email on every email, then store the results back in the same column.
uppercased_emails = df_people['email'].apply(update_email)
df_people['email'] = uppercased_emails

In [46]:
# Display the frame to check that the email column is now upper case.
df_people

Unnamed: 0,first,last,email
0,Corey,Chou,AAA
1,John,Smith,JOHNSMITH@GMAIL.COM
2,Lee,Leo,LEELEO@GMAIL.COM
3,Doe,Reeves,DOEREEVES@EMAIL.COM


In [47]:
# Change the email values back to lower case, passing the unbound str method
# directly instead of wrapping it in a lambda.
df_people['email'] = df_people['email'].apply(str.lower)

In [48]:
# Display the frame to check that the email column is lower case again.
df_people

Unnamed: 0,first,last,email
0,Corey,Chou,aaa
1,John,Smith,johnsmith@gmail.com
2,Lee,Leo,leeleo@gmail.com
3,Doe,Reeves,doereeves@email.com


`apply` also works on a DataFrame, not just on a Series; the function then receives each column as a Series.

In [49]:
# On a DataFrame, apply passes each column (a Series) to the function, so len
# returns the number of rows in each column rather than per-cell string lengths.
df_people.apply(len) # which is probably not what we expected

first    4
last     4
email    4
dtype: int64

In [50]:
# Each column is handed to Series.min, giving the smallest value per column
# (for these string columns, the first value in sort order).
df_people.apply(pd.Series.min, axis=0)

first    Corey
last      Chou
email      aaa
dtype: object

In [51]:
# The same result with a lambda: the argument is one column (a Series) at a time.
df_people.apply(lambda column: column.min())

first    Corey
last      Chou
email      aaa
dtype: object

applymap for DataFrame

In [52]:
# Element-wise application: the function is called on every individual cell.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old name
# is deprecated, so pick whichever element-wise method this pandas version provides.
elementwise = df_people.map if hasattr(pd.DataFrame, 'map') else df_people.applymap
elementwise(len)

Unnamed: 0,first,last,email
0,5,4,3
1,4,5,19
2,3,3,16
3,3,6,19


In [57]:
# Lower-case every cell of the frame (all columns hold strings here).
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old name
# is deprecated, so pick whichever element-wise method this pandas version provides.
elementwise = df_people.map if hasattr(pd.DataFrame, 'map') else df_people.applymap
df_people = elementwise(str.lower)

In [58]:
# Display the frame to check that every cell is now lower case.
df_people

Unnamed: 0,first,last,email
0,corey,chou,aaa
1,john,smith,johnsmith@gmail.com
2,lee,leo,leeleo@gmail.com
3,doe,reeves,doereeves@email.com


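`map` works on a Series: each value is looked up in the dictionary, and values without an entry become `NaN` (`replace`, shown right after, leaves such values unchanged).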

In [59]:
# Series.map looks each value up in the dictionary.
# Pay attention to the NaN: 'doe' has no entry, so it is not kept as-is.
first_name_lookup = {'corey': 'Chris', 'john': 'Willam', 'lee': 'Lee'}
df_people['first'].map(first_name_lookup)

0     Chris
1    Willam
2       Lee
3       NaN
Name: first, dtype: object

In [61]:
# Series.replace substitutes only the values that have an entry in the dictionary;
# every other value ('doe' here) is left as it is.
first_name_substitutions = {'corey': 'Chris', 'john': 'Willam', 'lee': 'Lee'}
df_people['first'].replace(first_name_substitutions)

0     Chris
1    Willam
2       Lee
3       doe
Name: first, dtype: object

### Example on large data set

In [62]:
# Load the two survey CSV files (paths are relative to the notebook's directory).
# NOTE(review): the second file is presumably the column-description schema — confirm.
survey_path = '../../data-2019/survey_results_public.csv'
schema_path = '../../data-2019/survey_results_schema.csv'
df = pd.read_csv(survey_path)
schema_df = pd.read_csv(schema_path)

In [65]:
# Rename a single column; all other columns keep their names.
# Reassigning the result (instead of inplace=True) makes the data flow explicit
# and keeps the cell safe to re-run.
df = df.rename(columns={'ConvertedComp': 'SalaryUSD'})
df.columns

Index(['Respondent', 'MainBranch', 'Hobbyist', 'OpenSourcer', 'OpenSource',
       'Employment', 'Country', 'Student', 'EdLevel', 'UndergradMajor',
       'EduOther', 'OrgSize', 'DevType', 'YearsCode', 'Age1stCode',
       'YearsCodePro', 'CareerSat', 'JobSat', 'MgrIdiot', 'MgrMoney',
       'MgrWant', 'JobSeek', 'LastHireDate', 'LastInt', 'FizzBuzz',
       'JobFactors', 'ResumeUpdate', 'CurrencySymbol', 'CurrencyDesc',
       'CompTotal', 'CompFreq', 'SalaryUSD', 'WorkWeekHrs', 'WorkPlan',
       'WorkChallenge', 'WorkRemote', 'WorkLoc', 'ImpSyn', 'CodeRev',
       'CodeRevHrs', 'UnitTests', 'PurchaseHow', 'PurchaseWhat',
       'LanguageWorkedWith', 'LanguageDesireNextYear', 'DatabaseWorkedWith',
       'DatabaseDesireNextYear', 'PlatformWorkedWith',
       'PlatformDesireNextYear', 'WebFrameWorkedWith',
       'WebFrameDesireNextYear', 'MiscTechWorkedWith',
       'MiscTechDesireNextYear', 'DevEnviron', 'OpSys', 'Containers',
       'BlockchainOrg', 'BlockchainIs', 'BetterLife', 'I

In [66]:
# Inspect the raw 'Hobbyist' answers ('Yes' / 'No' strings) before converting them.
df['Hobbyist']

0        Yes
1         No
2        Yes
3         No
4        Yes
        ... 
88878    Yes
88879     No
88880     No
88881     No
88882    Yes
Name: Hobbyist, Length: 88883, dtype: object

In [69]:
# Convert the 'Yes' / 'No' answers to booleans.
# A plain Series.map(dict) turns every value without a dictionary entry into NaN,
# so running this cell a second time (when the column already holds True/False)
# would wipe the column. Falling back to the original value keeps the cell safe
# to re-run and leaves unexpected answers and missing values untouched.
yes_no_to_bool = {'Yes': True, 'No': False}
df['Hobbyist'] = df['Hobbyist'].map(lambda answer: yes_no_to_bool.get(answer, answer))