In [1]:
# Loading a Sample Pandas DataFrame
import pandas as pd
df = pd.DataFrame({
    'name': ['James', 'Jane', 'Melissa', 'Ed', 'Neil'],
    'age': [30, 40, 32, 67, 43],
    'score': ['90%', '95%', '100%', '82%', '87%'],
    'age_missing_data': [30, 40, 32, 67, None],
    'income':[100000, 80000, 55000, 62000, 120000]
})
print(df)

      name  age score  age_missing_data  income
0    James   30   90%              30.0  100000
1     Jane   40   95%              40.0   80000
2  Melissa   32  100%              32.0   55000
3       Ed   67   82%              67.0   62000
4     Neil   43   87%               NaN  120000


In [5]:
# vectorized fcns are fcns which are optimized to work in parallel on data

# Creating a dictionary of genders
genders = {'James': 'Male', 'Jane': 'Female', 'Melissa': 'Female', 'Ed': 'Male', 'Neil': 'Male'}
# map with dict
df['gender'] = df['name'].map(genders)
print(df)

mean_income = df['income'].mean()

# map with fcn
df['more_than_avg_income'] = df['income'].map(lambda x: x > mean_income)
print(df)


      name  age score  age_missing_data  income  gender  more_than_avg_income
0    James   30   90%              30.0  100000    Male                  True
1     Jane   40   95%              40.0   80000  Female                 False
2  Melissa   32  100%              32.0   55000  Female                 False
3       Ed   67   82%              67.0   62000    Male                 False
4     Neil   43   87%               NaN  120000    Male                  True
      name  age score  age_missing_data  income  gender  more_than_avg_income
0    James   30   90%              30.0  100000    Male                  True
1     Jane   40   95%              40.0   80000  Female                 False
2  Melissa   32  100%              32.0   55000  Female                 False
3       Ed   67   82%              67.0   62000    Male                 False
4     Neil   43   87%               NaN  120000    Male                  True


In [17]:
last_names = pd.Series(['Doe', 'Miller', 'Edwards', 'Nelson', 'Raul'], index=['James', 'Jane', 'Melissa', 'Ed', 'Neil'])
df['last_name'] = df['name'].map(last_names)
print(df)

      name  age score  age_missing_data  income  gender  more_than_avg_income  \
0    James   30   90%              30.0  100000    Male                  True   
1     Jane   40   95%              40.0   80000  Female                 False   
2  Melissa   32  100%              32.0   55000  Female                 False   
3       Ed   67   82%              67.0   62000    Male                 False   
4     Neil   43   87%               NaN  120000    Male                  True   

  last_name  
0       Doe  
1    Miller  
2   Edwards  
3    Nelson  
4      Raul  


In [25]:
# apply() fcn 

#only works with callable as argument

def interview(row):
    return row['age'] < 45 and row['income'] > 7500

df['interview'] = df.apply(interview, axis=1)

print(df)

def bonus(row, amount, give=False):
    if give:
        return row['income'] / row['age'] * amount
    else:
        return False
    
df['bonus'] = df.apply(bonus, args=(0.25,), give=True, axis=1)

print(df)


      name  age score  age_missing_data  income  gender  more_than_avg_income  \
0    James   30   90%              30.0  100000    Male                  True   
1     Jane   40   95%              40.0   80000  Female                 False   
2  Melissa   32  100%              32.0   55000  Female                 False   
3       Ed   67   82%              67.0   62000    Male                 False   
4     Neil   43   87%               NaN  120000    Male                  True   

  last_name  interview bonuse bonus  
0       Doe       True   None  None  
1    Miller       True   None  None  
2   Edwards       True   None  None  
3    Nelson      False   None  None  
4      Raul       True   None  None  
      name  age score  age_missing_data  income  gender  more_than_avg_income  \
0    James   30   90%              30.0  100000    Male                  True   
1     Jane   40   95%              40.0   80000  Female                 False   
2  Melissa   32  100%              32.0   

In [27]:
# vectorized so faster way


df['more_than_avg_income'] = df['income'] > mean_income
print(df)

      name  age score  age_missing_data  income  gender  more_than_avg_income  \
0    James   30   90%              30.0  100000    Male                  True   
1     Jane   40   95%              40.0   80000  Female                 False   
2  Melissa   32  100%              32.0   55000  Female                 False   
3       Ed   67   82%              67.0   62000    Male                 False   
4     Neil   43   87%               NaN  120000    Male                  True   

  last_name  interview bonuse       bonus  
0       Doe       True   None  833.333333  
1    Miller       True   None  500.000000  
2   Edwards       True   None  429.687500  
3    Nelson      False   None  231.343284  
4      Raul       True   None  697.674419  


In [31]:
df.drop('bonuse', axis=1)

Unnamed: 0,name,age,score,age_missing_data,income,gender,more_than_avg_income,last_name,interview,bonus
0,James,30,90%,30.0,100000,Male,True,Doe,True,833.333333
1,Jane,40,95%,40.0,80000,Female,False,Miller,True,500.0
2,Melissa,32,100%,32.0,55000,Female,False,Edwards,True,429.6875
3,Ed,67,82%,67.0,62000,Male,False,Nelson,False,231.343284
4,Neil,43,87%,,120000,Male,True,Raul,True,697.674419


In [44]:
df['score_ratio'] = df['score'].map(lambda x: int(x[0:-1])/100)
df


Unnamed: 0,name,age,score,age_missing_data,income,gender,more_than_avg_income,last_name,interview,bonuse,bonus,score_ratio
0,James,30,90%,30.0,100000,Male,True,Doe,True,,833.333333,0.9
1,Jane,40,95%,40.0,80000,Female,False,Miller,True,,500.0,0.95
2,Melissa,32,100%,32.0,55000,Female,False,Edwards,True,,429.6875,1.0
3,Ed,67,82%,67.0,62000,Male,False,Nelson,False,,231.343284,0.82
4,Neil,43,87%,,120000,Male,True,Raul,True,,697.674419,0.87
