In [273]:
person = {
    "first": "Corey", 
    "last": "Schafer", 
    "email": "CoreyMSchafer@gmail.com"
}

In [274]:
people = {
    "first": ["Corey"], 
    "last": ["Schafer"], 
    "email": ["CoreyMSchafer@gmail.com"]
}

In [275]:
people = {
    "first": ["Corey", 'Jane', 'John'], 
    "last": ["Schafer", 'Doe', 'Doe'], 
    "email": ["CoreyMSchafer@gmail.com", 'JaneDoe@email.com', 'JohnDoe@email.com']
}

In [276]:
people['email']

['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com']

In [277]:
import pandas as pd 

In [278]:
df = pd.DataFrame(people)

In [279]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [280]:
df['email']

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
2          JohnDoe@email.com
Name: email, dtype: object

In [281]:
# A DataFrame can be indexed by column name much like a Python dict,
# but each column it hands back is a pandas Series rather than a plain list.
type(df['email'])

pandas.core.series.Series

In [282]:
# Attribute (dot) access is another way to select a column as a Series.
df.email

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
2          JohnDoe@email.com
Name: email, dtype: object

In [283]:
df[['last', 'email']]

Unnamed: 0,last,email
0,Schafer,CoreyMSchafer@gmail.com
1,Doe,JaneDoe@email.com
2,Doe,JohnDoe@email.com


In [284]:
df.columns

Index(['first', 'last', 'email'], dtype='object')

In [285]:
# iloc selects rows by integer position.
# Position 0 is the first row; a single row comes back as a Series.
df.iloc[0]

first                      Corey
last                     Schafer
email    CoreyMSchafer@gmail.com
Name: 0, dtype: object

In [286]:
# first two row of data frame 
df.iloc[[0,1]]

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com


In [287]:
# first two row and email col
df.iloc[[0,1], [2]]

Unnamed: 0,email
0,CoreyMSchafer@gmail.com
1,JaneDoe@email.com


In [288]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [289]:
df.loc[[0,1]]

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com


In [290]:
df.loc[[0,1], 'email']

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
Name: email, dtype: object

In [291]:
df.loc[[0,1], ['email', 'last']]

Unnamed: 0,email,last
0,CoreyMSchafer@gmail.com,Schafer
1,JaneDoe@email.com,Doe


In [292]:
df['email']

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
2          JohnDoe@email.com
Name: email, dtype: object

In [293]:
# Use the email column as the row index, so rows are labelled by email
# address instead of 0, 1, 2.
df = df.set_index('email')

In [294]:
df

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
CoreyMSchafer@gmail.com,Corey,Schafer
JaneDoe@email.com,Jane,Doe
JohnDoe@email.com,John,Doe


In [295]:
df.index

Index(['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com'], dtype='object', name='email')

In [296]:
df.loc['CoreyMSchafer@gmail.com', 'last']

'Schafer'

In [297]:
# df.loc[0] is an error now, because the index holds email labels rather than integers,
# but iloc still works since it selects by position.
df.iloc[0]

first      Corey
last     Schafer
Name: CoreyMSchafer@gmail.com, dtype: object

In [298]:
# Restore the default integer index; email becomes a regular column again.
df = df.reset_index()

In [299]:
df

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [300]:
df['last'] == 'Doe'

0    False
1     True
2     True
Name: last, dtype: bool

In [301]:
filt = df['last'] == 'Doe'

In [302]:
filt

0    False
1     True
2     True
Name: last, dtype: bool

In [303]:
#df[filt]

In [304]:
# or we can do filter in one line 
df[df['last'] == 'Doe']

Unnamed: 0,email,first,last
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [305]:
# A third way to filter: pass the boolean mask to .loc.
#df.loc[filt]
# .loc also accepts a column label, so this selects only the email column
# for the rows whose last name equals 'Doe'.
df.loc[filt, 'email']

1    JaneDoe@email.com
2    JohnDoe@email.com
Name: email, dtype: object

In [306]:
# & for and, | for or 
filt = (df['last'] == 'Doe') & (df['first'] == 'John')

In [307]:
df.loc[filt]

Unnamed: 0,email,first,last
2,JohnDoe@email.com,John,Doe


In [308]:
filt = (df['last'] == 'Schafer') | (df['first'] == 'John')
df.loc[filt]

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
2,JohnDoe@email.com,John,Doe


In [309]:
# negate the filt result  
df.loc[~filt]

Unnamed: 0,email,first,last
1,JaneDoe@email.com,Jane,Doe


In [310]:
df = pd.DataFrame(people)
df.columns

Index(['first', 'last', 'email'], dtype='object')

In [311]:
df.columns = ['first_name', 'last_name', 'email']
df

Unnamed: 0,first_name,last_name,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [312]:
df.columns = [x.lower() for x in df.columns]
df

Unnamed: 0,first_name,last_name,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [313]:
# Replace every space in the column names with '_'.
df.columns = df.columns.str.replace(' ', '_')
df

Unnamed: 0,first_name,last_name,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [314]:
df.rename(columns={'first_name': 'first', 'last_name': 'last'}, inplace=True)
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [315]:
# just change one entry
df.loc[2] = ['John', 'Smith', 'JhonSmith@email.com']

In [316]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Smith,JhonSmith@email.com


In [317]:
df.loc[2, ['last', 'email']] = ['Doe', 'JohnDoe@email.com']
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [318]:
df.loc[2, 'last'] = 'Smith'

In [319]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Smith,JohnDoe@email.com


In [320]:
df.at[2, 'last'] = 'Doe'
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [321]:
# Deliberate anti-pattern (chained indexing): df[filt] produces an intermediate
# object and the assignment is applied to that object, which is why pandas
# emits the warning shown below. Use df.loc[filt, 'last'] = ... instead, as
# the warning itself recommends.
filt = (df['email'] == 'JohnDoe@email.com')
df[filt]['last'] = 'Smith'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[filt]['last'] = 'Smith'


In [322]:
filt = (df['email'] == 'JohnDoe@email.com')
df.loc[filt, 'last'] = 'Smith'

In [323]:
# change more than one targets
df['email'] = df['email'].str.lower()

In [324]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschafer@gmail.com
1,Jane,Doe,janedoe@email.com
2,John,Smith,johndoe@email.com


In [325]:
# apply, map, applymap, replace:
# methods for transforming the values held in a Series or DataFrame.

# Length of each email address.
df['email'].apply(len)

0    23
1    17
2    17
Name: email, dtype: int64

In [326]:
def update_email(email):
    """Return ``email`` converted to upper case.

    Written as a plain one-argument function so it can be handed to
    ``Series.apply``, which calls it once per value.
    """
    uppercased = email.upper()
    return uppercased

In [327]:
df['email'].apply(update_email)

0    COREYMSCHAFER@GMAIL.COM
1          JANEDOE@EMAIL.COM
2          JOHNDOE@EMAIL.COM
Name: email, dtype: object

In [328]:
df['email'] = df['email'].apply(update_email)

In [329]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,COREYMSCHAFER@GMAIL.COM
1,Jane,Doe,JANEDOE@EMAIL.COM
2,John,Smith,JOHNDOE@EMAIL.COM


In [330]:
# A lambda can be passed to apply instead of a named function;
# this one lower-cases every email.
df['email'] = df['email'].apply(lambda x: x.lower())

In [331]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschafer@gmail.com
1,Jane,Doe,janedoe@email.com
2,John,Smith,johndoe@email.com


In [332]:
df['email'].apply(len)

0    23
1    17
2    17
Name: email, dtype: int64

In [333]:
# df.apply(len) calls len on each column, i.e. it counts down the rows.
df.apply(len, axis='columns') # axis='columns' calls len on each row, i.e. it counts across the columns

0    3
1    3
2    3
dtype: int64

In [334]:
len(df['email'])

3

In [335]:
df.apply(pd.Series.min)

first                      Corey
last                         Doe
email    coreymschafer@gmail.com
dtype: object

In [336]:
df.apply(lambda x: x.min())

first                      Corey
last                         Doe
email    coreymschafer@gmail.com
dtype: object

In [337]:
# applymap calls len on every individual cell of the DataFrame.
# NOTE(review): newer pandas releases deprecate applymap in favour of
# DataFrame.map; confirm against the installed version before renaming.
df.applymap(len)

Unnamed: 0,first,last,email
0,5,7,23
1,4,3,17
2,4,5,17


In [338]:
# str.lower only accepts strings, so this does not work if any cell holds a number.
df.applymap(str.lower)

Unnamed: 0,first,last,email
0,corey,schafer,coreymschafer@gmail.com
1,jane,doe,janedoe@email.com
2,john,smith,johndoe@email.com


In [339]:
# Substitute some values using a mapping.
df['first'].map({'Corey': 'Chris', 'Jane': 'Mary'})
# Only two of the three names appear in the mapping. map assigns NaN to any
# value that has no entry in the dict, which is why the last row is NaN:
# we did not give a replacement for the third name.

0    Chris
1     Mary
2      NaN
Name: first, dtype: object

In [340]:
# To substitute only two of the three values and keep the rest unchanged,
# use replace instead of map.
df['first'].replace({'Corey': 'Chris', 'Jane': 'Mary'})

0    Chris
1     Mary
2     John
Name: first, dtype: object

In [341]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschafer@gmail.com
1,Jane,Doe,janedoe@email.com
2,John,Smith,johndoe@email.com


In [342]:
df = pd.DataFrame(people)

In [343]:
df['first'] + ' ' + df['last']

0    Corey Schafer
1         Jane Doe
2         John Doe
dtype: object

In [344]:
# add new columns 
df['full_name'] = df['first'] + ' ' + df['last']

In [345]:
df

Unnamed: 0,first,last,email,full_name
0,Corey,Schafer,CoreyMSchafer@gmail.com,Corey Schafer
1,Jane,Doe,JaneDoe@email.com,Jane Doe
2,John,Doe,JohnDoe@email.com,John Doe


In [346]:
# The drop method removes columns.
df.drop(columns=['first', 'last'])
# This is not in place: it returns a new DataFrame and df keeps its columns.

Unnamed: 0,email,full_name
0,CoreyMSchafer@gmail.com,Corey Schafer
1,JaneDoe@email.com,Jane Doe
2,JohnDoe@email.com,John Doe


In [347]:
# Keep the result this time: rebind df to the frame without first/last.
df = df.drop(columns=['first', 'last'])

In [348]:
df

Unnamed: 0,email,full_name
0,CoreyMSchafer@gmail.com,Corey Schafer
1,JaneDoe@email.com,Jane Doe
2,JohnDoe@email.com,John Doe


In [349]:
# Reverse the drop: split full_name on the space. With expand=True the
# pieces come back as separate columns of a DataFrame.
df['full_name'].str.split(' ', expand=True)

Unnamed: 0,0,1
0,Corey,Schafer
1,Jane,Doe
2,John,Doe


In [350]:
# adding back first and last columns 
df[['first', 'last']] = df['full_name'].str.split(' ', expand=True)
df

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe


In [351]:
# Add a single row.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so wrap the
# new record in a one-row DataFrame and combine the two with pd.concat.
# ignore_index=True renumbers the result 0..n-1, and columns the new record
# does not provide (email, full_name, last) are filled with NaN.
# Nothing is assigned, so this only displays the result; df is unchanged.
pd.concat([df, pd.DataFrame([{'first': 'Tony'}])], ignore_index=True)

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe
3,,,Tony,


In [352]:
# Two extra records, used to demonstrate adding the rows of one frame to another.
people = dict(
    first=["Tony", "Steve"],
    last=["Stark", "Rogers"],
    email=["IronMan@avenge.com", "Cap@avenge.com"],
)
df2 = pd.DataFrame(data=people)

In [353]:
df2

Unnamed: 0,first,last,email
0,Tony,Stark,IronMan@avenge.com
1,Steve,Rogers,Cap@avenge.com


In [354]:
#df.append(df2, ignore_index=True)

In [355]:
# There is no in-place way to add rows: DataFrame.append had no inplace
# argument (and was removed in pandas 2.0), and pd.concat likewise returns a
# new frame, so rebind df to keep the combined result.
# ignore_index=True renumbers the rows 0..n-1; columns that df2 lacks
# (full_name) are filled with NaN for its rows.
df = pd.concat([df, df2], ignore_index=True)

In [356]:
df

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers


In [357]:
df.drop(index=4)

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe
3,IronMan@avenge.com,,Tony,Stark


In [358]:
# drop with filter 
filt = df['last'] == 'Doe'
df.drop(index=df[filt].index)

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers


In [359]:
# sorting 

In [360]:
people = {
    "first": ["Corey", 'Jane', 'John'], 
    "last": ["Schafer", 'Doe', 'Doe'], 
    "email": ["CoreyMSchafer@gmail.com", 'JaneDoe@email.com', 'JohnDoe@email.com']
}
df = pd.DataFrame(people)

In [361]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [362]:
df.sort_values(by='last', ascending=False)

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [363]:
# To sort on two attributes, pass a list of column names to `by`:
# rows are ordered by the first column, and ties are broken by the second.
# A single ascending=False applies to both columns.
df.sort_values(by=['last', 'first'], ascending=False)

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
2,John,Doe,JohnDoe@email.com
1,Jane,Doe,JaneDoe@email.com


In [364]:
# Next question: how to sort one attribute in ascending order and another in
# descending order? One more 'Doe' is added so the demo has a tie to break.
people = dict(
    first=['Corey', 'Jane', 'John', 'Adam'],
    last=['Schafer', 'Doe', 'Doe', 'Doe'],
    email=[
        'CoreyMSchafer@gmail.com',
        'JaneDoe@email.com',
        'JohnDoe@email.com',
        'A@email.com',
    ],
)
df = pd.DataFrame(data=people)

In [365]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Adam,Doe,A@email.com


In [366]:
# Sort by last name in descending order and, within equal last names, by
# first name in ascending order: each entry of `ascending` pairs with the
# column at the same position in `by`.
df = df.sort_values(by=['last', 'first'], ascending=[False, True])

In [367]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
3,Adam,Doe,A@email.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [368]:
# To get the rows back in index order, sort on the index.
# Nothing is assigned here, so this only displays the sorted frame.
df.sort_index()

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Adam,Doe,A@email.com


In [369]:
df['last'].sort_values()

3        Doe
1        Doe
2        Doe
0    Schafer
Name: last, dtype: object

In [370]:
# missing data 

In [380]:
import numpy as np

# Sample data mixing genuine missing values (np.nan / None) with the
# placeholder strings 'NA' and 'Missing'.
people = {
    "first": ["Corey", 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    "last": ["Schafer", 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'],
    "email": ["CoreyMSchafer@gmail.com", 'JaneDoe@email.com', 'JohnDoe@email.com', None, np.nan, 'Anonymous@email.com', 'NA'],
    "age": ['33', '55', '63', '36', None, None, 'Missing'],
}

# Handling customised missing-value markers such as 'NA': swap each
# placeholder for np.nan as soon as the data is loaded into the DataFrame.
df = (
    pd.DataFrame(people)
    .replace('NA', np.nan)
    .replace('Missing', np.nan)
)

In [381]:
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [382]:
# dropna using default arguments
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [383]:
# The same call as dropna() with its default arguments spelled out:
# axis: whether to drop rows ('index') or columns ('columns').
# how : 'any' drops every row that has at least one missing value;
#       'all' drops only the rows in which every value is missing.
df.dropna(axis='index', how='any')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [384]:
df.dropna(axis='index', how='all')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@email.com,


In [385]:
df.dropna(axis='index', how='any', subset=['email']) # keep rows with no email missing value

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
5,,,Anonymous@email.com,


In [386]:
df.dropna(axis='index', how='all', subset=['last','email'])
# last and email, at least one of them is not a missing value

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@email.com,


In [387]:
df.isna() # see if there is missing values

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [389]:
df.fillna('0')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,0,36
4,0,0,0,0
5,0,0,Anonymous@email.com,0
6,0,0,0,0


In [390]:
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [391]:
df['age'].mean()

TypeError: can only concatenate str (not "int") to str

In [395]:
# df['age'] = df['age'].astype(int)  --> error: the column still holds NaN,
# which cannot be converted to an integer. float can represent NaN, so cast
# to float instead.
df['age'] = df['age'].astype(float)

In [397]:
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [398]:
df['age'].mean()

46.75

In [399]:
# astype needs a target dtype; calling it with no argument raises the
# TypeError shown below.
df.astype()

TypeError: astype() missing 1 required positional argument: 'dtype'