In [1]:
import numpy as np
import pandas as pd

In [2]:
# Sample records stored column-wise: one list of values per field.
people = dict(
    first=["Corey", "Jane", "John"],
    last=["Schafer", "Doe", "Doe"],
    email=["CoreySchafer@gmail.com", "JaneDoe@email.com", "JohnDoe@email.com"],
)

In [3]:
# Plain dict lookup: returns the list stored under the 'first' key
people['first']

['Corey', 'Jane', 'John']

In [4]:
# Build a pandas DataFrame from the dictionary: each key becomes a column
# name and each list becomes that column's values
df = pd.DataFrame(people)
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [5]:
df['email']  # Access a single column by name; the result is a Series

0    CoreySchafer@gmail.com
1         JaneDoe@email.com
2         JohnDoe@email.com
Name: email, dtype: object

In [6]:
# Confirm that selecting a single column yields a Series
type(df['email'])

pandas.core.series.Series

In [7]:
# Select the first row by integer position
df.iloc[0]

first                     Corey
last                    Schafer
email    CoreySchafer@gmail.com
Name: 0, dtype: object

In [8]:
# A single row is also returned as a Series
type(df.iloc[0])

pandas.core.series.Series

In [9]:
df[["last", "email"]] # Pass a list of names to select several columns as a DataFrame

Unnamed: 0,last,email
0,Schafer,CoreySchafer@gmail.com
1,Doe,JaneDoe@email.com
2,Doe,JohnDoe@email.com


In [10]:
# List the column labels
df.columns

Index(['first', 'last', 'email'], dtype='object')

In [11]:
df.iloc[0] # Access a row by its integer position

first                     Corey
last                    Schafer
email    CoreySchafer@gmail.com
Name: 0, dtype: object

In [12]:
df.iloc[[0, 1]] # A list of positions selects multiple rows as a DataFrame

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com


In [13]:
df.iloc[[0, 1], [1, 2]] # Second argument selects columns, also by integer position

Unnamed: 0,last,email
0,Schafer,CoreySchafer@gmail.com
1,Doe,JaneDoe@email.com


In [14]:
# Label-based counterpart of the iloc call: rows by index label, columns by name
df.loc[[0, 1], ["last", "email"]]

Unnamed: 0,last,email
0,Schafer,CoreySchafer@gmail.com
1,Doe,JaneDoe@email.com


In [15]:
# Set index: use the 'email' column as the row labels (inplace=True mutates df)
df.set_index("email", inplace = True)  # Don't re-run this cell: 'email' is no longer a column afterwards
df

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
CoreySchafer@gmail.com,Corey,Schafer
JaneDoe@email.com,Jane,Doe
JohnDoe@email.com,John,Doe


In [16]:
# The index now holds the email addresses
df.index

Index(['CoreySchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com'], dtype='object', name='email')

In [17]:
# 'email' has moved out of the columns and into the index
df.columns

Index(['first', 'last'], dtype='object')

In [18]:
# With emails as the index, loc looks a row up by its email address
df.loc['CoreySchafer@gmail.com']

first      Corey
last     Schafer
Name: CoreySchafer@gmail.com, dtype: object

In [19]:
# Restore the default integer index; 'email' becomes a regular column again
df.reset_index(inplace = True)
df

Unnamed: 0,email,first,last
0,CoreySchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [20]:
# Filtering: build a boolean mask (one True/False per row), then index
# the frame with it to keep only the rows where the mask is True
filt = (df['last'] == 'Doe')
df[filt]

Unnamed: 0,email,first,last
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [21]:
# Alternatively, pass the same boolean mask to loc
df.loc[filt]

Unnamed: 0,email,first,last
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [22]:
# The mask selects the rows; the second argument restricts the result to one column
df.loc[filt, 'email']

1    JaneDoe@email.com
2    JohnDoe@email.com
Name: email, dtype: object

In [23]:
# Composite filters: & keeps rows where both conditions hold.
# Each comparison is parenthesised before the masks are combined.
filt = (df['last'] == 'Doe') & (df['first'] == 'John')
df.loc[filt]

Unnamed: 0,email,first,last
2,JohnDoe@email.com,John,Doe


In [24]:
# | keeps rows where at least one of the conditions holds
filt = (df['last'] == 'Schafer') | (df['first'] == 'John')
df.loc[filt]

Unnamed: 0,email,first,last
0,CoreySchafer@gmail.com,Corey,Schafer
2,JohnDoe@email.com,John,Doe


In [25]:
# Negate filter: ~ inverts the mask, selecting the rows it previously excluded
df.loc[~filt]

Unnamed: 0,email,first,last
1,JaneDoe@email.com,Jane,Doe


In [26]:
# Updating Columns and Rows
# Assigning a list to df.columns renames every column at once, by position
df.columns = ['email', 'first_name', 'last_name']
df

Unnamed: 0,email,first_name,last_name
0,CoreySchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [27]:
# Upper-case every column label using a list comprehension
df.columns = [label.upper() for label in df.columns]
df

Unnamed: 0,EMAIL,FIRST_NAME,LAST_NAME
0,CoreySchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [28]:
df.columns = df.columns.str.replace('_', ' ') # Replace underscores with spaces
df

Unnamed: 0,EMAIL,FIRST NAME,LAST NAME
0,CoreySchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [29]:
# Rename only the listed columns (old name -> new name); 'EMAIL' is untouched
df.rename(columns = {'FIRST NAME': 'first', 'LAST NAME': 'last'}, 
          inplace = True)
df

Unnamed: 0,EMAIL,first,last
0,CoreySchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [30]:
# Update several cells of one row: row label 2, columns 'last' and 'EMAIL'
df.loc[2, ['last', 'EMAIL']] = ['Smith', 'JohnSmith@email.com']
df

Unnamed: 0,EMAIL,first,last
0,CoreySchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnSmith@email.com,John,Smith


In [31]:
df.at[2, 'last'] = 'Doe' # Set a single cell; loc could be used here instead
df

Unnamed: 0,EMAIL,first,last
0,CoreySchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnSmith@email.com,John,Doe


In [32]:
# Lower-case every email through the .str accessor and store the result back
df['EMAIL'] = df['EMAIL'].str.lower()
df

Unnamed: 0,EMAIL,first,last
0,coreyschafer@gmail.com,Corey,Schafer
1,janedoe@email.com,Jane,Doe
2,johnsmith@email.com,John,Doe


In [33]:
# Four methods: apply(), map(), applymap(), replace()
# On a Series, apply() calls the function on each value (here: string length)
df['EMAIL'].apply(len)

0    22
1    17
2    19
Name: EMAIL, dtype: int64

In [34]:
def update_email(email):
    """Return ``email`` converted to upper case."""
    uppercased = email.upper()
    return uppercased

# Run update_email on every value of the column and store the result back
df['EMAIL'] = df['EMAIL'].apply(update_email)
df

Unnamed: 0,EMAIL,first,last
0,COREYSCHAFER@GMAIL.COM,Corey,Schafer
1,JANEDOE@EMAIL.COM,Jane,Doe
2,JOHNSMITH@EMAIL.COM,John,Doe


In [35]:
# Same idea with an anonymous function: lower-case each address
df['EMAIL'] = df['EMAIL'].apply(lambda address: address.lower())
df

Unnamed: 0,EMAIL,first,last
0,coreyschafer@gmail.com,Corey,Schafer
1,janedoe@email.com,Jane,Doe
2,johnsmith@email.com,John,Doe


In [36]:
df.apply(len) # The method can be applied to the entire dataframe
              # as well, but then len() receives each column (a Series),
              # so the result is the number of rows in each column.

EMAIL    3
first    3
last     3
dtype: int64

In [37]:
# axis='columns' passes each row instead, so len() counts the values per row
df.apply(len, axis = 'columns')

0    3
1    3
2    3
dtype: int64

In [38]:
# Minimum of each column; the values are strings, so they are compared lexicographically
df.apply(pd.Series.min)

EMAIL    coreyschafer@gmail.com
first                     Corey
last                        Doe
dtype: object

In [39]:
# Equivalent lambda form: x is each column (a Series)
df.apply(lambda x : x.min())

EMAIL    coreyschafer@gmail.com
first                     Corey
last                        Doe
dtype: object

In [40]:
# NOTE: pandas 2.1 deprecates DataFrame.applymap() in favour of DataFrame.map()
df.applymap(len)  # applymap() exists only on DataFrames and calls the function on every element

Unnamed: 0,EMAIL,first,last
0,22,5,7
1,17,4,3
2,19,4,3


In [41]:
# Lower-case every element; this works here because every cell holds a string
df.applymap(str.lower)

Unnamed: 0,EMAIL,first,last
0,coreyschafer@gmail.com,corey,schafer
1,janedoe@email.com,jane,doe
2,johnsmith@email.com,john,doe


In [42]:
# map() with a dict substitutes values; values missing from the dict become NaN
df['first'].map({'Corey': 'Chris', 'Jane': 'Mary'})

0    Chris
1     Mary
2      NaN
Name: first, dtype: object

In [43]:
# replace() substitutes the same way but leaves unmatched values unchanged
df['first'].replace({'Corey': 'Chris', 'Jane': 'Mary'})

0    Chris
1     Mary
2     John
Name: first, dtype: object

In [44]:
# Combining two columns: element-wise string concatenation creates 'full_name'
df['full_name'] = df['first'] + ' ' + df['last']
df

Unnamed: 0,EMAIL,first,last,full_name
0,coreyschafer@gmail.com,Corey,Schafer,Corey Schafer
1,janedoe@email.com,Jane,Doe,Jane Doe
2,johnsmith@email.com,John,Doe,John Doe


In [45]:
# Removing columns: drop 'first' and 'last' from df in place
df.drop(columns = ['first', 'last'], inplace = True)
df

Unnamed: 0,EMAIL,full_name
0,coreyschafer@gmail.com,Corey Schafer
1,janedoe@email.com,Jane Doe
2,johnsmith@email.com,John Doe


In [46]:
# Reversing the drop: split 'full_name' on the space; expand=True returns the
# pieces as separate columns, which are assigned back as 'first' and 'last'
df[['first', 'last']] = df['full_name'].str.split(' ', expand =  True)
df

Unnamed: 0,EMAIL,full_name,first,last
0,coreyschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@email.com,Jane Doe,Jane,Doe
2,johnsmith@email.com,John Doe,John,Doe


In [47]:
# Add a single row of data.
# DataFrame.append was removed in pandas 2.0, so wrap the record in a one-row
# frame and concatenate. Columns missing from the record are filled with NaN,
# and ignore_index=True renumbers the rows 0..n-1.
df = pd.concat([df, pd.DataFrame([{'first': 'Tony'}])], ignore_index=True)
df

Unnamed: 0,EMAIL,full_name,first,last
0,coreyschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@email.com,Jane Doe,Jane,Doe
2,johnsmith@email.com,John Doe,John,Doe
3,,,Tony,


In [48]:
# Concatenating two dataframes: build the second frame to add to df.
# Note the email column is spelled 'EMAIL' here.
people = dict(
    first=["Tony", "Steve"],
    last=["Stark", "Rogers"],
    EMAIL=["IronMan@avenge.com", "Cap@avenge.com"],
)
df2 = pd.DataFrame(data=people)

In [49]:
# Stack df2 underneath df. DataFrame.append was removed in pandas 2.0;
# pd.concat aligns the frames on column names and ignore_index=True
# renumbers the rows 0..n-1.
df = pd.concat([df, df2], ignore_index=True)
# Normalise the column labels to lower case
df.columns = df.columns.str.lower()
df

Unnamed: 0,email,full_name,first,last
0,coreyschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@email.com,Jane Doe,Jane,Doe
2,johnsmith@email.com,John Doe,John,Doe
3,,,Tony,
4,IronMan@avenge.com,,Tony,Stark
5,Cap@avenge.com,,Steve,Rogers


In [50]:
# Remove rows by index label (labels 3 and 5), in place
df.drop(index = [3, 5], inplace = True)
df

Unnamed: 0,email,full_name,first,last
0,coreyschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@email.com,Jane Doe,Jane,Doe
2,johnsmith@email.com,John Doe,John,Doe
4,IronMan@avenge.com,,Tony,Stark


In [51]:
# Remove rows matching a condition: filter the frame, take the index labels
# of the matching rows, and drop those labels
filt = df['last'] == 'Doe'
df.drop(index = df[filt].index, inplace = True)
df

Unnamed: 0,email,full_name,first,last
0,coreyschafer@gmail.com,Corey Schafer,Corey,Schafer
4,IronMan@avenge.com,,Tony,Stark


In [52]:
# Sort data: start again from a fresh four-person frame
people = dict(
    first=["Corey", "Jane", "John", "Adam"],
    last=["Schafer", "Doe", "Doe", "Doe"],
    email=[
        "CoreySchafer@gmail.com",
        "JaneDoe@email.com",
        "JohnDoe@email.com",
        "Adam@email.com",
    ],
)
df = pd.DataFrame(data=people)

In [53]:
# Sort by last name, descending (returns a sorted copy; df itself is unchanged)
df.sort_values(by = 'last', ascending = False)

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Adam,Doe,Adam@email.com


In [54]:
# Sort by last name, with first name breaking ties; both descending
df.sort_values(by = ['last', 'first'], ascending = False)

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreySchafer@gmail.com
2,John,Doe,JohnDoe@email.com
1,Jane,Doe,JaneDoe@email.com
3,Adam,Doe,Adam@email.com


In [55]:
# Sorting one column descending and another ascending:
# 'ascending' takes one flag per 'by' column. inplace=True reorders df itself.
df.sort_values(by = ['last', 'first'], ascending = [False, True],
               inplace = True)
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreySchafer@gmail.com
3,Adam,Doe,Adam@email.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [56]:
# Reverse the sort: sorting by index restores the original row order
df.sort_index(inplace = True)
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Adam,Doe,Adam@email.com


In [57]:
# Dealing with missing values
# Add a row that only has a first name, so 'last' and 'email' become NaN.
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
df = pd.concat([df, pd.DataFrame([{'first': 'Tony'}])], ignore_index=True)
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Adam,Doe,Adam@email.com
4,Tony,,


In [58]:
# Drop every row that contains at least one missing value (returns a copy)
df.dropna()

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Adam,Doe,Adam@email.com


In [60]:
# NOTE: only the last expression in a cell is displayed; the first two
# results are computed and then discarded.
df.dropna(axis='index', how='any') # Default parameters
df.dropna(axis='index', how='all') # Only drop rows where all values are NA
df.dropna(axis='columns', how='all') # Only drop columns where all values are NA

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Adam,Doe,Adam@email.com
4,Tony,,


In [61]:
# Drop rows in place, but only those where both 'email' and 'last' are missing
# (how='all' is evaluated over the subset columns only)
df.dropna(axis='index',
          how='all',
          subset=['email', 'last'],
          inplace=True)

In [62]:
# Dealing with NA values in a more complex dataframe
# (numpy is imported as np in the import cell at the top of the notebook)
# The data mixes several spellings of "missing": np.nan, None, and the
# literal placeholder strings 'NA' and 'Missing'. Ages are stored as strings.
people = {
    'first': ['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    'last': ['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan,
             'Missing'],
    'email': ['CoreySchafer@gmail.com', 'JaneDoe@email.com',
              'JohnDoe@email.com', None, np.nan,
              'Anonymous@email.com', 'Missing'],
    'age': ['33', '55', '63', '36', None, None, 'NA']
}
df = pd.DataFrame(people)
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreySchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,Missing,Missing,


In [63]:
# Convert the placeholder strings into real missing values, one at a time
for placeholder in ('NA', 'Missing'):
    df.replace(placeholder, np.nan, inplace=True)
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreySchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [64]:
# Keep only the rows that have no missing values
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreySchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [65]:
# Boolean frame of the same shape as df: True wherever a value is missing
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [70]:
# Replace every missing value with the placeholder string 'MISSING' (in place)
df.fillna('MISSING', inplace=True)

In [71]:
# Casting Data Types
# Every column, including 'age', is stored as object at this point
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [74]:
# Turn the 'MISSING' placeholder back into NaN so 'age' can be cast to a number.
# Assign the result back instead of calling replace(inplace=True) on df['age']:
# the in-place call acts on an intermediate Series and, under pandas
# Copy-on-Write, never updates df.
df['age'] = df['age'].replace('MISSING', np.nan)

In [75]:
# Convert 'age' from strings to float; float can represent the NaN entries
df['age'] = df['age'].astype(float)

In [78]:
# Average age; Series.mean() ignores NaN by default (skipna=True)
df['age'].mean()

46.75

In [77]:
# Confirm the cast: 'age' is now float64 while the other columns stay object
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object