![alt text](pandas.png "Title")

In [1]:
import pandas as pd

# Dataframes: basic alterations

## Test data

In [2]:
data     = {'gender': ['M', 'F', 'F'], 'age': [20, 25, 23]}
patients = [10010, 10011, 10012]

In [3]:
def create_df(data: dict, patients: list):

    df = pd.DataFrame(data, index= patients, columns=['age', 'gender', 'race'])
    return df
    
df= create_df(data, patients)
df

Unnamed: 0,age,gender,race
10010,20,M,
10011,25,F,
10012,23,F,


## Delete a column

In [4]:
# in-place removal of a df column
del df['race']
df

Unnamed: 0,age,gender
10010,20,M
10011,25,F
10012,23,F


In [5]:
# alternatively, using drop() on the column axis (axis=1)
df = pd.DataFrame(data, index= patients, columns=['age', 'gender', 'race'])
df = df.drop(['race'], axis = 1)
df

Unnamed: 0,age,gender
10010,20,M
10011,25,F
10012,23,F


## Rename columns

In [6]:
# in-place edit of the 'columns' attribute:
df.columns = ['age', 'sex']
df

Unnamed: 0,age,sex
10010,20,M
10011,25,F
10012,23,F


In [7]:
# We could also pass a dictionnary of the columns we want to rename. 
df.rename(columns={'age': 'agen'})                  # this is not in-place

# df.rename(columns={'age': 'agen'}, inplace=True)  # this is in-place

Unnamed: 0,agen,sex
10010,20,M
10011,25,F
10012,23,F


## Change columns order

In [8]:
# Get the list of df columns (following order found in the df):
cols = df.columns # this is an iterable. We can make a list out of it: df.columns.to_list() or list(df.columns)
list(cols)


['age', 'sex']

In [9]:
# Change the order, e.g. reverting order
df = df[ cols[::-1] ] 
df

Unnamed: 0,sex,age
10010,M,20
10011,F,25
10012,F,23


In [12]:
df [ [list of columns] ]

SyntaxError: invalid syntax. Perhaps you forgot a comma? (3357459297.py, line 1)

## Add new rows

In [13]:
# We can use loc with a new index value, passing a list of value for each column.
df = pd.DataFrame(data, index= patients, columns=['age', 'gender'])
df

Unnamed: 0,age,gender
10010,20,M
10011,25,F
10012,23,F


In [14]:
df.loc[10013] = [40, 'M']
df

Unnamed: 0,age,gender
10010,20,M
10011,25,F
10012,23,F
10013,40,M


In [15]:
# that doesn't work because this syntax implies we are using a Series to create a new column and length isn't as expected. 
df[10015] = [50, 'F']
df

ValueError: Length of values (2) does not match length of index (4)

In [16]:
# see for yourself
df[10015] = [50, 'F', 'test', 'test2']
df

Unnamed: 0,age,gender,10015
10010,20,M,50
10011,25,F,F
10012,23,F,test
10013,40,M,test2


In [17]:
# In case we are not using a labelled index:
df = pd.DataFrame(data, columns=['age', 'gender'])
df

Unnamed: 0,age,gender
0,20,M
1,25,F
2,23,F


In [18]:
# Number of rows in our df:
n_rows = len(df)

# Add a new row:
df.loc[n_rows] = [50, 'F']

df

Unnamed: 0,age,gender
0,20,M
1,25,F
2,23,F
3,50,F


In [19]:
# Alternatively, we can use the concat() method, which concatenates 2 dataframes.
# You concatenate dataframes/series along a particular axis, with many options (handling indexes, missings, mismatch columns...)

new_row = pd.DataFrame(data=[[48, 'M']], columns=['age', 'gender'] ,index= [4])
new_row

Unnamed: 0,age,gender
4,48,M


https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html

In [20]:
df = pd.concat([df, new_row])
df

Unnamed: 0,age,gender
0,20,M
1,25,F
2,23,F
3,50,F
4,48,M


## Delete rows

In [21]:
df2 = pd.DataFrame(data, index= patients, columns=['age', 'gender', 'race'])
df2

Unnamed: 0,age,gender,race
10010,20,M,
10011,25,F,
10012,23,F,


In [22]:
# We can use drop(), this time on the row axis (the default)
for subjid in [10011, 10012]:
    df2.drop([subjid], inplace=True) # axis=0
df2

Unnamed: 0,age,gender,race
10010,20,M,


## Changing values

In [23]:
df=create_df(data, patients)
df

Unnamed: 0,age,gender,race
10010,20,M,
10011,25,F,
10012,23,F,


In [24]:
# In-place modification of a single value using the loc method and the coordinates:
df.loc[10010, 'age'] = 80
df

Unnamed: 0,age,gender,race
10010,80,M,
10011,25,F,
10012,23,F,


In [25]:
# Alternatively, you can use another df to update the values in a df. Update is done with matching index values.
new_row = pd.DataFrame({'gender': ['F'], 'age': [77]}, index= [10010], columns=['age', 'gender'])
new_row

Unnamed: 0,age,gender
10010,77,F


In [26]:
df.update(new_row)
df

Unnamed: 0,age,gender,race
10010,77,F,
10011,25,F,
10012,23,F,


https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.update.html

In [27]:
# You can create or recreate a column by broadcasting a string to the whole Series
df['test'] = 'Test'
df

Unnamed: 0,age,gender,race,test
10010,77,F,,Test
10011,25,F,,Test
10012,23,F,,Test


In [28]:
# Applying a vector to do some maths, element-wise
df['months'] = df.age * 12
df

Unnamed: 0,age,gender,race,test,months
10010,77,F,,Test,924
10011,25,F,,Test,300
10012,23,F,,Test,276


In [29]:
# Using pandas map() to apply a small transformation
gender = {'M': 'Male', 'F': 'Female'}
df['gender_long'] = df['gender'].map(lambda x: gender[x])
df

Unnamed: 0,age,gender,race,test,months,gender_long
10010,77,F,,Test,924,Female
10011,25,F,,Test,300,Female
10012,23,F,,Test,276,Female


## Sorting by values

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html

In [30]:
# sort_values() is not in-place
df = df.sort_values('gender')

# in-plac sorting, with more options
df.sort_values(['gender','age'], inplace=True, ascending=[True, False])
df

Unnamed: 0,age,gender,race,test,months,gender_long
10010,77,F,,Test,924,Female
10011,25,F,,Test,300,Female
10012,23,F,,Test,276,Female


__________________________________________________
Nicolas Dupuis, Methodology and Innovation (IDAR C&SP), 2020+