In [1]:
import numpy as np
import pandas as pd

In [6]:
# Build a reproducible 5x4 frame of random draws,
# with rows labelled A-E and columns One-Four.
np.random.seed(50)  # fix the legacy global seed so the values are repeatable
row_labels = list('ABCDE')
column_names = ['One', 'Two', 'Three', 'Four']
df = pd.DataFrame(
    np.random.randn(len(row_labels), len(column_names)),
    index=row_labels,
    columns=column_names,
)
df

Unnamed: 0,One,Two,Three,Four
A,-1.560352,-0.030978,-0.620928,-1.46458
B,1.411946,-0.476732,-0.780469,1.070268
C,-1.282293,-1.327479,0.126338,0.862194
D,0.696737,-0.334565,-0.997526,1.598908
E,3.314075,0.98777,0.123866,0.742785


In [8]:
# add a new column 'Five' filled with five fresh random draws
# (assigning to a column label that does not exist yet creates the column)
df['Five'] = np.random.randn(5)

In [9]:
df

Unnamed: 0,One,Two,Three,Four,Five
A,-1.560352,-0.030978,-0.620928,-1.46458,0.222244
B,1.411946,-0.476732,-0.780469,1.070268,0.251814
C,-1.282293,-1.327479,0.126338,0.862194,0.707926
D,0.696737,-0.334565,-0.997526,1.598908,0.49399
E,3.314075,0.98777,0.123866,0.742785,1.471002


In [10]:
type(df['Five'])

pandas.core.series.Series

In [12]:
# drop() hands back a new frame without 'Five'; since the result is not
# assigned, df itself still contains the column afterwards.
df.drop(columns='Five')

Unnamed: 0,One,Two,Three,Four
A,-1.560352,-0.030978,-0.620928,-1.46458
B,1.411946,-0.476732,-0.780469,1.070268
C,-1.282293,-1.327479,0.126338,0.862194
D,0.696737,-0.334565,-0.997526,1.598908
E,3.314075,0.98777,0.123866,0.742785


In [13]:
df

Unnamed: 0,One,Two,Three,Four,Five
A,-1.560352,-0.030978,-0.620928,-1.46458,0.222244
B,1.411946,-0.476732,-0.780469,1.070268,0.251814
C,-1.282293,-1.327479,0.126338,0.862194,0.707926
D,0.696737,-0.334565,-0.997526,1.598908,0.49399
E,3.314075,0.98777,0.123866,0.742785,1.471002


In [14]:
# Remove 'Five' for good by rebinding df to the frame that drop() returns.
# Same end state as inplace=True, but the data flow is explicit and the call
# stays chainable.
df = df.drop(columns='Five')

In [15]:
df

Unnamed: 0,One,Two,Three,Four
A,-1.560352,-0.030978,-0.620928,-1.46458
B,1.411946,-0.476732,-0.780469,1.070268
C,-1.282293,-1.327479,0.126338,0.862194
D,0.696737,-0.334565,-0.997526,1.598908
E,3.314075,0.98777,0.123866,0.742785


#### Indexing in pandas DataFrame
- The index of a DataFrame works like an address: each row's label is what you use to locate that row.

In [16]:
# Small vehicle table for the indexing examples: one list per column,
# with position i across the lists describing the same car.
cars = dict(
    NoPlate=['YT2332', 'YT5673', 'AC90C3'],
    Make=['Honda', 'Ford', 'BMW'],
    Year=[2022, 2012, 2023],
)
df = pd.DataFrame(data=cars)
df

Unnamed: 0,NoPlate,Make,Year
0,YT2332,Honda,2022
1,YT5673,Ford,2012
2,AC90C3,BMW,2023


In [17]:
# accessing column names
df.columns

Index(['NoPlate', 'Make', 'Year'], dtype='object')

In [18]:
# index
df.index

RangeIndex(start=0, stop=3, step=1)

In [21]:
# Promote the 'Make' column to the row index. Rebinding df (rather than
# inplace=True) keeps the transformation explicit and chainable.
df = df.set_index('Make')

In [22]:
df

Unnamed: 0_level_0,NoPlate,Year
Make,Unnamed: 1_level_1,Unnamed: 2_level_1
Honda,YT2332,2022
Ford,YT5673,2012
BMW,AC90C3,2023


In [25]:
# Order the rows by their index labels (ascending) and keep the sorted
# frame by rebinding df instead of mutating it in place.
df = df.sort_index()

In [26]:
df

Unnamed: 0_level_0,NoPlate,Year
Make,Unnamed: 1_level_1,Unnamed: 2_level_1
BMW,AC90C3,2023
Ford,YT5673,2012
Honda,YT2332,2022


In [27]:
# Preview df with the 'Make' index moved back into an ordinary column and a
# fresh 0..n-1 index. The result is not assigned, so df keeps 'Make' as its index.
df.reset_index(drop=False)

Unnamed: 0,Make,NoPlate,Year
0,BMW,AC90C3,2023
1,Ford,YT5673,2012
2,Honda,YT2332,2022


In [28]:
# Rename the 'NoPlate' column to 'Plate'; rebinding df keeps the renamed
# frame without relying on in-place mutation.
df = df.rename(columns={'NoPlate': 'Plate'})

In [29]:
df

Unnamed: 0_level_0,Plate,Year
Make,Unnamed: 1_level_1,Unnamed: 2_level_1
BMW,AC90C3,2023
Ford,YT5673,2012
Honda,YT2332,2022


In [31]:
# df currently holds the cars table, so rebuild the seeded random frame
# (rows A-E, columns One-Four) for the conditional-selection examples.
# Re-seeding with 50 reproduces the same values as the first frame.
np.random.seed(50)
df = pd.DataFrame(data = np.random.randn(5,4), index = 'A B C D E'.split(), columns = ['One', 'Two', 'Three', 'Four'])
df

Unnamed: 0,One,Two,Three,Four
A,-1.560352,-0.030978,-0.620928,-1.46458
B,1.411946,-0.476732,-0.780469,1.070268
C,-1.282293,-1.327479,0.126338,0.862194
D,0.696737,-0.334565,-0.997526,1.598908
E,3.314075,0.98777,0.123866,0.742785


In [35]:
# conditional selection: indexing with a boolean frame keeps the cells where
# the mask is True and shows every other cell as missing (NaN)
is_negative = df < 0
df[is_negative]

Unnamed: 0,One,Two,Three,Four
A,-1.560352,-0.030978,-0.620928,-1.46458
B,,-0.476732,-0.780469,
C,-1.282293,-1.327479,,
D,,-0.334565,-0.997526,
E,,,,


In [36]:
# Row filter: keep only the rows whose 'Three' value is negative
# (a boolean Series as the key selects whole rows, not single cells)
three_is_negative = df['Three'] < 0
df[three_is_negative]

Unnamed: 0,One,Two,Three,Four
A,-1.560352,-0.030978,-0.620928,-1.46458
B,1.411946,-0.476732,-0.780469,1.070268
D,0.696737,-0.334565,-0.997526,1.598908


In [38]:
# Same row filter as the previous cell, spelled with the explicit .loc indexer
df.loc[df['Three'] < 0]

Unnamed: 0,One,Two,Three,Four
A,-1.560352,-0.030978,-0.620928,-1.46458
B,1.411946,-0.476732,-0.780469,1.070268
D,0.696737,-0.334565,-0.997526,1.598908


In [39]:
df

Unnamed: 0,One,Two,Three,Four
A,-1.560352,-0.030978,-0.620928,-1.46458
B,1.411946,-0.476732,-0.780469,1.070268
C,-1.282293,-1.327479,0.126338,0.862194
D,0.696737,-0.334565,-0.997526,1.598908
E,3.314075,0.98777,0.123866,0.742785


In [None]:
# Show the values of column 'Two' for the rows where column 'Four' is less than 0.
# A single .loc[rows, column] call replaces the chained df['Two'][mask] lookup.
df.loc[df['Four'] < 0, 'Two']

In [41]:
# Values of 'Two' (as a Series) for the rows where 'Four' is negative,
# selected in one .loc[rows, column] step instead of chained indexing.
df.loc[df['Four'] < 0, 'Two']

A   -0.030978
Name: Two, dtype: float64

In [49]:
# Same selection, but passing the column as a list keeps the result a
# one-column DataFrame rather than a Series.
df.loc[df['Four'] < 0, ['Two']]

Unnamed: 0,Two
A,-0.030978


In [46]:
# Show the values of columns 'Two' and 'One' for the rows where column 'Four' is less than 0.
# Rows and columns are chosen together with .loc instead of chaining two lookups.
df.loc[df['Four'] < 0, ['Two', 'One']]

Unnamed: 0,Two,One
A,-0.030978,-1.560352


In [51]:
# Rebuild the seeded random frame (rows A-E, columns One-Four) before the
# multi-condition examples.
# NOTE(review): no visible cell has modified df since it was last rebuilt, so
# this reset looks redundant — confirm before removing.
np.random.seed(50)
df = pd.DataFrame(data = np.random.randn(5,4), index = 'A B C D E'.split(), columns = ['One', 'Two', 'Three', 'Four'])
df

Unnamed: 0,One,Two,Three,Four
A,-1.560352,-0.030978,-0.620928,-1.46458
B,1.411946,-0.476732,-0.780469,1.070268
C,-1.282293,-1.327479,0.126338,0.862194
D,0.696737,-0.334565,-0.997526,1.598908
E,3.314075,0.98777,0.123866,0.742785


In [55]:
# Show the values of 'One' and 'Four' when 'Two' is greater than 0 and 'Three' is less than 0.
# Each comparison is parenthesised because & binds tighter than > and <;
# .loc then picks the matching rows and the two columns in a single step.
df.loc[(df['Two'] > 0) & (df['Three'] < 0), ['One', 'Four']]

Unnamed: 0,One,Four


In [59]:
# 'One' and 'Four' for the rows where 'Two' is negative and 'Three' is positive,
# selected with one .loc[rows, columns] call instead of chained indexing.
df.loc[(df['Two'] < 0) & (df['Three'] > 0), ['One', 'Four']]

Unnamed: 0,One,Four
C,-1.282293,0.862194


### Apply a function

In [60]:
# Raw student records. The last GPA arrives as the string '3.65' rather than
# a float; left as-is, that would make the whole GPA column dtype object.
students = {
    'Name': ['Nadia, I.', 'Hanna, J.', 'Alissa, K.', 'Lisa, A.', 'Aisha, B'],
    'ID': ['1233','1234','1235','1236','1237'],
    'Age': [21,23,23,25,24],
    'GPA': [3.88, 3.64, 3.97, 3.45, '3.65'],
    'Loan': ['No', 'Yes', 'Yes', 'No', 'No'],
}

# Coerce GPA to float at construction so numeric comparisons and aggregations
# work on the column directly; the raw dict above is left untouched.
df = pd.DataFrame(students).astype({'GPA': float})
df

Unnamed: 0,Name,ID,Age,GPA,Loan
0,"Nadia, I.",1233,21,3.88,No
1,"Hanna, J.",1234,23,3.64,Yes
2,"Alissa, K.",1235,23,3.97,Yes
3,"Lisa, A.",1236,25,3.45,No
4,"Aisha, B",1237,24,3.65,No


In [62]:
# Encode the 'Loan' attribute numerically: 'No' -> 0 and 'Yes' -> 1.
# map() turns any value that is not a key of the dict into NaN, so this cell
# is meant to run once on the freshly built frame.
loan_codes = {'No': 0, 'Yes': 1}
df['Loan'] = df['Loan'].map(loan_codes)
df

Unnamed: 0,Name,ID,Age,GPA,Loan
0,"Nadia, I.",1233,21,3.88,0
1,"Hanna, J.",1234,23,3.64,1
2,"Alissa, K.",1235,23,3.97,1
3,"Lisa, A.",1236,25,3.45,0
4,"Aisha, B",1237,24,3.65,0


In [66]:
# quick check on one sample value: the text before the first comma is the part to keep
'Nadia, I.'.split(',')[0]

'Nadia'

In [69]:
def getFirstName(name: str) -> str:
    """Return the portion of ``name`` that precedes its first comma.

    The whole string is returned unchanged when it contains no comma,
    e.g. ``'Nadia, I.'`` -> ``'Nadia'``.
    """
    before_comma, _, _ = name.partition(',')
    return before_comma

# derive a 'First Name' column by running getFirstName on every value of 'Name'
df['First Name'] = df['Name'].apply(getFirstName)
df

Unnamed: 0,Name,ID,Age,GPA,Loan,First Name
0,"Nadia, I.",1233,21,3.88,0,Nadia
1,"Hanna, J.",1234,23,3.64,1,Hanna
2,"Alissa, K.",1235,23,3.97,1,Alissa
3,"Lisa, A.",1236,25,3.45,0,Lisa
4,"Aisha, B",1237,24,3.65,0,Aisha


In [73]:
# create another column in df named 'Interview': 'Yes' if GPA >= 3.5, otherwise 'No'.
# The test is written as ">= threshold" so that a missing GPA (NaN), which
# fails every comparison, yields 'No' instead of silently qualifying.
# float() is kept because the raw GPA values may include numeric strings.
MIN_INTERVIEW_GPA = 3.5
df['Interview'] = df['GPA'].apply(
    lambda gpa: 'Yes' if float(gpa) >= MIN_INTERVIEW_GPA else 'No'
)
df

Unnamed: 0,Name,ID,Age,GPA,Loan,First Name,Interview
0,"Nadia, I.",1233,21,3.88,0,Nadia,Yes
1,"Hanna, J.",1234,23,3.64,1,Hanna,Yes
2,"Alissa, K.",1235,23,3.97,1,Alissa,Yes
3,"Lisa, A.",1236,25,3.45,0,Lisa,No
4,"Aisha, B",1237,24,3.65,0,Aisha,Yes
