In [1]:
import pandas as pd

# Build the small example table used throughout this notebook.
# Each tuple is one person; `columns` names the fields in the same order.
df = pd.DataFrame(
    data=[
        ('john', 23, 'iowa', 2, 0),
        ('mary', 78, 'dc', 2, 4),
        ('peter', 22, 'california', 0, 0),
        ('jeff', 19, 'texas', 1, 5),
        ('bill', 45, 'washington', 2, 0),
        ('lisa', 33, 'dc', 1, 0),
    ],
    columns=['name', 'age', 'state', 'num_children', 'num_pets'],
)

In [2]:
# Display the whole DataFrame (it only has six rows, so showing all of it is fine)
df

Unnamed: 0,name,age,state,num_children,num_pets
0,john,23,iowa,2,0
1,mary,78,dc,2,4
2,peter,22,california,0,0
3,jeff,19,texas,1,5
4,bill,45,washington,2,0
5,lisa,33,dc,1,0


In [3]:
# The column labels, returned as a pandas Index object
df.columns

Index(['name', 'age', 'state', 'num_children', 'num_pets'], dtype='object')

In [4]:
# The same column labels converted to a plain Python list
list(df.columns)

['name', 'age', 'state', 'num_children', 'num_pets']

In [5]:
# Passing a list of column names selects just those columns (the result is still a DataFrame)
df[['name', 'age', 'state']]

Unnamed: 0,name,age,state
0,john,23,iowa
1,mary,78,dc
2,peter,22,california
3,jeff,19,texas
4,bill,45,washington
5,lisa,33,dc


In [6]:
# To pull out only the names and ages for a particular range of rows,
# slice with .loc: row labels first, then the list of columns.
# Note that a .loc label slice includes its end label, so 2:4 returns rows 2, 3 and 4.

df.loc[2:4, ['name', 'age']]

Unnamed: 0,name,age
2,peter,22
3,jeff,19
4,bill,45


In [7]:
# To select entire rows by integer position, use .iloc.
# The end of an .iloc slice is excluded, so :2 returns the rows at positions 0 and 1.
df.iloc[:2]

Unnamed: 0,name,age,state,num_children,num_pets
0,john,23,iowa,2,0
1,mary,78,dc,2,4


In [8]:
# df.iloc[:2] is the same thing as df.head(2).
# But head() always starts at the first row, while iloc can select any range.

# Here we can get the bottom two rows by doing
df.iloc[-2:]
# or
df.tail(2)
# (Both return the same rows; only the last expression in a cell is displayed.)

Unnamed: 0,name,age,state,num_children,num_pets
4,bill,45,washington,2,0
5,lisa,33,dc,1,0


In [9]:
# To get the first column of the dataframe without referring to it by name:
df.iloc[:, 0]
# This reads as: all rows (:), but only the column at position 0.
# A single column comes back as a Series rather than a DataFrame.

0     john
1     mary
2    peter
3     jeff
4     bill
5     lisa
Name: name, dtype: object

In [10]:
# You can also filter a dataframe with a boolean condition.
# Here we keep every row whose age is greater than 30:
df[df['age'] > 30]

# Can also be written as df[df.age > 30]

Unnamed: 0,name,age,state,num_children,num_pets
1,mary,78,dc,2,4
4,bill,45,washington,2,0
5,lisa,33,dc,1,0


In [11]:
# Two columns can be compared with each other:
# this keeps each row where the person has more pets than children
df[df.num_pets > df.num_children]

Unnamed: 0,name,age,state,num_children,num_pets
1,mary,78,dc,2,4
3,jeff,19,texas,1,5


In [23]:
# Get the rows where the person is older than 40 AND owns at least one pet.
# Conditions are combined with & and each one must be wrapped in parentheses.
df[ (df.age > 40) & (df.num_pets > 0) ]

Unnamed: 0,name,age,state,num_children,num_pets
1,mary,78,dc,2,4


In [24]:
# Get the rows where the person is older than 40 OR owns at least one pet (| is the element-wise "or")
df[ (df.age > 40) | (df.num_pets > 0) ]

Unnamed: 0,name,age,state,num_children,num_pets
1,mary,78,dc,2,4
3,jeff,19,texas,1,5
4,bill,45,washington,2,0


In [27]:
# What if we didn't care about name and age?
# drop() returns a new dataframe without those columns; df itself is left unchanged.

df.drop(columns=['name','age'])

Unnamed: 0,state,num_children,num_pets
0,iowa,2,0
1,dc,2,4
2,california,0,0
3,texas,1,5
4,washington,2,0
5,dc,1,0


In [29]:
# What if we wanted to leave out rows 2 and 4?
# Passing a list of index labels to drop() removes those rows from the result.

df.drop([2,4])

Unnamed: 0,name,age,state,num_children,num_pets
0,john,23,iowa,2,0
1,mary,78,dc,2,4
3,jeff,19,texas,1,5
5,lisa,33,dc,1,0


In [33]:
# How would we find the average of age, num_pets, and num_children?
# describe() reports summary statistics (including the mean) for the numeric columns.

df.describe()

Unnamed: 0,age,num_children,num_pets
count,6.0,6.0,6.0
mean,36.666667,1.333333,1.5
std,22.384518,0.816497,2.345208
min,19.0,0.0,0.0
25%,22.25,1.0,0.0
50%,28.0,1.5,0.0
75%,42.0,2.0,3.0
max,78.0,2.0,5.0


In [40]:
# describe() works, but it gave us way more than we asked for.
# .mean() returns just the average of each column.
# numeric_only=True is needed because df also holds string columns (name, state):
# pandas 2.0+ raises a TypeError on them instead of silently skipping them.
df.mean(numeric_only=True)

age             36.666667
num_children     1.333333
num_pets         1.500000
dtype: float64

In [41]:
# And what if I only wanted the average of age?
# We can select just the columns we want before calling .mean()
df[['age']].mean()

age    36.666667
dtype: float64

In [42]:
# Single brackets with one column name return a Series
df['age']

0    23
1    78
2    22
3    19
4    45
5    33
Name: age, dtype: int64

In [43]:
# Double brackets (a one-item list of names) return a one-column DataFrame instead
df[['age']]

Unnamed: 0,age
0,23
1,78
2,22
3,19
4,45
5,33


In [44]:
# apply() calls the function once per column; x is the whole 'age' column here, so x*2 doubles every value
df[['age']].apply(lambda x: x*2)

Unnamed: 0,age
0,46
1,156
2,44
3,38
4,90
5,66


In [45]:
# Arithmetic on a DataFrame is element-wise, so this gives the same doubled ages without apply()
df[['age']]*2

Unnamed: 0,age
0,46
1,156
2,44
3,38
4,90
5,66


In [50]:
# Show the original dataframe but with each age doubled.
# assign() returns a new frame with the 'age' column replaced, so df itself is not modified.
newDf = df.assign(age=df['age'] * 2)
newDf

Unnamed: 0,name,age,state,num_children,num_pets
0,john,46,iowa,2,0
1,mary,156,dc,2,4
2,peter,44,california,0,0
3,jeff,38,texas,1,5
4,bill,90,washington,2,0
5,lisa,66,dc,1,0


In [54]:
# Show df again to check it still holds the original ages (newDf was built from a copy)
df

Unnamed: 0,name,age,state,num_children,num_pets
0,john,23.0,iowa,2,0
1,mary,78.0,dc,2,4
2,peter,22.0,california,0,0
3,jeff,19.0,texas,1,5
4,bill,45.0,washington,2,0
5,lisa,33.0,dc,1,0


In [57]:
# Sort the rows by age, smallest first (ascending is the default); the result is a new DataFrame
df.sort_values(['age'])

Unnamed: 0,name,age,state,num_children,num_pets
3,jeff,19.0,texas,1,5
2,peter,22.0,california,0,0
0,john,23.0,iowa,2,0
5,lisa,33.0,dc,1,0
4,bill,45.0,washington,2,0
1,mary,78.0,dc,2,4


In [65]:
# The .str accessor applies string methods to every value: True where the name starts with 'j'
df.name.str.startswith('j')

0     True
1    False
2    False
3     True
4    False
5    False
Name: name, dtype: bool