In [1]:
import numpy as np
import pandas as pd

In [17]:
#small example frame: two integer columns and one string column
df = pd.DataFrame(
    dict(
        col1=[1, 2, 3, 4],
        col2=[444, 555, 666, 444],
        col3=['abc', 'def', 'ghi', 'xyz'],
    )
)

In [3]:
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [6]:
df.head() #returns the first n rows, head(n=5), 5 by default

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


### Finding unique values within a DF:

In [5]:
df['col2'].unique()
#returns a numpy array of all the unique values in col2

array([444, 555, 666])

In [7]:
len(df['col2'].unique())
#this means there are 3 unique values

3

In [8]:
df['col2'].nunique()
#same count as len(df['col2'].unique()) above, via a built-in method
#(note: nunique() skips NaN by default, while unique() would include it)

3

In [9]:
df['col2'].value_counts()
#Returns how many times each unique value occurred in that column

444    2
555    1
666    1
Name: col2, dtype: int64

#### The apply() method:

In [20]:
#a small user-defined helper that we will hand to apply() below
def times2(x):
    """Return ``x * 2``.

    For numbers this doubles the value; for sequences such as strings
    the ``*`` operator repeats them (e.g. 'abc' -> 'abcabc').
    """
    doubled = x * 2
    return doubled

In [21]:
#What if we wanted to apply our own function to every cell in the column?
df['col1'].apply(times2)
#this broadcasts the times2 function to every cell in the column

0    2
1    4
2    6
3    8
Name: col1, dtype: int64

In [22]:
df.apply(times2) #works for DataFrames too: times2 receives each column as a Series

Unnamed: 0,col1,col2,col3
0,2,888,abcabc
1,4,1110,defdef
2,6,1332,ghighi
3,8,888,xyzxyz


In [23]:
df['col3'].apply(len)
#this gives us a column of the length of each string in col3

0    3
1    3
2    3
3    3
Name: col3, dtype: int64

In [25]:
#apply() is most powerful with lambda expressions
df['col2'].apply(lambda x : x*2)
#for simple arithmetic like this, the vectorized df['col2'] * 2 gives the same result

0     888
1    1110
2    1332
3     888
Name: col2, dtype: int64

#### Removing columns:

In [26]:
#remove 'col1' by rebinding df to the frame returned by drop() rather than
#mutating in place: inplace=True has no performance benefit and blocks chaining
df = df.drop(columns='col1')

In [27]:
df

Unnamed: 0,col2,col3
0,444,abc
1,555,def
2,666,ghi
3,444,xyz


In [37]:
#the drop() call above permanently removed 'col1' from df; drop() is preferred over the older del df['col1']

In [28]:
#re-create the original three-column frame, since 'col1' was dropped from df earlier
df = pd.DataFrame(
    dict(
        col1=[1, 2, 3, 4],
        col2=[444, 555, 666, 444],
        col3=['abc', 'def', 'ghi', 'xyz'],
    )
)

In [29]:
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


#### List column and index names:

In [30]:
df.columns
#This gives us the column names as a pandas Index (use list(df.columns) for a plain list)

Index(['col1', 'col2', 'col3'], dtype='object')

In [31]:
df.index
#Same thing for the row (index) labels. Since we have the default integer index 0,1,2,3,
#pandas shows it compactly as a RangeIndex described by start, stop and step

RangeIndex(start=0, stop=4, step=1)

#### Sorting and ordering a DF:

In [36]:
#Sort by column 2:
df.sort_values(by='col2')
#note how index is attached to row, so we don't lose that info

Unnamed: 0,col1,col2,col3
0,1,444,abc
3,4,444,xyz
1,2,555,def
2,3,666,ghi


#### Check for null values:

In [34]:
df.isnull()
#tells us whether that value is null or not

Unnamed: 0,col1,col2,col3
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False


In [35]:
# Drop rows with NaN Values
df.dropna()

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [None]:
#Can also fill na values with something else with df.fillna('FILLED')

### Pivot tables (similar to Excel):

In [38]:
#column-oriented sample data: three categorical columns (A, B, C) and one numeric column (D)
data = dict(
    A=['foo', 'foo', 'foo', 'bar', 'bar', 'bar'],
    B=['one', 'one', 'two', 'two', 'one', 'one'],
    C=['x', 'y', 'x', 'y', 'x', 'y'],
    D=[1, 3, 2, 5, 4, 1],
)

In [40]:
df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C,D
0,foo,one,x,1
1,foo,one,y,3
2,foo,two,x,2
3,bar,two,y,5
4,bar,one,x,4
5,bar,one,y,1


In [43]:
df.pivot_table(values='D',index=['A','B'],columns=['C'])
#we created an index out of the A and B columns (multi-index): bar and foo, then one and two
#the column headers come from the unique values in the 'C' column, so 2 columns: x and y
#the cell values come from the 'D' column (aggregated with the default aggfunc, mean)
#some cells are NaN because the original DF has no matching row, e.g. nothing for (bar, two, x)

Unnamed: 0_level_0,C,x,y
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,4.0,1.0
bar,two,,5.0
foo,one,1.0,3.0
foo,two,2.0,
