In [4]:
import numpy as np
import pandas as pd

In [8]:
from numpy.random import randn

In [9]:
np.random.seed(101)
#Done to ensure you get the same random values as your instructor

In [10]:
df = pd.DataFrame(randn(5,4),['A','B','C','D','E'],['W','X','Y','Z'])
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


## Grabbing elements from the DataFrame

In [8]:
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [9]:
#This is similar to a series, if you check its datatype
type(df['W'])

pandas.core.series.Series

## How to get multiple columns out of a DataFrame

In [10]:
df[['W','X','Y']]


Unnamed: 0,W,X,Y
A,2.70685,0.628133,0.907969
B,0.651118,-0.319318,-0.848077
C,-2.018168,0.740122,0.528813
D,0.188695,-0.758872,-0.933237
E,0.190794,1.978757,2.605967


## Creating new Column in an existing dataframe

In [11]:
df['new'] = df['W']+df['Y']
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


## Removing Column

In [12]:
df.drop('new',axis=1)
#This will not remove the column originally unless specified

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


# This removes the column 'new' from the original dataframe

In [13]:
df.drop('new',axis=1,inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


##  Removing Rows

In [14]:
df.drop('A')

Unnamed: 0,W,X,Y,Z
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [15]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [16]:
df.drop('A',inplace=True)
df

Unnamed: 0,W,X,Y,Z
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### Using the .loc method to index along rows and columns

loc stands for location

In [17]:
df.loc['B']

W    0.651118
X   -0.319318
Y   -0.848077
Z    0.605965
Name: B, dtype: float64

#### The iloc (index location) method is used to locate based upon index contrary to the indexing with string

In [18]:
df.iloc[2]
#This did indexing on the D row

W    0.188695
X   -0.758872
Y   -0.933237
Z    0.955057
Name: D, dtype: float64

#### Grabbing data from rows and columns

In [19]:
df.loc['B','X']
#This will get me the data in B row, X column

-0.31931804459303326

In [20]:
df.loc[['B','C'],['X','Y']]

Unnamed: 0,X,Y
B,-0.319318,-0.848077
C,0.740122,0.528813


# Conditional Selection and Advanced DataFrame Operations

In [21]:
df>0

Unnamed: 0,W,X,Y,Z
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [22]:
df[df>0]
#This selects all the values where it is greater than 0

Unnamed: 0,W,X,Y,Z
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


#### Selecting values based upon conditions in rows and columns

In [23]:
df

Unnamed: 0,W,X,Y,Z
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [25]:
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [26]:
df[df['W']>0]['X']

B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [27]:
df[df['W']>0][['X','Y']]

Unnamed: 0,X,Y
B,-0.319318,-0.848077
D,-0.758872,-0.933237
E,1.978757,2.605967


### Implementing multiple conditions

In [29]:
df[(df['W']>0) & df['X']>0]

Unnamed: 0,W,X,Y,Z
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### Resetting Index values to numerical values

In [30]:
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,B,0.651118,-0.319318,-0.848077,0.605965
1,C,-2.018168,0.740122,0.528813,-0.589001
2,D,0.188695,-0.758872,-0.933237,0.955057
3,E,0.190794,1.978757,2.605967,0.683509


In [11]:
new_index = "CA NY WY OR CO".split()
new_index

['CA', 'NY', 'WY', 'OR', 'CO']

In [12]:
df['States'] = new_index
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


#### Setting a column value as index

In [14]:
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509


#### Multi-Indexing and Index Hierarchy

In [15]:
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside,inside))
hier_index = pd.MultiIndex.from_tuples(hier_index)

In [16]:
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [17]:
df = pd.DataFrame(randn(6,2),hier_index,['A','B'])
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [23]:
df.index.names = ["Groups","Numbers"]
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Numbers,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [28]:
df.loc['G2'].loc[2]['B']

0.07295967531703869

In [29]:
df.loc['G2'].loc[3]['A']

0.638787013499328