# Dataframes

In [3]:
import numpy as np
import pandas as pd
from numpy.random import randn

In [7]:
# Seed NumPy's global random generator so that randn() yields the same numbers on every run.
np.random.seed(seed=10)

A DataFrame is composed of pandas Series (the columns W, X, ...) that share a common index (a, b, c, ...).

In [4]:
# 5x4 frame of standard-normal samples: rows labelled a-e, columns labelled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('abcde'),
    columns=list('WXYZ'),
)

In [6]:
# A bare name as the cell's last expression makes the notebook render the whole frame.
df

Unnamed: 0,W,X,Y,Z
a,1.331587,0.715279,-1.5454,-0.008384
b,0.621336,-0.720086,0.265512,0.108549
c,0.004291,-0.1746,0.433026,1.203037
d,-0.965066,1.028274,0.22863,0.445138
e,-1.136602,0.135137,1.484537,-1.079805


In [8]:
# Indexing with a single column label returns that column, keyed by the row labels.
df['W']

a    1.331587
b    0.621336
c    0.004291
d   -0.965066
e   -1.136602
Name: W, dtype: float64

In [9]:
# A single column taken out of the frame is a pandas Series.
type(df['W'])

pandas.core.series.Series

In [10]:
# The table as a whole is a pandas DataFrame.
type(df)

pandas.core.frame.DataFrame

In [8]:
# Derived column: element-wise sum of W and X, aligned on the shared row index.
df['new'] = df['W'].add(df['X'])
df

Unnamed: 0,W,X,Y,Z,new
a,0.775295,0.727755,0.298826,-0.092898,1.50305
b,-0.612578,0.357031,-0.830454,0.271668,-0.255548
c,-0.812124,0.668172,0.918348,-0.663649,-0.143952
d,-0.195927,0.743231,0.51878,1.61699,0.547304
e,-0.594138,-0.835941,-1.933859,0.242024,-1.430079


axis=0 refers to the index (rows a, b, c, d, e); axis=1 refers to the columns.
With inplace=True, the underlying DataFrame itself is modified. By default inplace is False, so drop() returns a new DataFrame and leaves the original unchanged.

In [9]:
# drop() is called without inplace=True and its return value is not kept,
# so the frame displayed on the next line still contains the 'new' column.
df.drop('new',axis=1)
df

Unnamed: 0,W,X,Y,Z,new
a,0.775295,0.727755,0.298826,-0.092898,1.50305
b,-0.612578,0.357031,-0.830454,0.271668,-0.255548
c,-0.812124,0.668172,0.918348,-0.663649,-0.143952
d,-0.195927,0.743231,0.51878,1.61699,0.547304
e,-0.594138,-0.835941,-1.933859,0.242024,-1.430079


In [30]:
# inplace=True removes the 'new' column from df itself (drop() returns None in that case),
# so the frame shown next no longer has it.
df.drop(labels='new', axis=1, inplace=True)
df

Unnamed: 0,W,X,Y,Z
a,1.331587,0.715279,-1.5454,-0.008384
b,0.621336,-0.720086,0.265512,0.108549
c,0.004291,-0.1746,0.433026,1.203037
d,-0.965066,1.028274,0.22863,0.445138
e,-1.136602,0.135137,1.484537,-1.079805


In [32]:
# axis=0 targets the index, so this removes row 'e'. The result is only displayed, not stored.
df.drop(labels='e', axis=0)

Unnamed: 0,W,X,Y,Z
a,1.331587,0.715279,-1.5454,-0.008384
b,0.621336,-0.720086,0.265512,0.108549
c,0.004291,-0.1746,0.433026,1.203037
d,-0.965066,1.028274,0.22863,0.445138


In [10]:
# A list of labels inside [] selects several columns at once and returns a DataFrame.
df[['W','X','Z']]

Unnamed: 0,W,X,Z
a,0.775295,0.727755,-0.092898
b,-0.612578,0.357031,0.271668
c,-0.812124,0.668172,-0.663649
d,-0.195927,0.743231,1.61699
e,-0.594138,-0.835941,0.242024


To get particular columns, just pass their labels inside []. To get rows, use loc[] (selection by index label) or iloc[] (selection by integer position).

In [38]:
# .loc looks a row up by its index label; the row comes back as a Series keyed by column name.
df.loc['a']

W    1.331587
X    0.715279
Y   -1.545400
Z   -0.008384
Name: a, dtype: float64

In [41]:
# .iloc looks a row up by integer position; position 0 is the row labelled 'a'.
df.iloc[0]

W    1.331587
X    0.715279
Y   -1.545400
Z   -0.008384
Name: a, dtype: float64

In [43]:
# A row label plus a column label returns the single value at their intersection.
df.loc['c','X']

-0.17460021059294129

In [11]:
# Lists of row and column labels select a sub-frame: rows b and d, columns X and Z.
df.loc[['b','d'],['X','Z']]

Unnamed: 0,X,Z
b,0.357031,0.271668
d,0.743231,1.61699


# Dataframe comparisons

In [15]:
# Element-wise comparison: the result is a frame of booleans with the same row and column labels.
df > 0

Unnamed: 0,W,X,Y,Z,new
a,True,True,True,False,True
b,False,True,False,True,False
c,False,True,True,False,False
d,False,True,True,True,True
e,False,False,False,True,False


We can select values based on a condition. Wherever the condition fails for a value, NaN is returned in its place.

In [5]:
# Indexing with a boolean frame keeps values where the mask is True and puts NaN everywhere else.
df[df>0]

Unnamed: 0,W,X,Y,Z
a,0.401422,1.470519,,0.847143
b,0.894006,,,
c,1.110417,0.464223,0.818469,0.471922
d,,1.674636,,0.942565
e,,0.152728,,1.076355


The following code returns only the rows where `df['X'] > 0` is True; here, that is every row except 'b'.

In [7]:
# A boolean Series used as the indexer filters rows: only rows whose X value is positive are kept.
df[df['X'] > 0]

Unnamed: 0,W,X,Y,Z
a,0.401422,1.470519,-0.337293,0.847143
c,1.110417,0.464223,0.818469,0.471922
d,-1.330266,1.674636,-1.152462,0.942565
e,-0.085125,0.152728,-0.774816,1.076355


In [8]:
# Same kind of row filter, this time keeping only rows whose X value is negative.
df[df['X'] < 0]

Unnamed: 0,W,X,Y,Z
b,0.894006,-0.590018,-1.380893,-1.074508


In [17]:
# Filter rows (X > 0) and pick columns (W, Z) in a single .loc call rather than
# chaining two [] lookups, which would build an intermediate filtered frame first.
df.loc[df['X'] > 0, ['W', 'Z']]

Unnamed: 0,W,Z
a,0.775295,-0.092898
b,-0.612578,0.271668
c,-0.812124,-0.663649
d,-0.195927,1.61699


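Joining conditions with `and` / `or` raises an error (`ValueError: The truth value of a Series is ambiguous`). Python's `and` and `or` operate on single boolean values, as in `True and False`, not element-wise on a whole Series of booleans such as `df['X'] > 0`. For element-wise logic we use `&` and `|` instead, and wrap each condition in parentheses because `&` and `|` bind more tightly than the comparison operators.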

In [11]:
# Combine two row masks element-wise with &; a row is kept only when both conditions hold.
x_positive = df['X'] > 0
y_positive = df['Y'] > 0
df[x_positive & y_positive]

Unnamed: 0,W,X,Y,Z
c,1.110417,0.464223,0.818469,0.471922


In [12]:
# Moves the current row labels into a column named 'index' and numbers the rows from 0.
# The result is not assigned, so this cell only displays it.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,a,0.401422,1.470519,-0.337293,0.847143
1,b,0.894006,-0.590018,-1.380893,-1.074508
2,c,1.110417,0.464223,0.818469,0.471922
3,d,-1.330266,1.674636,-1.152462,0.942565
4,e,-0.085125,0.152728,-0.774816,1.076355


In [13]:
# Five labels, one for each row of the frame they are later attached to.
states = [
    'INR',
    'CH',
    'JPN',
    'EU',
    'USA',
]

In [14]:
# Attach the `states` list to df as a new 'States' column (one entry per row).
df['States'] = states

In [23]:
# Promote the 'States' column to the row index. set_index() returns a new frame,
# which is kept under its own name and then displayed.
newdf = df.set_index(keys='States')
newdf

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
INR,0.401422,1.470519,-0.337293,0.847143
CH,0.894006,-0.590018,-1.380893,-1.074508
JPN,1.110417,0.464223,0.818469,0.471922
EU,-1.330266,1.674636,-1.152462,0.942565
USA,-0.085125,0.152728,-0.774816,1.076355
