# DataFrames

DataFrames are the workhorse of pandas and are directly inspired by the R programming language. We can think of a DataFrame as a bunch of Series objects put together to share the same index. Let's use pandas to explore this topic!

In [1]:
import pandas as pd
import numpy as np

In [2]:
np.random.randn(5,4)

array([[-0.77132211,  0.23301064,  1.12427521,  1.01823758],
       [ 1.16380634,  1.38888874,  0.18441327, -1.74783969],
       [-1.10994574,  0.83687704, -0.22170423,  0.0177892 ],
       [ 0.2024807 , -1.04468792,  0.86727745,  1.59545892],
       [-0.61456567, -0.1154026 , -1.01785228,  0.69784188]])

In [3]:
from numpy.random import randn
randn(5,4)

array([[-1.15272299, -1.34917277, -0.01416343,  0.75912521],
       [ 0.48827236, -0.38786695, -1.69835548,  1.09290265],
       [ 0.94687236,  1.11626412,  0.71120319,  1.1799182 ],
       [-0.10792921,  1.22691981, -0.38236443, -0.65822507],
       [-0.96745233, -0.03794444, -0.10897352, -0.05637906]])

In [4]:
np.random
randn(5,4)

array([[-0.26131441, -1.30195379,  1.11115449, -0.94742967],
       [-0.15615599, -1.16190928,  2.61690555,  0.48045929],
       [-0.41477052,  1.14848287, -1.00256021, -1.19807793],
       [-0.78230887, -0.551248  , -1.38083124,  0.15214424],
       [-0.594729  , -0.12444391, -0.13826161,  2.10626122]])

In [5]:
np.random.seed()
randn(5,4)

array([[ 0.33683506,  0.60983095,  1.53830889, -1.02532174],
       [ 0.63290045,  1.05524853, -2.70625242, -0.79179319],
       [-0.61785161, -1.63214635, -0.48404374, -0.78840768],
       [-0.80259036,  1.25616857, -0.13515079, -0.31389666],
       [-0.2566137 ,  0.42385735,  1.7931093 , -0.26442872]])

In [6]:
#df = pd.DataFrame  (DATA,  index=ROW_INDEX,  columns=SCHEMA  )


df = pd.DataFrame(randn(5,4),index='A B C D E'.split(),columns='W X Y Z'.split())
df

Unnamed: 0,W,X,Y,Z
A,1.15202,0.257319,2.123037,-0.761726
B,0.559521,2.031303,-0.156069,-0.746121
C,-0.402225,-0.210495,-0.031168,1.069665
D,-0.02515,0.232597,0.960871,1.693892
E,0.123028,-0.34243,-0.257183,-1.184496


## Selection and Indexing

Let's learn the various methods to grab data from a DataFrame

In [7]:
df

Unnamed: 0,W,X,Y,Z
A,1.15202,0.257319,2.123037,-0.761726
B,0.559521,2.031303,-0.156069,-0.746121
C,-0.402225,-0.210495,-0.031168,1.069665
D,-0.02515,0.232597,0.960871,1.693892
E,0.123028,-0.34243,-0.257183,-1.184496


In [8]:
df['W']

A    1.152020
B    0.559521
C   -0.402225
D   -0.025150
E    0.123028
Name: W, dtype: float64

In [9]:
df[['X','Z']]

Unnamed: 0,X,Z
A,0.257319,-0.761726
B,2.031303,-0.746121
C,-0.210495,1.069665
D,0.232597,1.693892
E,-0.34243,-1.184496


In [10]:
df['X']

A    0.257319
B    2.031303
C   -0.210495
D    0.232597
E   -0.342430
Name: X, dtype: float64

In [11]:
# Pass a list of column names
df[['W','Z']]

Unnamed: 0,W,Z
A,1.15202,-0.761726
B,0.559521,-0.746121
C,-0.402225,1.069665
D,-0.02515,1.693892
E,0.123028,-1.184496


In [12]:
df['W']

A    1.152020
B    0.559521
C   -0.402225
D   -0.025150
E    0.123028
Name: W, dtype: float64

In [13]:
# SQL Syntax (NOT RECOMMENDED!)
df.W

A    1.152020
B    0.559521
C   -0.402225
D   -0.025150
E    0.123028
Name: W, dtype: float64

DataFrame Columns are just Series

In [14]:
type(df['X'])

pandas.core.series.Series

In [15]:
type(df[['W', 'X']])

pandas.core.frame.DataFrame

**Creating a new column:**

In [16]:
df

Unnamed: 0,W,X,Y,Z
A,1.15202,0.257319,2.123037,-0.761726
B,0.559521,2.031303,-0.156069,-0.746121
C,-0.402225,-0.210495,-0.031168,1.069665
D,-0.02515,0.232597,0.960871,1.693892
E,0.123028,-0.34243,-0.257183,-1.184496


In [17]:
df['new'] = df['W'] + df['Y']

In [18]:
df

Unnamed: 0,W,X,Y,Z,new
A,1.15202,0.257319,2.123037,-0.761726,3.275056
B,0.559521,2.031303,-0.156069,-0.746121,0.403452
C,-0.402225,-0.210495,-0.031168,1.069665,-0.433393
D,-0.02515,0.232597,0.960871,1.693892,0.935722
E,0.123028,-0.34243,-0.257183,-1.184496,-0.134155


In [19]:
df1=df
df1

Unnamed: 0,W,X,Y,Z,new
A,1.15202,0.257319,2.123037,-0.761726,3.275056
B,0.559521,2.031303,-0.156069,-0.746121,0.403452
C,-0.402225,-0.210495,-0.031168,1.069665,-0.433393
D,-0.02515,0.232597,0.960871,1.693892,0.935722
E,0.123028,-0.34243,-0.257183,-1.184496,-0.134155


** Removing Columns**

In [20]:
df.drop('new',axis=1)

Unnamed: 0,W,X,Y,Z
A,1.15202,0.257319,2.123037,-0.761726
B,0.559521,2.031303,-0.156069,-0.746121
C,-0.402225,-0.210495,-0.031168,1.069665
D,-0.02515,0.232597,0.960871,1.693892
E,0.123028,-0.34243,-0.257183,-1.184496


In [21]:
df

Unnamed: 0,W,X,Y,Z,new
A,1.15202,0.257319,2.123037,-0.761726,3.275056
B,0.559521,2.031303,-0.156069,-0.746121,0.403452
C,-0.402225,-0.210495,-0.031168,1.069665,-0.433393
D,-0.02515,0.232597,0.960871,1.693892,0.935722
E,0.123028,-0.34243,-0.257183,-1.184496,-0.134155


In [22]:
df

Unnamed: 0,W,X,Y,Z,new
A,1.15202,0.257319,2.123037,-0.761726,3.275056
B,0.559521,2.031303,-0.156069,-0.746121,0.403452
C,-0.402225,-0.210495,-0.031168,1.069665,-0.433393
D,-0.02515,0.232597,0.960871,1.693892,0.935722
E,0.123028,-0.34243,-0.257183,-1.184496,-0.134155


In [23]:
df

Unnamed: 0,W,X,Y,Z,new
A,1.15202,0.257319,2.123037,-0.761726,3.275056
B,0.559521,2.031303,-0.156069,-0.746121,0.403452
C,-0.402225,-0.210495,-0.031168,1.069665,-0.433393
D,-0.02515,0.232597,0.960871,1.693892,0.935722
E,0.123028,-0.34243,-0.257183,-1.184496,-0.134155


In [24]:
df.drop('X', axis=1)

Unnamed: 0,W,Y,Z,new
A,1.15202,2.123037,-0.761726,3.275056
B,0.559521,-0.156069,-0.746121,0.403452
C,-0.402225,-0.031168,1.069665,-0.433393
D,-0.02515,0.960871,1.693892,0.935722
E,0.123028,-0.257183,-1.184496,-0.134155


In [25]:
df

Unnamed: 0,W,X,Y,Z,new
A,1.15202,0.257319,2.123037,-0.761726,3.275056
B,0.559521,2.031303,-0.156069,-0.746121,0.403452
C,-0.402225,-0.210495,-0.031168,1.069665,-0.433393
D,-0.02515,0.232597,0.960871,1.693892,0.935722
E,0.123028,-0.34243,-0.257183,-1.184496,-0.134155


In [26]:
df

Unnamed: 0,W,X,Y,Z,new
A,1.15202,0.257319,2.123037,-0.761726,3.275056
B,0.559521,2.031303,-0.156069,-0.746121,0.403452
C,-0.402225,-0.210495,-0.031168,1.069665,-0.433393
D,-0.02515,0.232597,0.960871,1.693892,0.935722
E,0.123028,-0.34243,-0.257183,-1.184496,-0.134155


In [27]:
df.drop('W',axis=1,inplace=True)   ### INplace will Delete FRom Main/Original Datafarme 
## You can't Store With New dataframe 

In [28]:
df

Unnamed: 0,X,Y,Z,new
A,0.257319,2.123037,-0.761726,3.275056
B,2.031303,-0.156069,-0.746121,0.403452
C,-0.210495,-0.031168,1.069665,-0.433393
D,0.232597,0.960871,1.693892,0.935722
E,-0.34243,-0.257183,-1.184496,-0.134155


### HOW TO DELETE A ROW

In [29]:
df.drop('E',axis=0)

Unnamed: 0,X,Y,Z,new
A,0.257319,2.123037,-0.761726,3.275056
B,2.031303,-0.156069,-0.746121,0.403452
C,-0.210495,-0.031168,1.069665,-0.433393
D,0.232597,0.960871,1.693892,0.935722


In [30]:
df

Unnamed: 0,X,Y,Z,new
A,0.257319,2.123037,-0.761726,3.275056
B,2.031303,-0.156069,-0.746121,0.403452
C,-0.210495,-0.031168,1.069665,-0.433393
D,0.232597,0.960871,1.693892,0.935722
E,-0.34243,-0.257183,-1.184496,-0.134155


In [31]:
df.drop('E',axis=0,inplace=True)
df

Unnamed: 0,X,Y,Z,new
A,0.257319,2.123037,-0.761726,3.275056
B,2.031303,-0.156069,-0.746121,0.403452
C,-0.210495,-0.031168,1.069665,-0.433393
D,0.232597,0.960871,1.693892,0.935722


** Selecting Rows**

In [32]:
df

Unnamed: 0,X,Y,Z,new
A,0.257319,2.123037,-0.761726,3.275056
B,2.031303,-0.156069,-0.746121,0.403452
C,-0.210495,-0.031168,1.069665,-0.433393
D,0.232597,0.960871,1.693892,0.935722


In [33]:
df.loc['A']

X      0.257319
Y      2.123037
Z     -0.761726
new    3.275056
Name: A, dtype: float64

In [34]:
df.loc[['A','B']]

Unnamed: 0,X,Y,Z,new
A,0.257319,2.123037,-0.761726,3.275056
B,2.031303,-0.156069,-0.746121,0.403452


In [35]:
df

Unnamed: 0,X,Y,Z,new
A,0.257319,2.123037,-0.761726,3.275056
B,2.031303,-0.156069,-0.746121,0.403452
C,-0.210495,-0.031168,1.069665,-0.433393
D,0.232597,0.960871,1.693892,0.935722


#### To get a specific value

In [37]:
# using loc ---> DataFrame.loc[row,column]
df.loc['C','Y']

-0.03116774802660083

In [36]:
# using iloc ---> DataFrame.iloc[row index,column index]
df.iloc[2,1]

-0.03116774802660083

Or select based off of position instead of label 

In [39]:
df.drop('new', axis =1, inplace= True)

In [40]:
df

Unnamed: 0,X,Y,Z
A,0.257319,2.123037,-0.761726
B,2.031303,-0.156069,-0.746121
C,-0.210495,-0.031168,1.069665
D,0.232597,0.960871,1.693892


In [41]:
df.iloc[0]

X    0.257319
Y    2.123037
Z   -0.761726
Name: A, dtype: float64

In [42]:
df.iloc[2]

X   -0.210495
Y   -0.031168
Z    1.069665
Name: C, dtype: float64

In [55]:
df

Unnamed: 0,X,Y,Z
A,0.257319,2.123037,-0.761726
B,2.031303,-0.156069,-0.746121
C,-0.210495,-0.031168,1.069665
D,0.232597,0.960871,1.693892


In [43]:
df.iloc[2:]

Unnamed: 0,X,Y,Z
C,-0.210495,-0.031168,1.069665
D,0.232597,0.960871,1.693892


In [44]:
df

Unnamed: 0,X,Y,Z
A,0.257319,2.123037,-0.761726
B,2.031303,-0.156069,-0.746121
C,-0.210495,-0.031168,1.069665
D,0.232597,0.960871,1.693892


In [45]:
df.iloc[1:3]

Unnamed: 0,X,Y,Z
B,2.031303,-0.156069,-0.746121
C,-0.210495,-0.031168,1.069665


In [46]:
df

Unnamed: 0,X,Y,Z
A,0.257319,2.123037,-0.761726
B,2.031303,-0.156069,-0.746121
C,-0.210495,-0.031168,1.069665
D,0.232597,0.960871,1.693892


In [47]:
df.iloc[1,2]

-0.7461211867681204

In [48]:
df.iloc[0,0]

0.2573192483282977

** Selecting subset of rows and columns **

In [49]:
df

Unnamed: 0,X,Y,Z
A,0.257319,2.123037,-0.761726
B,2.031303,-0.156069,-0.746121
C,-0.210495,-0.031168,1.069665
D,0.232597,0.960871,1.693892


In [50]:
df.loc['B','Y']

-0.15606899403341923

In [52]:
df

Unnamed: 0,X,Y,Z
A,0.257319,2.123037,-0.761726
B,2.031303,-0.156069,-0.746121
C,-0.210495,-0.031168,1.069665
D,0.232597,0.960871,1.693892


In [51]:
df.loc[['A','B'],['W','Y']]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0,W,Y
A,,2.123037
B,,-0.156069


In [53]:
df

Unnamed: 0,X,Y,Z
A,0.257319,2.123037,-0.761726
B,2.031303,-0.156069,-0.746121
C,-0.210495,-0.031168,1.069665
D,0.232597,0.960871,1.693892


In [54]:
df.iloc[[1,3],[0,2]]

Unnamed: 0,X,Z
B,2.031303,-0.746121
D,0.232597,1.693892


### Conditional Selection

An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [56]:
df

Unnamed: 0,X,Y,Z
A,0.257319,2.123037,-0.761726
B,2.031303,-0.156069,-0.746121
C,-0.210495,-0.031168,1.069665
D,0.232597,0.960871,1.693892


In [57]:
df>0

Unnamed: 0,X,Y,Z
A,True,True,False
B,True,False,False
C,False,False,True
D,True,True,True


In [58]:
df[df>0]

Unnamed: 0,X,Y,Z
A,0.257319,2.123037,
B,2.031303,,
C,,,1.069665
D,0.232597,0.960871,1.693892


In [59]:
df[df<0]

Unnamed: 0,X,Y,Z
A,,,-0.761726
B,,-0.156069,-0.746121
C,-0.210495,-0.031168,
D,,,


In [60]:
df

Unnamed: 0,X,Y,Z
A,0.257319,2.123037,-0.761726
B,2.031303,-0.156069,-0.746121
C,-0.210495,-0.031168,1.069665
D,0.232597,0.960871,1.693892


In [61]:
# Reset to default 0,1...n index
df.reset_index()

Unnamed: 0,index,X,Y,Z
0,A,0.257319,2.123037,-0.761726
1,B,2.031303,-0.156069,-0.746121
2,C,-0.210495,-0.031168,1.069665
3,D,0.232597,0.960871,1.693892


# Great Job!