# DataFrames

DataFrames are the workhorse of pandas and are directly inspired by the R programming language. We can think of a DataFrame as a bunch of Series objects put together to share the same index.

In [2]:
import numpy as np
import pandas as pd
from numpy.random import randn

In [3]:
# Seed NumPy's global random generator so the randn() draws used below are
# repeatable when the notebook is run top to bottom from a fresh kernel.
np.random.seed(101)

### Each column is a pandas Series (e.g. `W` is a Series, and so are `X`, `Y`, and `Z`).
### They all share a common index, e.g. every column has an entry for the row label `A`.

In [10]:
# Build a 5x4 DataFrame of standard-normal draws with row labels A-E and
# column labels W-Z, then display it.
# NOTE(review): randn() pulls from NumPy's global RNG, so the exact numbers
# depend on how many draws have happened since the seed cell was run;
# re-running only this cell produces different values.
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)
df

Unnamed: 0,W,X,Y,Z
A,-1.467514,-0.494095,-0.162535,0.485809
B,0.392489,0.221491,-0.855196,1.54199
C,0.666319,-0.538235,-0.568581,1.407338
D,0.641806,-0.9051,-0.391157,1.028293
E,-1.972605,-0.866885,0.720788,-1.223082


## Selection and Indexing
### Selecting a single column returns a Series, whereas selecting multiple columns returns a DataFrame

In [11]:
# Selecting one column with a single label in brackets returns a pandas Series
# (indexed A-E and named after the column).
df['W']

A   -1.467514
B    0.392489
C    0.666319
D    0.641806
E   -1.972605
Name: W, dtype: float64

In [12]:
# Confirm that a single selected column is a Series.
type(df['W'])

pandas.core.series.Series

In [13]:
# The whole table, by contrast, is a DataFrame.
type(df)

pandas.core.frame.DataFrame

In [14]:
# Attribute-style access: gives the same Series as df['X'].
# NOTE(review): bracket notation is the safer habit - per the pandas indexing
# docs, attribute access only works when the column name is a valid Python
# identifier and does not clash with an existing DataFrame attribute or method.
df.X

A   -0.494095
B    0.221491
C   -0.538235
D   -0.905100
E   -0.866885
Name: X, dtype: float64

In [15]:
# To fetch more than one column, pass a list of column labels; the result is a
# DataFrame containing just those columns.
df[['Y','Z']]

Unnamed: 0,Y,Z
A,-0.162535,0.485809
B,-0.855196,1.54199
C,-0.568581,1.407338
D,-0.391157,1.028293
E,0.720788,-1.223082


**Creating a new column:**

In [16]:
# Assigning to a column label that does not exist yet adds that column to df.
# Here 'new' holds the row-by-row sum of columns W and X.
df['new'] = df['W'] + df['X']
df

Unnamed: 0,W,X,Y,Z,new
A,-1.467514,-0.494095,-0.162535,0.485809,-1.961609
B,0.392489,0.221491,-0.855196,1.54199,0.613979
C,0.666319,-0.538235,-0.568581,1.407338,0.128085
D,0.641806,-0.9051,-0.391157,1.028293,-0.263294
E,-1.972605,-0.866885,0.720788,-1.223082,-2.83949


In [18]:
# drop() searches the row index (axis=0) by default, so asking for the column
# label 'new' without axis=1 raises KeyError. The error is the point of this
# cell, so catch it and print the message instead of letting the exception
# stop a "Restart & Run All".
try:
    df.drop('new')
except KeyError as err:
    # str() of a KeyError is the repr of its message, so this prints the same
    # text the uncaught traceback ended with.
    print(f"KeyError: {err}")

KeyError: "['new'] not found in axis"

In [19]:
# axis=1 tells drop() to look for the label among the columns.
# The result is a new DataFrame; df itself is not modified (see the next cell).
df.drop('new', axis=1)

Unnamed: 0,W,X,Y,Z
A,-1.467514,-0.494095,-0.162535,0.485809
B,0.392489,0.221491,-0.855196,1.54199
C,0.666319,-0.538235,-0.568581,1.407338
D,0.641806,-0.9051,-0.391157,1.028293
E,-1.972605,-0.866885,0.720788,-1.223082


In [20]:
# 'new' is still present: the drop above returned a new frame and left df untouched.
df

Unnamed: 0,W,X,Y,Z,new
A,-1.467514,-0.494095,-0.162535,0.485809,-1.961609
B,0.392489,0.221491,-0.855196,1.54199,0.613979
C,0.666319,-0.538235,-0.568581,1.407338,0.128085
D,0.641806,-0.9051,-0.391157,1.028293,-0.263294
E,-1.972605,-0.866885,0.720788,-1.223082,-2.83949


In [21]:
# Pass inplace=True to make drop() modify df itself instead of returning a new frame.
# The default is inplace=False, so a column is not removed from df by accident.
# NOTE(review): this cell is not re-runnable - a second run raises KeyError because
# 'new' is already gone. Reassigning, `df = df.drop('new', axis=1)`, is the usual
# alternative to inplace=True.
df.drop('new', axis=1, inplace=True)

In [22]:
# 'new' has now been removed from df itself.
df

Unnamed: 0,W,X,Y,Z
A,-1.467514,-0.494095,-0.162535,0.485809
B,0.392489,0.221491,-0.855196,1.54199
C,0.666319,-0.538235,-0.568581,1.407338
D,0.641806,-0.9051,-0.391157,1.028293
E,-1.972605,-0.866885,0.720788,-1.223082


In [25]:
# To drop a row, pass its index label; drop() works on rows (axis=0) by default.
# As with the column drop, this returns a new frame and leaves df unchanged.
df.drop('E')

Unnamed: 0,W,X,Y,Z
A,-1.467514,-0.494095,-0.162535,0.485809
B,0.392489,0.221491,-0.855196,1.54199
C,0.666319,-0.538235,-0.568581,1.407338
D,0.641806,-0.9051,-0.391157,1.028293


In [26]:
# (rows, columns): still 5 rows, because the row drop above was not done in place.
df.shape

(5, 4)