In [1]:
import numpy as np
import pandas as pd

## Series

In [2]:
# Build a Series with an explicit label index: one label per random value.
s = pd.Series(data=np.random.randn(5), index=list('abcde'))

In [3]:
s

a    0.669856
b   -1.026991
c   -0.661863
d    0.753685
e   -0.225617
dtype: float64

In [4]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [5]:
# Here, we let Pandas create a default index
pd.Series(np.random.randn(5))

0   -0.856640
1   -0.857603
2    0.244461
3    1.852427
4   -1.432970
dtype: float64

In [7]:
# The dict keys become the Series index (kept in insertion order, as the
# output shows) and the dict values become the data.
d = dict(b=1, a=0, c=2)
pd.Series(d)

b    1
a    0
c    2
dtype: int64

In [76]:
# Per the pandas docs, a scalar value is repeated for every label of the given
# index, so this should yield five 5.0 values.
# NOTE(review): the TypeError below looks like a pandas/NumPy version mismatch
# in this environment rather than a mistake in the call — confirm by checking
# the installed versions.
pd.Series(5., index=['a', 'b', 'c', 'd', 'e'])

TypeError: Cannot interpret '<attribute 'dtype' of 'numpy.generic' objects>' as a data type

In [14]:
# Workaround: spell out one value per label instead of broadcasting a scalar.
# The values are Python ints, so the result is int64 (see the output).
pd.Series([5] * 5, index=list('abcde'))

a    5
b    5
c    5
d    5
e    5
dtype: int64

In [20]:
# Select the first element by position explicitly. Plain s[0] on a
# label-indexed Series relies on the deprecated integer-position fallback of
# Series.__getitem__; .iloc is always positional.
s.iloc[0]

0.6698562282768611

In [21]:
# First three elements, selected by position.
s.iloc[:3]

a    0.669856
b   -1.026991
c   -0.661863
dtype: float64

In [22]:
s[s > s.median()]

a    0.669856
d    0.753685
dtype: float64

In [23]:
# Pick elements by position, in the order given. A list of ints passed to
# plain s[...] on a label-indexed Series relies on the deprecated
# integer-position fallback; .iloc makes the positional intent explicit.
s.iloc[[4, 3, 1]]

e   -0.225617
d    0.753685
b   -1.026991
dtype: float64

In [24]:
np.exp(s)

a    1.953956
b    0.358083
c    0.515889
d    2.124817
e    0.798024
dtype: float64

In [25]:
s.dtype

dtype('float64')

In [26]:
s.to_numpy()

array([ 0.66985623, -1.02699138, -0.66186338,  0.75368545, -0.2256168 ])

In [27]:
s['a']

0.6698562282768611

In [28]:
s['e']

-0.22561680142474502

In [29]:
s

a    0.669856
b   -1.026991
c   -0.661863
d    0.753685
e   -0.225617
dtype: float64

In [30]:
'e' in s

True

In [31]:
'f' in s

False

In [33]:
# This would raise a KeyError because 'f' is not one of the Series' index labels
# s['f']

In [34]:
s + s

a    1.339712
b   -2.053983
c   -1.323727
d    1.507371
e   -0.451234
dtype: float64

In [35]:
s * 2

a    1.339712
b   -2.053983
c   -1.323727
d    1.507371
e   -0.451234
dtype: float64

In [36]:
np.exp(s)

a    1.953956
b    0.358083
c    0.515889
d    2.124817
e    0.798024
dtype: float64

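A key difference between Series and ndarray is that operations between Series automatically align data based on the label. Thus, you can write computations without considering whether the Series involved have the same labels. The result is indexed by the union of the labels involved; a label that is missing from either Series yields NaN, as the example below shows.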

In [37]:
s1 = s[1:]

In [38]:
s2 = s[:-1]

In [39]:
s1 + s2

a         NaN
b   -2.053983
c   -1.323727
d    1.507371
e         NaN
dtype: float64

In [41]:
s = pd.Series(np.random.randn(5), name='something')
s

0   -2.330545
1   -0.054256
2    0.549430
3    3.244120
4   -0.414051
Name: something, dtype: float64

In [42]:
s.name

'something'

## Dataframes

#### From Dictionary of Series or Dictionaries

In [43]:
# Two Series with overlapping but unequal indexes: 'two' has the extra label 'd'.
d = {
    'one': pd.Series([1., 2., 3.], index=list('abc')),
    'two': pd.Series([1., 2., 3., 4.], index=list('abcd')),
}

In [44]:
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [45]:
pd.DataFrame(d, index=['d', 'b', 'a'])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [47]:
# Per the pandas docs, `columns` selects which dict keys to keep, and a name
# with no matching key ('three') should come back as an all-NaN column.
# NOTE(review): the AttributeError below looks like a pandas/NumPy version
# mismatch in this environment rather than a mistake in the call — confirm.
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])

AttributeError: type object 'object' has no attribute 'dtype'

#### From Dictionary of ndarrays or lists

In [48]:
# Equal-length lists become columns; with no index given, the rows get a
# default integer index (0..3 in the output).
d = dict(one=[1., 2., 3., 4.], two=[4., 3., 2., 1.])
pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [49]:
pd.DataFrame(d, index=['a', 'b', 'c', 'd'])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


#### From a Series

In [50]:
pd.DataFrame(pd.Series(np.random.randn(5), name='something'))

Unnamed: 0,something
0,1.309451
1,-0.460302
2,-1.62217
3,0.936387
4,0.123566


## Column selection, addition, deletion

In [51]:
df['one']

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [52]:
df['three'] = df['one'] * df['two']

In [53]:
df['flag'] = df['one'] > 2

In [54]:
df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,9.0,True
d,,4.0,,False


In [55]:
del df['two']

In [56]:
df

Unnamed: 0,one,three,flag
a,1.0,1.0,False
b,2.0,4.0,False
c,3.0,9.0,True
d,,,False


In [57]:
df['foo'] = 'bar'

In [58]:
df

Unnamed: 0,one,three,flag,foo
a,1.0,1.0,False,bar
b,2.0,4.0,False,bar
c,3.0,9.0,True,bar
d,,,False,bar


In [59]:
df['one_trunc'] = df['one'][:2]

In [60]:
df

Unnamed: 0,one,three,flag,foo,one_trunc
a,1.0,1.0,False,bar,1.0
b,2.0,4.0,False,bar,2.0
c,3.0,9.0,True,bar,
d,,,False,bar,


In [61]:
df['one'][:2]

a    1.0
b    2.0
Name: one, dtype: float64

In [67]:
# 8 rows x 3 columns of random normal samples, with columns labelled A, B, C.
df = pd.DataFrame(np.random.randn(8, 3), columns=['A', 'B', 'C'])
df

Unnamed: 0,A,B,C
0,-0.331125,-0.649841,0.867999
1,-1.581559,-2.149061,-1.643073
2,-0.090702,-0.573914,0.235652
3,-0.689898,-1.490539,0.680203
4,0.549315,0.29895,-1.732368
5,-0.810807,-0.524853,0.302032
6,0.183688,-3.149902,1.080294
7,-1.42783,-0.73781,1.574571


In [68]:
df * 5 + 2

Unnamed: 0,A,B,C
0,0.344374,-1.249203,6.339995
1,-5.907793,-8.745305,-6.215367
2,1.546488,-0.86957,3.178259
3,-1.449489,-5.452695,5.401014
4,4.746576,3.494752,-6.661841
5,-2.054033,-0.624266,3.51016
6,2.918442,-13.749509,7.401468
7,-5.139149,-1.689049,9.872854


In [69]:
# Exponentiation operator
df ** 4

Unnamed: 0,A,B,C
0,0.012022,0.178331,0.567645
1,6.256639,21.330204,7.288328
2,6.8e-05,0.108489,0.003084
3,0.226537,4.935979,0.214069
4,0.091051,0.007987,9.006599
5,0.432184,0.075884,0.008322
6,0.001138,98.443735,1.361969
7,4.15629,0.296331,6.146794


In [70]:
# Two small boolean frames used to demonstrate the element-wise logical operators.
df1 = pd.DataFrame({'a': [True, False, True], 'b': [False, True, True]})
df2 = pd.DataFrame({'a': [False, True, True], 'b': [True, True, False]})

In [71]:
df1 & df2

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [72]:
df1 | df2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [73]:
# `^` is element-wise logical XOR (exclusive or): the result is True where
# exactly one of the two operands is True.
df1 ^ df2

Unnamed: 0,a,b
0,True,True
1,True,False
2,False,True


In [74]:
# Element-wise logical NOT. `~` is the inversion operator for boolean data;
# unary minus is an arithmetic negation that NumPy rejects on boolean arrays.
~df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


## dtypes

In [75]:
# One column per dtype; the scalar entries ('B', 'C', 'D', 'F') are meant to be
# broadcast to all three rows.
# NOTE(review): the TypeError below looks like a pandas/NumPy version mismatch
# in this environment rather than a mistake in this call — confirm.
dft = pd.DataFrame({
    'A': np.random.rand(3),
    'B': 1,
    'C': 'foo',
    'D': pd.Timestamp('20010102'),
    'E': pd.Series([1.0, 1.0, 1.0], dtype='float32'),
    'F': False,
    'G': pd.Series([1, 1, 1], dtype='int8'),
})

TypeError: Cannot interpret '<attribute 'dtype' of 'numpy.generic' objects>' as a data type