In [80]:
import numpy as np
import pandas as pd

In [81]:
# Series

# s = pd.Series(data, index=index)
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s

a    0.829727
b   -0.514593
c    1.695213
d   -0.265183
e   -2.202084
dtype: float64

In [82]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [83]:
pd.Series(np.random.randn(5))

0   -1.156144
1    2.064946
2   -0.407360
3   -0.047888
4    0.872636
dtype: float64

In [84]:
# From dict

d = {'b': 1, 'a': 0, 'c': 2}
pd.Series(d)

b    1
a    0
c    2
dtype: int64

In [85]:
d = {'a': 0., 'b': 1., 'c': 2.}
pd.Series(d)

a    0.0
b    1.0
c    2.0
dtype: float64

In [86]:
pd.Series(d, index=['b', 'c', 'd', 'a'])

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

In [87]:
# From scalar value

pd.Series(5., index=['a', 'b', 'c', 'd', 'e'])

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

In [88]:
# Series is ndarray-like

s[0]

0.829726651310849

In [89]:
s[:3]

a    0.829727
b   -0.514593
c    1.695213
dtype: float64

In [90]:
s[s > s.median()]

a    0.829727
c    1.695213
dtype: float64

In [91]:
s[[4, 3, 1]]

e   -2.202084
d   -0.265183
b   -0.514593
dtype: float64

In [92]:
np.exp(s)

a    2.292692
b    0.597744
c    5.447809
d    0.767066
e    0.110572
dtype: float64

In [93]:
s.dtype

dtype('float64')

In [94]:
s.array

<PandasArray>
[   0.829726651310849,   -0.514593367292122,   1.6952134654245912,
 -0.26518276829520065,  -2.2020843917540582]
Length: 5, dtype: float64

In [95]:
s.to_numpy()

array([ 0.82972665, -0.51459337,  1.69521347, -0.26518277, -2.20208439])

In [96]:
# Series is dict-like

s['a']

0.829726651310849

In [97]:
s['e'] = 12.
s

a     0.829727
b    -0.514593
c     1.695213
d    -0.265183
e    12.000000
dtype: float64

In [98]:
'e' in s

True

In [99]:
'f' in s

False

In [100]:
s.get('f')
s.get('f', np.nan)

nan

In [101]:
# Vectorized operations and label alignment with Series

s + s

a     1.659453
b    -1.029187
c     3.390427
d    -0.530366
e    24.000000
dtype: float64

In [102]:
s * 2

a     1.659453
b    -1.029187
c     3.390427
d    -0.530366
e    24.000000
dtype: float64

In [103]:
np.exp(s)

a         2.292692
b         0.597744
c         5.447809
d         0.767066
e    162754.791419
dtype: float64

In [104]:
s[1:] + s[:-1]

a         NaN
b   -1.029187
c    3.390427
d   -0.530366
e         NaN
dtype: float64

In [105]:
# Name attribute

s = pd.Series(np.random.randn(5), name='something')
s

0    1.772164
1    1.012416
2    2.396647
3   -0.007764
4   -2.036278
Name: something, dtype: float64

In [106]:
s.name

'something'

In [107]:
s2 = s.rename("different")
s2.name

'different'

In [108]:
# DataFrame

# From dict of Series or dicts

d = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
    'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [109]:
pd.DataFrame(d, index=['d', 'b', 'a'])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [110]:
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [111]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [112]:
df.columns

Index(['one', 'two'], dtype='object')

In [113]:
# From dict of ndarrays / lists

d = {'one': [1., 2., 3., 4.],
    'two': [4., 3., 2., 1.]}
pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [117]:
pd.DataFrame(d, index=['a', 'b', 'c', 'd'])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


In [118]:
# From structured or record array

data = np.zeros((2, ), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10')])
data[:] = [(1, 2., 'Hello'), (2, 3., "World")]
pd.DataFrame(data)

Unnamed: 0,A,B,C
0,1,2.0,b'Hello'
1,2,3.0,b'World'


In [119]:
pd.DataFrame(data, index=['first', 'second'])

Unnamed: 0,A,B,C
first,1,2.0,b'Hello'
second,2,3.0,b'World'


In [120]:
pd.DataFrame(data, columns=['C', 'A', 'B'])

Unnamed: 0,C,A,B
0,b'Hello',1,2.0
1,b'World',2,3.0


In [121]:
# From a list of dicts

data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
pd.DataFrame(data2)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [122]:
pd.DataFrame(data2, index=['first', 'second'])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [123]:
pd.DataFrame(data2, columns=['a', 'b'])

Unnamed: 0,a,b
0,1,2
1,5,10


In [124]:
# From a dict of tuples

pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
                ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
                ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
                ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
                ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}})

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


In [125]:
# From a Series

# Alternate constructors

pd.DataFrame.from_dict(dict([('A', [1, 2, 3]), ('B', [4, 5, 6])]))

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [126]:
pd.DataFrame.from_dict(dict([('A', [1, 2, 3]), ('B', [4, 5, 6])]),
                        orient='index', columns=['one', 'two', 'three'])

Unnamed: 0,one,two,three
A,1,2,3
B,4,5,6


In [127]:
data

array([(1, 2., b'Hello'), (2, 3., b'World')],
      dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])

In [128]:
pd.DataFrame.from_records(data, index='C')

Unnamed: 0_level_0,A,B
C,Unnamed: 1_level_1,Unnamed: 2_level_1
b'Hello',1,2.0
b'World',2,3.0


In [129]:
# Column selection, addition, deletion

df['one']

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [130]:
df['three'] = df['one'] * df['two']
df['flag'] = df['one'] > 2
df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,9.0,True
d,,4.0,,False


In [131]:
del df['two']
three = df.pop('three')
df

Unnamed: 0,one,flag
a,1.0,False
b,2.0,False
c,3.0,True
d,,False


In [132]:
df['foo'] = 'bar'
df

Unnamed: 0,one,flag,foo
a,1.0,False,bar
b,2.0,False,bar
c,3.0,True,bar
d,,False,bar


In [133]:
df['one_trunc'] = df['one'][:2]
df

Unnamed: 0,one,flag,foo,one_trunc
a,1.0,False,bar,1.0
b,2.0,False,bar,2.0
c,3.0,True,bar,
d,,False,bar,


In [134]:
df.insert(1, 'bar', df['one'])
df

Unnamed: 0,one,bar,flag,foo,one_trunc
a,1.0,1.0,False,bar,1.0
b,2.0,2.0,False,bar,2.0
c,3.0,3.0,True,bar,
d,,,False,bar,


In [135]:
# Assigning new columns in method chains

from sklearn import datasets
data,target = datasets.load_iris(return_X_y=True)
iris = pd.read_csv #('data/iris.data')
iris

<function pandas.io.parsers._make_parser_function.<locals>.parser_f(filepath_or_buffer, sep=',', delimiter=None, header='infer', names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, skipfooter=0, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, iterator=False, chunksize=None, compression='infer', thousands=None, decimal=b'.', lineterminator=None, quotechar='"', quoting=0, doublequote=True, escapechar=None, comment=None, encoding=None, dialect=None, tupleize_cols=None, error_bad_lines=True, warn_bad_lines=True, delim_whitespace=False, low_memory=True, memory_map=False, float_precision=None)>

In [136]:
(iris.assign(sepal_ratio=iris['SepalWidth'] / iris['SepalLength'])
            .head())

AttributeError: 'function' object has no attribute 'assign'

In [137]:
iris.assign(sepal_ratio=lambda x: (x['SepalWidth'] / x['SepalLength'])).head()


AttributeError: 'function' object has no attribute 'assign'

In [138]:
(iris.query('SepalLength > 5')
        .assign(SepalRatio=lambda x: x.SepalWidth / x.SepalLength,
                PetalRatio=lambda x: x.PetalWidth / x.PetalLength)
        .plot(kind='scatter', x='SepalRatio', y='PetalRatio'))

AttributeError: 'function' object has no attribute 'query'

In [139]:
dfa = pd.DataFrame({"A": [1, 2, 3],
                    "B": [4, 5, 6]})

dfa.assign(C=lambda x: x['A'] + x['B'],
            D=lambda x: x['A'] + x['C'])

Unnamed: 0,A,B,C,D
0,1,4,5,6
1,2,5,7,9
2,3,6,9,12


In [140]:
dependent = pd.DataFrame({"A": [1, 1, 1]})

(dependent.assign(A=lambda x: x['A'] + 1)
            .assign(B=lambda x: x['A'] + 2))

Unnamed: 0,A,B
0,2,4
1,2,4
2,2,4


In [141]:
dependent = pd.DataFrame({"A": [1, 1, 1]})
dependent.assign(A=lambda x: x["A"] + 1, B=lambda x: x["A"] + 2)

Unnamed: 0,A,B
0,2,4
1,2,4
2,2,4


In [150]:
# Indexing / selection

df.loc['b']

AttributeError: 'numpy.ndarray' object has no attribute 'loc'

In [151]:
df.iloc[2]

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [152]:
# Data alignment and arithmetic

df = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])
df + df2

Unnamed: 0,A,B,C,D
0,-0.64896,0.296021,-1.357465,
1,-1.047264,0.928696,1.943992,
2,0.238523,0.387228,-0.121643,
3,0.390561,0.78365,0.262465,
4,2.009094,2.574505,-0.723582,
5,-1.5191,1.138151,0.204837,
6,-0.230044,2.745028,0.891045,
7,,,,
8,,,,
9,,,,


In [153]:
df - df.iloc[0]

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,-0.194062,1.294076,1.678504,-0.028468
2,-0.383771,-0.227717,2.129366,0.362386
3,1.007454,0.393459,1.862483,1.006266
4,1.061905,-0.009424,-0.529043,-0.7268
5,0.131739,-0.547246,2.023049,0.294065
6,0.093312,3.202026,1.783658,1.03146
7,1.645383,1.029926,2.659809,0.403884
8,-0.466929,-0.885442,2.870238,-0.066554
9,0.262488,0.183013,0.619486,1.680698


In [154]:
index = pd.date_range('1/1/2000', periods=8)
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=list('ABC'))
df

Unnamed: 0,A,B,C
2000-01-01,-0.897763,2.226046,-0.783815
2000-01-02,-1.062121,1.351344,-2.496834
2000-01-03,0.425174,0.884179,-1.499912
2000-01-04,-0.952021,0.460121,1.364759
2000-01-05,-0.752438,1.534341,-0.584413
2000-01-06,-1.950263,0.080474,-0.07907
2000-01-07,1.584483,1.846708,1.147055
2000-01-08,0.793401,0.118362,-1.145094


In [155]:
type(df['A'])

pandas.core.series.Series

In [156]:
df - df['A']

Unnamed: 0,2000-01-01 00:00:00,2000-01-02 00:00:00,2000-01-03 00:00:00,2000-01-04 00:00:00,2000-01-05 00:00:00,2000-01-06 00:00:00,2000-01-07 00:00:00,2000-01-08 00:00:00,A,B,C
2000-01-01,,,,,,,,,,,
2000-01-02,,,,,,,,,,,
2000-01-03,,,,,,,,,,,
2000-01-04,,,,,,,,,,,
2000-01-05,,,,,,,,,,,
2000-01-06,,,,,,,,,,,
2000-01-07,,,,,,,,,,,
2000-01-08,,,,,,,,,,,


In [157]:
df.sub(df['A'], axis=0)

Unnamed: 0,A,B,C
2000-01-01,0.0,3.123809,0.113948
2000-01-02,0.0,2.413466,-1.434712
2000-01-03,0.0,0.459005,-1.925086
2000-01-04,0.0,1.412142,2.31678
2000-01-05,0.0,2.286778,0.168024
2000-01-06,0.0,2.030737,1.871193
2000-01-07,0.0,0.262225,-0.437428
2000-01-08,0.0,-0.675039,-1.938495


In [158]:
df * 5 + 2

Unnamed: 0,A,B,C
2000-01-01,-2.488814,13.130232,-1.919076
2000-01-02,-3.310607,8.756722,-10.484169
2000-01-03,4.125871,6.420896,-5.499559
2000-01-04,-2.760104,4.300606,8.823794
2000-01-05,-1.762188,9.671703,-0.922067
2000-01-06,-7.751317,2.402368,1.604649
2000-01-07,9.922414,11.23354,7.735274
2000-01-08,5.967006,2.591808,-3.72547


In [159]:
1 / df

Unnamed: 0,A,B,C
2000-01-01,-1.11388,0.449227,-1.275811
2000-01-02,-0.941512,0.740004,-0.400507
2000-01-03,2.351977,1.130992,-0.666706
2000-01-04,-1.050397,2.17334,0.73273
2000-01-05,-1.329014,0.651746,-1.711117
2000-01-06,-0.512751,12.426422,-12.646983
2000-01-07,0.631121,0.541504,0.871798
2000-01-08,1.260396,8.448681,-0.873291


In [160]:
df ** 4

Unnamed: 0,A,B,C
2000-01-01,0.649601,24.55483,0.377446
2000-01-02,1.272614,3.334757,38.864988
2000-01-03,0.032679,0.611169,5.061308
2000-01-04,0.821459,0.044822,3.469155
2000-01-05,0.32054,5.542262,0.116649
2000-01-06,14.466823,4.2e-05,3.9e-05
2000-01-07,6.303041,11.630353,1.731158
2000-01-08,0.396252,0.000196,1.719352


In [161]:
df1 = pd.DataFrame({'a': [1, 0, 1], 'b': [0, 1, 1]}, dtype=bool)
df2 = pd.DataFrame({'a': [0, 1, 1], 'b': [1, 1, 0]}, dtype=bool)
df1 & df2

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [162]:
df1 | df2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [163]:
df1 ^ df2

Unnamed: 0,a,b
0,True,True
1,True,False
2,False,True


In [164]:
-df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


In [165]:
# Tranposing

# only show the first 5 rows
df[:5].T

Unnamed: 0,2000-01-01 00:00:00,2000-01-02 00:00:00,2000-01-03 00:00:00,2000-01-04 00:00:00,2000-01-05 00:00:00
A,-0.897763,-1.062121,0.425174,-0.952021,-0.752438
B,2.226046,1.351344,0.884179,0.460121,1.534341
C,-0.783815,-2.496834,-1.499912,1.364759,-0.584413


In [166]:
# DataFrame interoperability with NumPy functions

np.exp(df)

Unnamed: 0,A,B,C
2000-01-01,0.40748,9.263172,0.45666
2000-01-02,0.345722,3.862615,0.082345
2000-01-03,1.529857,2.420997,0.22315
2000-01-04,0.38596,1.584266,3.914779
2000-01-05,0.471217,4.638266,0.557433
2000-01-06,0.142237,1.0838,0.923975
2000-01-07,4.876769,6.338917,3.148905
2000-01-08,2.210903,1.125651,0.318194


In [167]:
np.asarray(df)

array([[-0.89776287,  2.22604649, -0.78381524],
       [-1.06212142,  1.35134444, -2.49683379],
       [ 0.42517425,  0.88417924, -1.49991172],
       [-0.95202075,  0.46012125,  1.36475887],
       [-0.75243756,  1.53434055, -0.58441347],
       [-1.95026348,  0.08047369, -0.07907024],
       [ 1.58448284,  1.84670797,  1.14705482],
       [ 0.79340116,  0.11836167, -1.14509408]])

In [168]:
ser = pd.Series([1, 2, 3, 4])
np.exp(ser)

0     2.718282
1     7.389056
2    20.085537
3    54.598150
dtype: float64

In [169]:
ser1 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
ser2 = pd.Series([1, 3, 5], index=['b', 'a', 'c'])
ser1

a    1
b    2
c    3
dtype: int64

In [170]:
ser2

b    1
a    3
c    5
dtype: int64

In [171]:
np.remainder(ser1, ser2)

a    0
b    2
c    3
dtype: int64

In [172]:
ser3 = pd.Series([2, 4, 6], index=['b', 'c', 'd'])
ser3

b    2
c    4
d    6
dtype: int64

In [173]:
np.remainder(ser1, ser3)

a    1
b    2
c    3
dtype: int64

In [174]:
ser = pd.Series([1, 2, 3])
idx = pd.Index([4, 5, 6])
np.maximum(ser, idx)

0    4
1    5
2    6
dtype: int64

In [177]:
# Console display

baseball = pd.read_csv('data/baseball.csv')
print(baseball)

FileNotFoundError: [Errno 2] File b'data/baseball.csv' does not exist: b'data/baseball.csv'

In [178]:
baseball.info()

NameError: name 'baseball' is not defined

In [179]:
print(baseball.iloc[-20:, :12].to_string())

NameError: name 'baseball' is not defined

In [180]:
pd.DataFrame(np.random.randn(3, 12))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-0.234498,0.98395,-0.417277,-0.430651,-0.976837,-2.249192,0.951111,-0.652024,0.019646,1.091706,1.654639,1.93941
1,0.106363,0.868022,-1.335856,-0.47944,0.021524,-0.521133,1.479183,-0.174008,0.128308,0.270793,1.872385,-0.837419
2,1.357589,-0.095594,-0.334861,-0.421383,-0.42964,-0.340866,0.945644,-0.512846,0.981923,-1.13787,-1.631962,-1.435171


In [181]:
pd.set_option('display.width', 40)  # default is 80
pd.DataFrame(np.random.randn(3, 12))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-0.614204,-1.134932,-0.415349,-0.28317,1.108272,0.597341,-0.086412,0.74577,-0.501309,-1.129489,-0.169266,-1.640596
1,-1.335681,-0.115078,1.630853,0.288404,-0.853676,0.662335,1.354154,0.061883,-1.409292,-2.467019,-1.244542,0.175554
2,0.721699,-0.414833,-1.191589,1.666311,0.383228,-0.111346,-1.130665,-0.539738,-1.634303,-0.370669,-0.116612,-0.9642


In [182]:
datafile = {'filename': ['filename_01', 'filename_02'],
            'path': ["media/user_name/storage/folder_01/filename_01",
                    "media/user_name/storage/folder_02/filename_02"]}

pd.set_option('display.max_colwidth', 30)
pd.DataFrame(datafile)

Unnamed: 0,filename,path
0,filename_01,media/user_name/storage/fo...
1,filename_02,media/user_name/storage/fo...


In [183]:
pd.set_option('display.max_colwidth', 100)
pd.DataFrame(datafile)

Unnamed: 0,filename,path
0,filename_01,media/user_name/storage/folder_01/filename_01
1,filename_02,media/user_name/storage/folder_02/filename_02


In [184]:
df = pd.DataFrame({'foo1': np.random.randn(5),
                    'foo2': np.random.randn(5)})
df

Unnamed: 0,foo1,foo2
0,-1.174121,-0.19477
1,0.623682,0.713956
2,-0.695659,-0.06896
3,-1.179035,-0.054341
4,-1.065444,-1.068986


In [185]:
df.foo1

0   -1.174121
1    0.623682
2   -0.695659
3   -1.179035
4   -1.065444
Name: foo1, dtype: float64

In [186]:
df.fo<TAB>  # noqa: E225, E999

SyntaxError: invalid syntax (<ipython-input-186-f9ab91f24c25>, line 1)