# Essential Basic Functionality

In [2]:
import pandas as pd
import numpy as np

In [4]:
# Eight consecutive daily timestamps beginning 2000-01-01.
index = pd.date_range(
    start='1/1/2000',
    periods=8,
)
index

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
              dtype='datetime64[ns]', freq='D')

In [18]:
# Five standard-normal draws labelled 'a' through 'e'.
s = pd.Series(data=np.random.randn(5), index=list('abcde'))

In [5]:
# 8x3 frame of standard-normal draws, one row per date in `index`, columns A-C.
df = pd.DataFrame(
    data=np.random.randn(8, 3),
    index=index,
    columns=list('ABC'),
)

In [7]:
# Build a 3-D Panel of random values (2 items x 5 dates on the major axis x
# 4 labels on the minor axis), then flatten it to a DataFrame indexed by
# (major, minor) for display.
# NOTE(review): the warning emitted by this cell states that Panel is deprecated
# and will be removed in a future version; it recommends a MultiIndex DataFrame
# (via Panel.to_frame()) or the xarray package instead.
wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
              major_axis=pd.date_range('1/1/2000', periods=5),
              minor_axis=['A', 'B', 'C', 'D'])
wp.to_frame()

Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0_level_0,Unnamed: 1_level_0,Item1,Item2
major,minor,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01,A,-1.488202,1.452607
2000-01-01,B,-0.742221,0.725504
2000-01-01,C,-0.907441,0.585714
2000-01-01,D,0.636496,-0.443917
2000-01-02,A,-0.777971,-0.864378
2000-01-02,B,-0.555977,0.193734
2000-01-02,C,1.368313,0.379978
2000-01-02,D,-0.398145,0.777924
2000-01-03,A,-0.623993,0.712458
2000-01-03,B,1.02478,1.151972


## Head and Tail

In [10]:
long_series = pd.Series(np.random.randn(1000))

In [11]:
long_series.head()

0    0.073474
1   -1.237817
2    1.021734
3    0.987819
4   -0.422822
dtype: float64

In [12]:
long_series.tail(2)

998   -1.473408
999   -0.689090
dtype: float64

## Attributes and Underlying Data

pandas objects have a number of attributes that give you access to their metadata:
- **shape**: gives the axis dimensions of the object, consistent with ndarray
- Axis labels
    - **Series**: index (only axis)
    - **DataFrame**: index (rows) and columns
    - **Panel**: items, major_axis, and minor_axis

In [13]:
df[:2]

Unnamed: 0,A,B,C
2000-01-01,0.087897,-0.4322,1.54027
2000-01-02,0.101178,-0.841805,-1.313317


In [14]:
# Relabel every column of df with its lower-case name (modifies df in place).
df.columns = list(map(str.lower, df.columns))

In [15]:
df

Unnamed: 0,a,b,c
2000-01-01,0.087897,-0.4322,1.54027
2000-01-02,0.101178,-0.841805,-1.313317
2000-01-03,2.006411,-0.654908,2.565879
2000-01-04,-0.078675,0.321598,-1.590824
2000-01-05,1.09564,0.616037,0.499984
2000-01-06,-1.460066,1.727503,0.868001
2000-01-07,1.111612,0.337015,-0.600528
2000-01-08,1.999116,-0.066361,0.534636


In [21]:
s

a   -1.996803
b   -1.429720
c    1.017977
d    0.327830
e    0.305343
dtype: float64

In [22]:
s.array

<PandasArray>
[-1.9968032972795733, -1.4297203258963704,   1.017977287585785,
 0.32782968301862975,  0.3053430644557993]
Length: 5, dtype: float64

In [24]:
s.index.array

<PandasArray>
['a', 'b', 'c', 'd', 'e']
Length: 5, dtype: object

In [25]:
s.to_numpy()

array([-1.9968033 , -1.42972033,  1.01797729,  0.32782968,  0.30534306])

In [26]:
np.asarray(s)

array([-1.9968033 , -1.42972033,  1.01797729,  0.32782968,  0.30534306])

In [27]:
ser = pd.Series(pd.date_range('2000', periods=2, tz='CET'))

In [28]:
ser.to_numpy(dtype=object)

array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'),
       Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')],
      dtype=object)

In [29]:
ser.to_numpy(dtype='datetime64[ns]')

array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00.000000000'],
      dtype='datetime64[ns]')

In [31]:
arr = df.to_numpy()

In [37]:
# Overwrite the first two entries of row 0 in the extracted array.
# The output of the next cell shows df's first row changing as well.
arr[0, :2] = [1, 2]

In [38]:
arr, df

(array([[ 1.        ,  2.        ,  1.54026966],
        [ 0.10117831, -0.841805  , -1.31331683],
        [ 2.00641065, -0.65490839,  2.56587923],
        [-0.0786752 ,  0.32159797, -1.59082439],
        [ 1.09563975,  0.61603744,  0.49998424],
        [-1.46006568,  1.72750336,  0.86800057],
        [ 1.11161238,  0.33701515, -0.60052815],
        [ 1.99911641, -0.0663606 ,  0.53463627]]),
                    a         b         c
 2000-01-01  1.000000  2.000000  1.540270
 2000-01-02  0.101178 -0.841805 -1.313317
 2000-01-03  2.006411 -0.654908  2.565879
 2000-01-04 -0.078675  0.321598 -1.590824
 2000-01-05  1.095640  0.616037  0.499984
 2000-01-06 -1.460066  1.727503  0.868001
 2000-01-07  1.111612  0.337015 -0.600528
 2000-01-08  1.999116 -0.066361  0.534636)

In [56]:
# Build a mixed-dtype frame: the float columns of df plus a string column 'd'.
str_ser = pd.Series(list('abcdefgh'), index=index)
# Copy df first: a plain `heter_df = df` only creates a second name for the same
# object, so adding column 'd' would silently modify df as well.
heter_df = df.copy()
heter_df['d'] = str_ser
heter_df

Unnamed: 0,a,b,c,d
2000-01-01,1.0,2.0,1.54027,a
2000-01-02,0.101178,-0.841805,-1.313317,b
2000-01-03,2.006411,-0.654908,2.565879,c
2000-01-04,-0.078675,0.321598,-1.590824,d
2000-01-05,1.09564,0.616037,0.499984,e
2000-01-06,-1.460066,1.727503,0.868001,f
2000-01-07,1.111612,0.337015,-0.600528,g
2000-01-08,1.999116,-0.066361,0.534636,h


In [57]:
heter_arr = heter_df.to_numpy()

In [59]:
# Zero the top-left entry of the object array, then show array and frame together;
# in the output below the frame still holds 1.000000 at that position.
heter_arr[0, 0] = 0
heter_arr, heter_df

(array([[0, 2.0, 1.5402696603951627, 'a'],
        [0.10117831236605421, -0.8418050035195802, -1.3133168320868638,
         'b'],
        [2.0064106499872914, -0.6549083908868593, 2.5658792325878945, 'c'],
        [-0.0786751977761196, 0.32159796565305265, -1.5908243917124651,
         'd'],
        [1.0956397519917107, 0.6160374402607975, 0.4999842394552588, 'e'],
        [-1.460065676358506, 1.7275033642203792, 0.8680005748250189, 'f'],
        [1.1116123834965557, 0.3370151492202917, -0.6005281533144555, 'g'],
        [1.9991164088261004, -0.06636059775147456, 0.5346362710990745,
         'h']], dtype=object),                    a         b         c  d
 2000-01-01  1.000000  2.000000  1.540270  a
 2000-01-02  0.101178 -0.841805 -1.313317  b
 2000-01-03  2.006411 -0.654908  2.565879  c
 2000-01-04 -0.078675  0.321598 -1.590824  d
 2000-01-05  1.095640  0.616037  0.499984  e
 2000-01-06 -1.460066  1.727503  0.868001  f
 2000-01-07  1.111612  0.337015 -0.600528  g
 2000-01-08  1.999116 -0.066361  0.534636  h)

## Accelerated operations

In [60]:
# pd.set_option('compute.use_bottleneck', False)
# pd.set_option('compute.use_numexpr', False)

## Flexible binary operations
- Broadcasting behavior between higher-dimensional (DataFrame) and lower-dimensional (Series) objects.
- Missing data in computations

### Matching / broadcasting behavior

In [62]:
# Three random columns whose labels only partly overlap: 'one' has no 'd',
# 'three' has no 'a', so the aligned frame has a missing value in each.
df = pd.DataFrame(data={
    'one': pd.Series(np.random.randn(3), index=list('abc')),
    'two': pd.Series(np.random.randn(4), index=list('abcd')),
    'three': pd.Series(np.random.randn(3), index=list('bcd')),
})
df

Unnamed: 0,one,three,two
a,0.53568,,-0.860844
b,-1.605736,-0.560823,0.233212
c,1.91356,0.215165,-0.390917
d,,0.617621,-1.089797


In [63]:
# row: the second row of df by position (label 'b' in the frame displayed above).
# column: the 'two' column of df.
# Both are used as the right-hand operand of df.sub(...) in the following cells.
row = df.iloc[1]
column = df['two']

In [65]:
df.sub(row, axis='columns')

Unnamed: 0,one,three,two
a,2.141416,,-1.094056
b,0.0,0.0,0.0
c,3.519296,0.775988,-0.624129
d,,1.178444,-1.323009


In [69]:
df.sub(row, axis=1)

Unnamed: 0,one,three,two
a,2.141416,,-1.094056
b,0.0,0.0,0.0
c,3.519296,0.775988,-0.624129
d,,1.178444,-1.323009


In [71]:
df.sub(column, axis='index')

Unnamed: 0,one,three,two
a,1.396524,,0.0
b,-1.838948,-0.794035,0.0
c,2.304477,0.606082,0.0
d,,1.707418,0.0


In [72]:
df.sub(column, axis=0)

Unnamed: 0,one,three,two
a,1.396524,,0.0
b,-1.838948,-0.794035,0.0
c,2.304477,0.606082,0.0
d,,1.707418,0.0


In [73]:
dfmi = df.copy()

In [75]:
# Replace the flat row labels with a two-level (first, second) index.
dfmi.index = pd.MultiIndex.from_tuples(
    list(zip([1, 1, 1, 2], ['a', 'b', 'c', 'a'])),
    names=['first', 'second'],
)
dfmi

Unnamed: 0_level_0,Unnamed: 1_level_0,one,three,two
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,0.53568,,-0.860844
1,b,-1.605736,-0.560823,0.233212
1,c,1.91356,0.215165,-0.390917
2,a,,0.617621,-1.089797


In [76]:
dfmi.sub(column, axis=0, level='second')

Unnamed: 0_level_0,Unnamed: 1_level_0,one,three,two
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,1.396524,,0.0
1,b,-1.838948,-0.794035,0.0
1,c,2.304477,0.606082,0.0
2,a,,1.478466,-0.228952


In [80]:
major_mean = wp.mean(axis='major')
major_mean

Unnamed: 0,Item1,Item2
A,-0.674072,0.370956
B,0.267737,0.308576
C,0.211351,0.223421
D,-0.335491,0.172063


In [81]:
wp.sub(major_mean, axis='major')

Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  return self._combine_frame(other, func, axis=axis)


<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 5 (major_axis) x 4 (minor_axis)
Items axis: Item1 to Item2
Major_axis axis: 2000-01-01 00:00:00 to 2000-01-05 00:00:00
Minor_axis axis: A to D

In [83]:
s = pd.Series(np.arange(10))
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [84]:
div, rem = divmod(s, 3)
div, rem

(0    0
 1    0
 2    0
 3    1
 4    1
 5    1
 6    2
 7    2
 8    2
 9    3
 dtype: int64, 0    0
 1    1
 2    2
 3    0
 4    1
 5    2
 6    0
 7    1
 8    2
 9    0
 dtype: int64)

In [85]:
idx = pd.Index(np.arange(10))
idx

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

In [86]:
div, rem = divmod(idx, 3)

In [87]:
div, rem

(Int64Index([0, 0, 0, 1, 1, 1, 2, 2, 2, 3], dtype='int64'),
 Int64Index([0, 1, 2, 0, 1, 2, 0, 1, 2, 0], dtype='int64'))

In [88]:
div, rem = divmod(s, [2,2,3,3,4,4,5,5,6,6])
div, rem

(0    0
 1    0
 2    0
 3    1
 4    1
 5    1
 6    1
 7    1
 8    1
 9    1
 dtype: int64, 0    0
 1    1
 2    2
 3    0
 4    0
 5    1
 6    1
 7    2
 8    2
 9    3
 dtype: int64)

### Missing data / operations with fill values

In [89]:
df

Unnamed: 0,one,three,two
a,0.53568,,-0.860844
b,-1.605736,-0.560823,0.233212
c,1.91356,0.215165,-0.390917
d,,0.617621,-1.089797


In [91]:
# NOTE(review): df2 is bound to the very same object as df, not to a second frame.
# Both operands of the fill_value example below therefore have missing values in
# the same positions, and its output is identical to plain `df + df2` (NaNs included).
# Consider giving df2 different data if the cell is meant to show fill_value at work.
df2 = df

In [92]:
df + df2

Unnamed: 0,one,three,two
a,1.07136,,-1.721689
b,-3.211472,-1.121646,0.466424
c,3.82712,0.43033,-0.781834
d,,1.235242,-2.179594


In [94]:
df.add(df2, fill_value=0)

Unnamed: 0,one,three,two
a,1.07136,,-1.721689
b,-3.211472,-1.121646,0.466424
c,3.82712,0.43033,-0.781834
d,,1.235242,-2.179594


### Flexible Comparisons

In [96]:
df.gt(df2)

Unnamed: 0,one,three,two
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


In [97]:
df2.ne(df)

Unnamed: 0,one,three,two
a,False,True,False
b,False,False,False
c,False,False,False
d,True,False,False


In [105]:
# Element-wise "not equal" between two integer Series that differ in the first two slots.
s1, s2 = pd.Series([1, 2, 3]), pd.Series([2, 1, 3])
s1.ne(s2)

0     True
1     True
2    False
dtype: bool

### Boolean Reductions

In [109]:
(df > 0)

Unnamed: 0,one,three,two
a,True,False,False
b,False,False,True
c,True,True,False
d,False,True,False


In [110]:
(df > 0).all()

one      False
three    False
two      False
dtype: bool

In [111]:
(df > 0).any()

one      True
three    True
two      True
dtype: bool

In [112]:
(df > 0).any().any()

True

In [113]:
df.empty

False

In [114]:
pd.DataFrame(columns=list('ABC')).empty

True

In [115]:
pd.Series([True]).bool()

True

In [116]:
pd.Series([False]).bool()

False

In [119]:
pd.DataFrame([[True]]).bool()

True

### Comparing if objects are equivalent