In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

In [2]:
# Float Series 0.0, 1.0, 2.0 with the default integer index.
sd = pd.Series(np.arange(3, dtype=float))

In [3]:
sd.iloc[-1]

2.0

In [4]:
sd.index = ['a','b','c']

In [5]:
# Use .iloc for positional access. With a non-integer index, sd[-1] only works
# through a positional fallback of Series.__getitem__ that pandas has deprecated.
sd.iloc[-1]

2.0

In [6]:
sd.iloc[:2]  # first two elements by position

a    0.0
b    1.0
dtype: float64

### Adding two Series objects together returns the sum of the values at matching index labels. If a label is not present in both Series, the result for that label is a null value (NaN).

### Adding two DataFrame objects aligns on both index and columns. Again, if the index labels and column names don't match, the resulting DataFrame will contain null values (NaN) in the rows and columns that do not overlap.

In [7]:
# Integers 0..3 labelled 'a'..'d'.
a = pd.Series(np.arange(4), index=list("abcd"))

In [8]:
# Integers 0..2 labelled 'a', 'c', 'e' (only partly overlapping with 'a'..'d').
b = pd.Series(np.arange(3), index=list("ace"))

In [9]:
a

a    0
b    1
c    2
d    3
dtype: int64

In [10]:
b

a    0
c    1
e    2
dtype: int64

In [11]:
c = a.add(b)  # aligns on index labels; labels missing from either side give NaN

In [12]:
c

a    0.0
b    NaN
c    3.0
d    NaN
e    NaN
dtype: float64

In [13]:
# 3x4 frame of standard-normal draws; columns get the default labels 0..3.
df1 = pd.DataFrame(np.random.randn(3, 4), index=list("abc"))

In [14]:
df1

Unnamed: 0,0,1,2,3
a,-0.641112,0.351708,1.74246,-1.023859
b,0.213575,-0.28602,-0.146631,-0.45451
c,-0.231022,-2.672273,0.374833,-1.068827


In [15]:
# 4x5 frame of standard-normal draws whose rows and columns only partly overlap df1's.
df2 = pd.DataFrame(
    np.random.randn(4, 5),
    index=list("abce"),
    columns=[1, 2, 4, 5, 6],
)

In [16]:
df2

Unnamed: 0,1,2,4,5,6
a,-1.811651,-0.367941,-0.371676,0.557591,-0.104631
b,-0.124607,-1.689268,-1.213154,-1.015426,1.686314
c,0.257109,0.934379,0.153384,0.19238,-0.391542
e,-0.279689,2.047039,0.516978,-1.538411,0.215453


In [17]:
df2.loc['c', 4] = np.nan  # set the cell at row 'c', column 4 to NaN (modifies df2 in place)

In [18]:
df2

Unnamed: 0,1,2,4,5,6
a,-1.811651,-0.367941,-0.371676,0.557591,-0.104631
b,-0.124607,-1.689268,-1.213154,-1.015426,1.686314
c,0.257109,0.934379,,0.19238,-0.391542
e,-0.279689,2.047039,0.516978,-1.538411,0.215453


In [19]:
# Aligns on both row and column labels; any position not present in both frames is NaN.
df3 = df1.add(df2)

In [20]:
df3

Unnamed: 0,0,1,2,3,4,5,6
a,,-1.459943,1.374519,,,,
b,,-0.410627,-1.835899,,,,
c,,-2.415164,1.309212,,,,
e,,,,,,,


In [21]:
# fill_value=0 substitutes 0 where only one frame has a value at a position;
# positions missing in both frames (e.g. row 'e' column 0, row 'c' column 4) stay NaN.
df3 = df1.add(df2, fill_value=0)

In [22]:
df3

Unnamed: 0,0,1,2,3,4,5,6
a,-0.641112,-1.459943,1.374519,-1.023859,-0.371676,0.557591,-0.104631
b,0.213575,-0.410627,-1.835899,-0.45451,-1.213154,-1.015426,1.686314
c,-0.231022,-2.415164,1.309212,-1.068827,,0.19238,-0.391542
e,,-0.279689,2.047039,,0.516978,-1.538411,0.215453


##### Operations between DataFrames and Series 

In [23]:
# Integers 0..11 laid out as 3 rows by 4 columns.
arr = np.arange(12).reshape((3, 4))

In [24]:
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [25]:
arr[0]

array([0, 1, 2, 3])

In [26]:
# Broadcasting: the first row is subtracted from every row of arr.
arr1 = np.subtract(arr, arr[0])

arr1

array([[0, 0, 0, 0],
       [4, 4, 4, 4],
       [8, 8, 8, 8]])

In [27]:
# Integers 0..11 as a 3x4 frame with rows 'x'..'z' and columns 'a'..'d'.
dataframe = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=list("xyz"),
    columns=list("abcd"),
)

In [28]:
dataframe

Unnamed: 0,a,b,c,d
x,0,1,2,3
y,4,5,6,7
z,8,9,10,11


In [29]:
series = dataframe.iloc[0]  # first row by position (row 'x'), as a Series indexed by the column labels

series

a    0
b    1
c    2
d    3
Name: x, dtype: int64

In [30]:
# The Series index is matched against the DataFrame's columns, so the
# Series is subtracted from every row of the DataFrame.
dt_s = dataframe.sub(series)

dt_s

Unnamed: 0,a,b,c,d
x,0,0,0,0
y,4,4,4,4
z,8,8,8,8


In [31]:
dataframe.sub(series, axis='columns')  # .sub() with axis='columns' matches the Series index on the column labels (same result as dataframe - series)

Unnamed: 0,a,b,c,d
x,0,0,0,0
y,4,4,4,4
z,8,8,8,8


## Function application and mapping 

In [32]:
# 4x3 frame of standard-normal draws.
frame = pd.DataFrame(
    np.random.randn(4, 3),
    index=['utah', 'ohio', 'texas', 'oregan'],
    columns=['b', 'd', 'e'],
)

In [33]:
frame

Unnamed: 0,b,d,e
utah,-0.801061,0.116093,-0.991776
ohio,-0.180888,-1.358011,0.730562
texas,0.335004,-0.246455,1.553054
oregan,-0.614388,0.097881,-0.442414


In [34]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` is any object exposing ``max()`` and ``min()`` (for example a
    pandas Series or a NumPy array).
    """
    return x.max() - x.min()

In [35]:
frame = frame.abs()  # element-wise absolute value

In [36]:
frame

Unnamed: 0,b,d,e
utah,0.801061,0.116093,0.991776
ohio,0.180888,1.358011,0.730562
texas,0.335004,0.246455,1.553054
oregan,0.614388,0.097881,0.442414


In [37]:
frame.apply(f)     # apply() calls a function on each column (the default) or each row, passed as a 1-D Series

b    0.620172
d    1.260130
e    1.110640
dtype: float64

In [38]:
frame.apply(f, axis=1)  # axis=1 is the same as axis='columns': f receives one row at a time

utah      0.875683
ohio      1.177122
texas     1.306600
oregan    0.516507
dtype: float64

In [39]:
def fmat(x):
    """Format a number as a string with exactly two decimal places."""
    return f"{x:.2f}"

# Element-wise Python function over every cell. DataFrame.applymap is deprecated
# in favour of DataFrame.map, so prefer map and fall back to applymap only on
# older pandas versions that do not have DataFrame.map.
(frame.map if hasattr(frame, "map") else frame.applymap)(fmat)

Unnamed: 0,b,d,e
utah,0.8,0.12,0.99
ohio,0.18,1.36,0.73
texas,0.34,0.25,1.55
oregan,0.61,0.1,0.44


In [42]:
# Series has its own map() method for applying a function element-wise

frame['e'].map(fmat)

utah      0.99
ohio      0.73
texas     1.55
oregan    0.44
Name: e, dtype: object

## Sorting & Ranking 

In [43]:
# sort_index() sorts by index labels; on a DataFrame, pass axis to choose which axis to sort

unsorted_series = pd.Series(np.arange(4), index=list('cadb'))

unsorted_series

c    0
a    1
d    2
b    3
dtype: int64

In [45]:
# Result is ordered by index label and bound to a new name; unsorted_series keeps its original order.
sorted_series = unsorted_series.sort_index()

sorted_series

a    1
b    3
c    0
d    2
dtype: int64

In [46]:
# 2x3 frame whose row labels and column labels are both out of alphabetical order.
unsorted_dataframe = pd.DataFrame(
    np.arange(6).reshape(2, 3),
    index=['zod', 'abe'],
    columns=['z', 't', 'a'],
)

unsorted_dataframe

Unnamed: 0,z,t,a
zod,0,1,2
abe,3,4,5


In [47]:
# With no axis given, the rows are reordered by their index labels ('abe' before 'zod').
sorted_dataframe_index = unsorted_dataframe.sort_index()

sorted_dataframe_index

Unnamed: 0,z,t,a
abe,3,4,5
zod,0,1,2


In [48]:
# Reorder the columns by label; axis='columns' is an equivalent spelling of axis=1.
sorted_dataframe_columns = unsorted_dataframe.sort_index(axis=1)

sorted_dataframe_columns

Unnamed: 0,a,t,z
zod,2,1,0
abe,5,4,3


In [49]:
# Descending column order. The columns are already z, t, a, so the result matches the input.
sorted_dataframe_columns_desc = unsorted_dataframe.sort_index(axis='columns', ascending = False)

sorted_dataframe_columns_desc

Unnamed: 0,z,t,a
zod,0,1,2
abe,3,4,5


In [50]:
# sort_values() sorts a Series by its values; missing values are placed at the end

unsorted_vals_series = pd.Series(
    [np.nan, -3, 2, -1, 5, np.nan, -0.26, 33, np.nan]
)

unsorted_vals_series

0      NaN
1    -3.00
2     2.00
3    -1.00
4     5.00
5      NaN
6    -0.26
7    33.00
8      NaN
dtype: float64

In [51]:
# Ascending by value; the three NaN entries (labels 0, 5, 8) end up last.
sorted_val_series = unsorted_vals_series.sort_values()

sorted_val_series

1    -3.00
3    -1.00
6    -0.26
2     2.00
4     5.00
7    33.00
0      NaN
5      NaN
8      NaN
dtype: float64

In [54]:
# for a DataFrame, sort_values(by=...) takes one or more column names to use as the sort key

some_df = pd.DataFrame(
    {'b': [98, 12, 154, 0.2], 'a': [1, 7, 2, 17]},
    index=['val1', 'val2', 'val3', 'val4'],
)

some_df

Unnamed: 0,b,a
val1,98.0,1
val2,12.0,7
val3,154.0,2
val4,0.2,17


In [55]:
# Rows reordered by ascending value of column 'b'.
sorted_df_by_column = some_df.sort_values(by = 'b')

sorted_df_by_column

Unnamed: 0,b,a
val4,0.2,17
val2,12.0,7
val1,98.0,1
val3,154.0,2


In [57]:
sorted_df_by_column = some_df.sort_values(by = ['b','a']) # sort by 'b', then by 'a' to break ties; 'b' has no ties here, so the order matches by='b'

sorted_df_by_column

Unnamed: 0,b,a
val4,0.2,17
val2,12.0,7
val1,98.0,1
val3,154.0,2


#### Ranking assigns ranks from one through the number of valid data points in an array. The rank methods for Series and DataFrame are the place to look; by default, rank breaks ties by assigning each group the mean rank.

In [62]:
# Six values labelled 'a'..'f'; 'a' and 'c' are tied at 1.
altf = pd.Series(
    [1, 6.5, 1, 3, 2.8, 8],
    index=['a', 'b', 'c', 'd', 'e', 'f'],
)

altf

a    1.0
b    6.5
c    1.0
d    3.0
e    2.8
f    8.0
dtype: float64

In [63]:
altf.rank()  # ties share the mean of their ranks: 'a' and 'c' both get 1.5

a    1.5
b    5.0
c    1.5
d    4.0
e    3.0
f    6.0
dtype: float64

In [64]:
# method='first' breaks ties by order of appearance in the data: 'a' gets rank 1 and 'c' gets rank 2

altf.rank(method = 'first')

a    1.0
b    5.0
c    2.0
d    4.0
e    3.0
f    6.0
dtype: float64

In [73]:
altf.rank(ascending=False, method='max')  # rank in descending order; method='max' gives tied values the highest rank of their group (6 for 'a' and 'c')

a    6.0
b    2.0
c    6.0
d    3.0
e    4.0
f    1.0
dtype: float64

In [69]:
# Three numeric columns over rows 'v1'..'v5'; column 'a' has a tie (3, 3).
altd = pd.DataFrame(
    {
        'a': [23, 1, 65, 3, 3],
        'b': [2.2, 1.0, 44, 33.2, 76],
        'c': [0.23, 0.56, -2, -0.87, 90],
    },
    index=['v1', 'v2', 'v3', 'v4', 'v5'],
)

altd

Unnamed: 0,a,b,c
v1,23,2.2,0.23
v2,1,1.0,0.56
v3,65,44.0,-2.0
v4,3,33.2,-0.87
v5,3,76.0,90.0


In [70]:
altd.rank(axis=1) # rank across the columns within each row

Unnamed: 0,a,b,c
v1,3.0,2.0,1.0
v2,2.5,2.5,1.0
v3,3.0,2.0,1.0
v4,2.0,3.0,1.0
v5,1.0,2.0,3.0


In [71]:
altd.rank() # with no axis given, values are ranked down the rows within each column

Unnamed: 0,a,b,c
v1,4.0,2.0,3.0
v2,1.0,1.0,4.0
v3,5.0,4.0,1.0
v4,2.5,3.0,2.0
v5,2.5,5.0,5.0


## Axes indices with duplicate labels 

In [74]:
# Index labels 'a' and 'b' each appear twice.
obj = pd.Series(np.arange(5), index=list('ababc'))

obj

a    0
b    1
a    2
b    3
c    4
dtype: int64

In [75]:
obj.index.is_unique    #index attribute .is_unique can tell if index is unique or not 

False

In [76]:
# Same values as above, but every index label is distinct.
obj2 = pd.Series(np.arange(5), index=list('abcde'))

obj2

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [77]:
obj2.index.is_unique

True

In [78]:
obj['a']  # a label that occurs more than once returns a Series with every match

a    0
a    2
dtype: int64

In [79]:
obj['c']  # a label that occurs exactly once returns a scalar

4

In [80]:
# 4x3 frame of standard-normal draws with duplicated row labels.
df = pd.DataFrame(np.random.randn(4, 3), index=list('aabb'))

df

Unnamed: 0,0,1,2
a,-0.657745,0.74228,-0.322074
a,-0.554106,0.074289,-0.203535
b,-1.81263,0.008646,0.34239
b,-0.813163,1.325711,-0.167463


In [81]:
df.loc['a']  # with a duplicated row label, .loc returns every matching row as a DataFrame

Unnamed: 0,0,1,2
a,-0.657745,0.74228,-0.322074
a,-0.554106,0.074289,-0.203535


In [84]:
# Small numeric frame with two missing values in column 'x'.
a_df = pd.DataFrame(
    [
        [4, np.nan, 8, 12],
        [9, np.nan, 0, 1],
        [11, 45, 23, 9],
        [0, 1, 3, 4],
    ],
    index=list('abcd'),
    columns=list('wxyz'),
)

a_df

Unnamed: 0,w,x,y,z
a,4,,8,12
b,9,,0,1
c,11,45.0,23,9
d,0,1.0,3,4


In [85]:
a_df.sum()  # one sum per column; NaN values are skipped (column 'x' sums to 46.0)

w    24.0
x    46.0
y    34.0
z    26.0
dtype: float64

In [87]:
a_df.sum(axis=1)  # one sum per row, adding across the columns

a    24.0
b    10.0
c    88.0
d     8.0
dtype: float64

In [88]:
a_df.sum(axis =1, skipna = False)  # skipna=False makes any row containing NaN sum to NaN; skipna is True by default

a     NaN
b     NaN
c    88.0
d     8.0
dtype: float64

In [89]:
a_df.idxmax() # index label of each column's maximum; numpy's argmax() returns an integer position instead

w    c
x    c
y    c
z    a
dtype: object

In [90]:
a_df.idxmin() # index label of each column's minimum; numpy's argmin() returns an integer position instead

w    d
x    d
y    b
z    b
dtype: object

In [91]:
a_df.cumsum()  # running total down each column; NaN cells stay NaN and do not add to the total

Unnamed: 0,w,x,y,z
a,4,,8,12
b,13,,8,13
c,24,45.0,31,22
d,24,46.0,34,26


In [92]:
a_df.describe() # summary statistics per numeric column: count, mean, std, min, quartiles, max

Unnamed: 0,w,x,y,z
count,4.0,2.0,4.0,4.0
mean,6.0,23.0,8.5,6.5
std,4.966555,31.112698,10.214369,4.932883
min,0.0,1.0,0.0,1.0
25%,3.0,12.0,2.25,3.25
50%,6.5,23.0,5.5,6.5
75%,9.5,34.0,11.75,9.75
max,11.0,45.0,23.0,12.0


In [94]:
# 4x3 frame of single-character strings with integer row and column labels.
non_num_df = pd.DataFrame(
    [list('abc'), list('def'), list('ghi'), list('jkl')],
    index=[1, 2, 3, 4],
    columns=[1, 2, 3],
)

non_num_df

Unnamed: 0,1,2,3
1,a,b,c
2,d,e,f
3,g,h,i
4,j,k,l


In [95]:
non_num_df.describe()  # for non-numeric columns describe() reports count, unique, top and freq

Unnamed: 0,1,2,3
count,4,4,4
unique,4,4,4
top,a,b,c
freq,1,1,1


### Unique Values, value counts, membership

In [99]:
# Five strings: three 'a' followed by two 'b'.
one = pd.Series(list('aaabb'))

one

0    a
1    a
2    a
3    b
4    b
dtype: object

In [100]:
one.unique() # gives an array of unique values

array(['a', 'b'], dtype=object)

In [101]:
one.value_counts() # number of occurrences of each distinct value, indexed by the value

a    3
b    2
dtype: int64

In [105]:
mask = one.isin(['a'])  # vectorized membership test: True where the element is in the given list

mask

0     True
1     True
2     True
3    False
4    False
dtype: bool

In [106]:
one.isin(['b'])

0    False
1    False
2    False
3     True
4     True
dtype: bool

In [107]:
# Related to isin is Index.get_indexer: for each value in one array (which may
# contain repeats) it returns the integer position of that value in an Index of
# distinct values.

to_match = pd.Series(list('acbbcacbacc'))

matcher = pd.Series(list('abc'))

pd.Index(matcher).get_indexer(to_match)

array([0, 2, 1, 1, 2, 0, 2, 1, 0, 2, 2])