In [1]:
# Series

# A Series is a one-dimensional array-like object containing a sequence of values and an associated array of data labels,
# called its index. The simplest Series is formed from only an array of data:


In [85]:
import pandas as pd
import numpy as np

In [87]:
obj = pd.Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [88]:
# Series index

#The string representation of a Series displayed interactively shows the index on the left and the values on the right.
#A default index consisting of the integers 0 through N-1 is created if no specific index is given.

In [89]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [90]:
obj.index #like range(4)

RangeIndex(start=0, stop=4, step=1)

In [91]:
# Build a Series whose index labels are given explicitly instead of the default 0..N-1.
obj2 = pd.Series(data=[4, 7, -5, 3], index=list('dbac'))
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [92]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [93]:
# Use labels in the index to select values
#You can use labels in the index when selecting single values or a set of values.
# You can also use NumPy functions or NumPy-like operations.

In [94]:
obj2['a']

-5

In [95]:
obj2['d'] =6

In [96]:
obj2[['c','a','d']]

c    3
a   -5
d    6
dtype: int64

In [97]:
obj2[obj2>0]

d    6
b    7
c    3
dtype: int64

In [98]:
obj2 * 2

d    12
b    14
a   -10
c     6
dtype: int64

In [99]:
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [100]:
'b' in obj2

True

In [101]:
'e' in obj2

False

In [102]:
#use a dict to create a series

In [103]:
sdata={'Ohio':35000, 'Texas':71000,'Oregon':16000,'Utah':5000}

In [104]:
obj3= pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [105]:
#When you pass only a dict, the index of the resulting Series holds the dict's keys; as the output above shows, they keep
#the dict's insertion order (pandas versions before 0.23 sorted them instead).
#You can override this by passing the keys, in the order you want them to appear, as the index of the resulting Series:


In [106]:
states =['California', 'Ohio', 'Oregon', 'Texas']

In [107]:
obj4=pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [108]:
#The isnull and notnull functions are used to detect missing data.

In [109]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [110]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [111]:
obj4.isnull()


California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [112]:
obj4.notnull()

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [113]:
#Name attribute
#both the series object itself and its index have a name attribute, which integrates with other key areas of pandas
#functionality.


In [114]:
obj4.name ='population'
obj4.index.name='state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [115]:
#Alter index

In [116]:
# A Series's index can be altered in place by assignment. The number of new labels must match the length of the Series.

In [117]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [118]:
obj.index=['bob', 'steve', 'jeff', 'Ryan']

In [119]:
obj

bob      4
steve    7
jeff    -5
Ryan     3
dtype: int64

In [120]:
#DataFrame
#A DataFrame represents a rectangular table of data and contains an ordered collection of columns, each of which can be a
#different value type (numeric, string, boolean, etc.); it can be thought of as a dict of Series all sharing the same index.

# The data frame has both a row and column index.

In [121]:
# Column-oriented input: each dict key becomes a column, each equal-length list its values.
data = dict(
    state=['ohio'] * 3 + ['Nevada'] * 3,
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
frame = pd.DataFrame(data=data)
frame

Unnamed: 0,state,year,pop
0,ohio,2000,1.5
1,ohio,2001,1.7
2,ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [122]:
# head() returns only the first 5 rows by default; tail() likewise returns the last 5.
frame.head(n=5)

Unnamed: 0,state,year,pop
0,ohio,2000,1.5
1,ohio,2001,1.7
2,ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [123]:
# if You specify a sequence of columns, the DataFrame's columns will be arranged in that order:


In [124]:
pd.DataFrame(data, columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,ohio,1.5
1,2001,ohio,1.7
2,2002,ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [125]:
#if you pass a column that isn't contained in the dict it will appear with missing values in the result:

In [126]:
# 'debt' is not a key of `data`, so that column is created and filled with missing values.
frame2 = pd.DataFrame(
    data,
    index=['one', 'two', 'three', 'four', 'five', 'six'],
    columns=['year', 'state', 'pop', 'debt'],
)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,ohio,1.5,
two,2001,ohio,1.7,
three,2002,ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [127]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [128]:
# A column in a DataFrame can be retrieved as a Series either by dict-like notation or by attribute:

In [129]:
frame2['state']

one        ohio
two        ohio
three      ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [130]:
frame2['year']

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [131]:
#Rows can also be retrieved by label with the special loc attribute (use iloc to select by integer position).


In [132]:
frame2.loc['three']

year     2002
state    ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [133]:
#columns can be modified by assignment.

In [134]:
frame2['debt'] =16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,ohio,1.5,16.5
two,2001,ohio,1.7,16.5
three,2002,ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [135]:
# When assigning a list or array to a column, the value's length must match the length of the DataFrame.
# When assigning a Series, its labels are realigned exactly to the DataFrame's index, and missing
# values are inserted wherever the Series has no matching label.

val = pd.Series(data=[-1.2, -1.5, 1.7], index=['two', 'four', 'five'])

In [136]:
frame2['debt'] =val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,ohio,1.5,
two,2001,ohio,1.7,-1.2
three,2002,ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,1.7
six,2003,Nevada,3.2,


In [137]:
# Another common form of data is a nested dict of dicts: the outer keys become the
# columns and the inner keys become the row index.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = pd.DataFrame(data=pop)

In [138]:
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [139]:
#you can transpose the Dataframe(swap rows and columns) with similar syntax to a Numpy Array:
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [140]:
#The keys in the inner dicts are combined and sorted to form the index in the result. This isn't true if an explicit index is specified:
pd.DataFrame(pop,index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [141]:
# If a DataFrame's index and columns have their name attributes set, these are displayed as well.
frame3.index.name = 'YEAR'
frame3.columns.name = 'STATE'
frame3

STATE,Nevada,Ohio
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [142]:
#As with Series, the values attribute returns the data contained in the DataFrame as two-dimensional ndarray:
frame3.values

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

In [143]:
#Index objects
#pandas Index objects are responsible for holding the axis labels and other metadata.
#Any array or other sequence of labels you use when constructing a Series or DataFrame is internally converted to an Index:

obj = pd.Series(data=range(3), index=list('abc'))
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [144]:
# Index objects are immutable and thus can't be modified by the user.
# The assignment below raises TypeError on purpose; it is caught and its message
# printed so that this demonstration does not halt a "Restart & Run All".
try:
    index[1] = 'd'
except TypeError as exc:
    print(f"TypeError: {exc}")

TypeError: Index does not support mutable operations

In [145]:
#Columns can be returned as Index objects too.
frame3

STATE,Nevada,Ohio
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [146]:
frame3.columns

Index(['Nevada', 'Ohio'], dtype='object', name='STATE')

In [147]:
#Unlike Python sets, a pandas Index can contain duplicate labels.


In [148]:
#Reindexing
#Reindexing means creating a new object with the data conformed to a new index.

In [149]:
obj=pd.Series([4.5,7.2,-5.3,3.6], index=['d','b','a','c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [150]:
obj2= obj.reindex(['a','b','c','d','e'])

In [151]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [152]:
#Reindexing with a DataFrame: reindex can alter the (row) index, the columns, or both. When passed only a sequence,
#it reindexes the rows in the result.


In [153]:
# 3x3 frame of the integers 0..8 with string row labels and state-name columns.
frame = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)
frame


Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [154]:
frame2=frame.reindex(['a','b','c','d'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [155]:
#The columns can be reindexed with the columns keyword:
states=['Texas', 'utah','California']
frame.reindex(columns=states)

Unnamed: 0,Texas,utah,California
a,1,,2
c,4,,5
d,7,,8


In [156]:
#Dropping Entries from an Axis
#You can use the drop method to drop one or more entries from an axis. The drop method returns a new object.


In [157]:
# Five floats 0.0..4.0 labelled 'a' through 'e'.
obj = pd.Series(data=np.arange(5, dtype=float), index=list('abcde'))
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [158]:
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [159]:
obj.drop(['d','c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [160]:
# 4x4 frame of the integers 0..15: rows labelled by state, columns by number words.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [161]:
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [162]:
data.drop(['two','four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [163]:
#inplace=True drops the entries from the original object itself (here the Series obj) instead of returning a new object
obj.drop('c', inplace=True)
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [164]:
#indexing
#Series indexing (obj[...]) works analogously to NumPy array indexing, except you can use the Series's index values
#instead of only integers.

In [166]:
obj =pd.Series(np.arange(4.),index=['a','b','c','d'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [167]:
obj['b']

1.0

In [169]:
# Select by integer position explicitly with iloc: `obj` has string labels, and plain
# obj[1] relies on the deprecated fallback that treats integer keys as positions.
obj.iloc[1]

1.0

In [170]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [171]:
obj[['b','a','d']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [172]:
# Select several entries by integer position explicitly with iloc: `obj` has string labels,
# and plain obj[[1, 3]] relies on the deprecated fallback that treats integer keys as positions.
obj.iloc[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [173]:
obj[1:3]

b    1.0
c    2.0
dtype: float64

In [174]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

In [175]:
#Slicing with labels behaves differently from normal Python slicing in that the end point is inclusive:
obj['b':'c']

b    1.0
c    2.0
dtype: float64

In [176]:
#Setting using these methods modifies the corresponding section of the Series:
obj['b':'c'] =5

In [177]:
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [181]:
# Indexing into a DataFrame retrieves one or more columns, either with a single value or a sequence.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['ohio', 'colorado', 'utha', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
colorado,4,5,6,7
utha,8,9,10,11
New York,12,13,14,15


In [182]:
data['two']

ohio         1
colorado     5
utha         9
New York    13
Name: two, dtype: int32

In [183]:
data[['three','one']]

Unnamed: 0,three,one
ohio,2,0
colorado,6,4
utha,10,8
New York,14,12


In [184]:
#Indexing like this has a few special cases. First, slicing or selecting data with a boolean array:
data[:2]

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
colorado,4,5,6,7


In [185]:
data[data['three']>5]

Unnamed: 0,one,two,three,four
colorado,4,5,6,7
utha,8,9,10,11
New York,12,13,14,15


In [186]:
data<5

Unnamed: 0,one,two,three,four
ohio,True,True,True,True
colorado,True,False,False,False
utha,False,False,False,False
New York,False,False,False,False


In [187]:
data[data<5]=0

In [188]:
data

Unnamed: 0,one,two,three,four
ohio,0,0,0,0
colorado,0,5,6,7
utha,8,9,10,11
New York,12,13,14,15


In [190]:
#selection with loc and iloc
#Enable you to select a subset of the rows and columns from a DataFrame with NumPy-Like notation using either axis labels(loc)
#or integers(iloc).

In [192]:
data.loc['colorado',['two','three']]

two      5
three    6
Name: colorado, dtype: int32

In [193]:
data.iloc[2,[3,0,1]]

four    11
one      8
two      9
Name: utha, dtype: int32

In [194]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: utha, dtype: int32

In [195]:
#Arithmetic and data alignment

In [196]:
# An important pandas feature for some applications is the behavior of arithmetic between objects with different indexes.
# When you add objects together, if any index pairs are not the same, the respective index in the result will be the
# union of the index pairs.


In [199]:
# Two Series with only partially overlapping labels ('a', 'c' and 'e' are shared).
s1 = pd.Series(data=[7.3, -2.5, 3.4, 1.5], index=list('acde'))
s2 = pd.Series(data=[-2.1, 3.6, -1.5, 4, 3.1], index=list('acefg'))

In [200]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [201]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [202]:
s1+s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [210]:
#In the case of a DataFrame, alignment is performed on both the rows and the columns:
df1 = pd.DataFrame(
    data=np.arange(9, dtype=float).reshape(3, 3),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=['b', 'c', 'd'],
)

In [206]:
# 4x3 frame of the floats 0.0..11.0 with columns 'b', 'd', 'e' and state-name row labels.
df2 = pd.DataFrame(
    data=np.arange(12, dtype=float).reshape(4, 3),
    index=['utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)

In [211]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [212]:
df2

Unnamed: 0,b,d,e
utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [213]:
#Adding these together returns a DataFrame whose index and columns are the unions of the ones in each DataFrame:
df1+df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
utah,,,,


In [215]:
#Arithmetic methods with fill values
#In arithmetic operations between differently indexed objects, you might want to fill with a special value, like 0,
#when an axis label is found in one object but not the other.


In [217]:
# df1 is 3x4 (columns a-d); df2 is 4x5 (columns a-e), so df2 has one extra row and one extra column.
df1 = pd.DataFrame(data=np.arange(12, dtype=float).reshape(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(data=np.arange(20, dtype=float).reshape(4, 5), columns=['a', 'b', 'c', 'd', 'e'])

In [218]:
df2.loc[1,'b']=np.nan

In [219]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [220]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [221]:
#Adding these together results in NA values in the locations that don't overlap:

In [222]:
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [223]:
df1.add(df2,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [224]:
1/df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [225]:
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [226]:
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


In [228]:
#NumPy Ufuncs
#Numpy Ufuncs(element-wise array methods) also work with pandas obj.


In [229]:
# Draw the 4x3 standard-normal sample from a locally seeded generator so that the
# frame (and everything derived from it) is identical on every Restart & Run All;
# an unseeded np.random.randn call yields different values on each run.
frame = pd.DataFrame(np.random.RandomState(42).randn(4, 3), columns=list('bde'),
                     index=['utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
utah,-0.098159,0.19475,0.243464
Ohio,1.168727,1.826946,0.763344
Texas,-0.904684,0.922338,-0.181218
Oregon,-0.565404,0.280344,0.061876


In [230]:
np.abs(frame)

Unnamed: 0,b,d,e
utah,0.098159,0.19475,0.243464
Ohio,1.168727,1.826946,0.763344
Texas,0.904684,0.922338,0.181218
Oregon,0.565404,0.280344,0.061876


In [231]:
#To sort lexicographically by row or column index, use the sort_index method, which returns a new, sorted object.

In [232]:
# 2x4 frame of the integers 0..7 with deliberately unsorted row and column labels.
frame = pd.DataFrame(
    data=np.arange(8).reshape(2, 4),
    index=['three', 'one'],
    columns=list('dabc'),
)

In [233]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [234]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [235]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [236]:
#SORTING
#To sort a Series by its values, use its sort_values method . Any missing values are sorted to the end of the series by default

In [237]:
obj =pd.Series([4, np.nan, 7, np.nan, -3,2])

In [238]:
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [239]:
#When sorting a DataFrame, you can use the data in one or more columns as the sort keys.

In [240]:
frame=pd.DataFrame({'b':[4,7,-3,2], 'a':[0,1,0,1]})

In [241]:
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [242]:
frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [243]:
#To sort by multiple columns, pass a list of names.
frame.sort_values(by=['a','b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [244]:
#Ranking assigns ranks from one through the number of valid data points in an array. The rank methods for Series and DataFrame
#are the place to look; by default rank breaks ties by assigning each group the mean rank.


In [245]:
obj=pd.Series([7,-5,7,4,2,0,4])

In [246]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [247]:
#Ranks can also be assigned according to the order in which they're observed in the data:

In [248]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64