# PANDAS

#### Designed to make data cleaning and analysis fast and easy in Python. pandas is built to work with tabular and heterogeneous data, unlike NumPy, which works with homogeneous numerical data.


In [1]:
import numpy as np

import pandas as pd

from pandas import Series, DataFrame

Pandas has two main data structures:
1. Series
2. Dataframe


### Series  

A Series is a one-dimensional, array-like object containing a sequence of values and an associated array of data labels, called its index.

In [2]:
# The constructor is spelled pd.Series with a capital S (Python names are case sensitive)
obj = pd.Series([1, 2, 3, 4, 5, 6, -7])
obj  # a bare last expression is displayed by the notebook, so print() is not needed

0    1
1    2
2    3
3    4
4    5
5    6
6   -7
dtype: int64


In [3]:
obj.values # .values gets array representation

array([ 1,  2,  3,  4,  5,  6, -7])

In [4]:
obj.index # .index gets the index object

RangeIndex(start=0, stop=7, step=1)

In [5]:
# Build a Series whose index holds user-chosen labels instead of the default 0..n-1

obj2 = pd.Series(data=[1, 2, 3, 4, 5], index=list('bdeac'))
obj2

b    1
d    2
e    3
a    4
c    5
dtype: int64

In [6]:
obj2.index

Index(['b', 'd', 'e', 'a', 'c'], dtype='object')

In [7]:
#get values by calling out index

obj2['e']


3

In [8]:
obj2[['c', 'a', 'd']]

c    5
a    4
d    2
dtype: int64

In [9]:
# Can perform maths on series , and boolean filtering ; preserving index values link

obj2[obj2 > 2]

e    3
a    4
c    5
dtype: int64

In [10]:
obj2 * 2 

b     2
d     4
e     6
a     8
c    10
dtype: int64

In [11]:
np.max(obj2)

5

In [12]:
np.exp(obj2)

b      2.718282
d      7.389056
e     20.085537
a     54.598150
c    148.413159
dtype: float64

In [13]:
'a' in obj2

True

In [14]:
2 in obj2

False

## Another way of thinking about a Series is as a fixed-length, ordered dictionary that maps index values to data values.

### A dict can also be passed directly to the Series constructor

In [15]:
player_data = {'kohli': 'batsmen', 'bumrah':'bowler', 'pant':'keeper', 'chahal':'spinner', 'hardik':'all-rounder'}

In [16]:
series_data = pd.Series(player_data)

In [17]:
series_data

kohli         batsmen
bumrah         bowler
pant           keeper
chahal        spinner
hardik    all-rounder
dtype: object

In [18]:
# change the order of index , by choosing key order  

my_index = ['kohli', 'pant', 'hardik', 'chahal', 'bumrah', 'shami']

updated_data = pd.Series(player_data, index=my_index)

In [19]:
updated_data

kohli         batsmen
pant           keeper
hardik    all-rounder
chahal        spinner
bumrah         bowler
shami             NaN
dtype: object

In [20]:
pd.isnull(updated_data) #check for any missing values   

kohli     False
pant      False
hardik    False
chahal    False
bumrah    False
shami      True
dtype: bool

In [21]:
pd.notnull(updated_data) #check for no missing data 

kohli      True
pant       True
hardik     True
chahal     True
bumrah     True
shami     False
dtype: bool

In [22]:
# isnull and notnull are available both as top-level pandas functions (pd.isnull(...)) and as Series methods

updated_data.isnull()

kohli     False
pant      False
hardik    False
chahal    False
bumrah    False
shami      True
dtype: bool

In [23]:
# Build a Series straight from a dict: the keys become the index, the values become the data
sample_dict = dict(a=21, c=33, e=1, f=61, b=12, d=99)

sample_series = pd.Series(data=sample_dict)

sample_series

a    21
c    33
e     1
f    61
b    12
d    99
dtype: int64

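Both the Series object itself and its index have a `name` attribute, which is useful for identifying what the data represents. The index labels themselves can also be replaced in place by assignment, as the next cell shows.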

In [24]:
# Assigning a list to .index relabels the Series in place; the data values are left untouched
sample_series.index = ['key1','key2','key3','key4','key5','key6']

sample_series

key1    21
key2    33
key3     1
key4    61
key5    12
key6    99
dtype: int64

### DataFrame

A DataFrame represents a rectangular table of data and contains an ordered collection of columns, each of which
can be a different type (numeric, string, boolean). The DataFrame has both a row index and a column index.
A DataFrame can be thought of as a dict of Series that all share the same index.

In [25]:
# Column-oriented input: every key becomes a column and every list holds that column's values
data = dict(
    country=['India', 'Austria', 'Usa', 'Iran', 'Tasmania', 'Angola', 'Chile'],
    continent=['Asia', 'Europe', 'North America', 'Middle East', 'Australia', 'Africa', 'South America'],
    population=[120934523, 2398762, 15000000, 54637263, 309287, 31092832, 27894562],
)

dataframe = pd.DataFrame(data=data)

dataframe

Unnamed: 0,country,continent,population
0,India,Asia,120934523
1,Austria,Europe,2398762
2,Usa,North America,15000000
3,Iran,Middle East,54637263
4,Tasmania,Australia,309287
5,Angola,Africa,31092832
6,Chile,South America,27894562


In [26]:
# can specify a sequence of column

pd.DataFrame(data, columns =['continent', 'country', 'population'])

Unnamed: 0,continent,country,population
0,Asia,India,120934523
1,Europe,Austria,2398762
2,North America,Usa,15000000
3,Middle East,Iran,54637263
4,Australia,Tasmania,309287
5,Africa,Angola,31092832
6,South America,Chile,27894562


In [27]:
dataframe[['country', 'population']]  # double square brackets if require more than one column

Unnamed: 0,country,population
0,India,120934523
1,Austria,2398762
2,Usa,15000000
3,Iran,54637263
4,Tasmania,309287
5,Angola,31092832
6,Chile,27894562


In [28]:
# Requesting a column that is not present in `data` ('debt') creates it filled with NaN
frame2 = pd.DataFrame(
    data,
    index=['one', 'two', 'three', 'four', 'five', 'six', 'seven'],
    columns=['country', 'continent', 'population', 'debt'],
)

frame2

Unnamed: 0,country,continent,population,debt
one,India,Asia,120934523,
two,Austria,Europe,2398762,
three,Usa,North America,15000000,
four,Iran,Middle East,54637263,
five,Tasmania,Australia,309287,
six,Angola,Africa,31092832,
seven,Chile,South America,27894562,


In [29]:
# a column can be retrieved either with dict-style notation (frame2['country']) or as an attribute (frame2.country)

frame2['country']

one         India
two       Austria
three         Usa
four         Iran
five     Tasmania
six        Angola
seven       Chile
Name: country, dtype: object

In [30]:
frame2.country

one         India
two       Austria
three         Usa
four         Iran
five     Tasmania
six        Angola
seven       Chile
Name: country, dtype: object

In [31]:
pd.isnull(frame2)

Unnamed: 0,country,continent,population,debt
one,False,False,False,True
two,False,False,False,True
three,False,False,False,True
four,False,False,False,True
five,False,False,False,True
six,False,False,False,True
seven,False,False,False,True


In [32]:
# rows are obtained by using the special loc attribute 

frame2.loc['two']

country       Austria
continent      Europe
population    2398762
debt              NaN
Name: two, dtype: object

In [33]:
# empty columns can be assigned a value or an array of values 

frame2['debt'] = ['€32m', '€10m', '€1.2b', '€30k', '€420m', '€88m', '€2.3b']

In [34]:
frame2

Unnamed: 0,country,continent,population,debt
one,India,Asia,120934523,€32m
two,Austria,Europe,2398762,€10m
three,Usa,North America,15000000,€1.2b
four,Iran,Middle East,54637263,€30k
five,Tasmania,Australia,309287,€420m
six,Angola,Africa,31092832,€88m
seven,Chile,South America,27894562,€2.3b


In [35]:
frame2.index = [1,2,3,4,5,6,7] # changed index values 

In [36]:
frame2

Unnamed: 0,country,continent,population,debt
1,India,Asia,120934523,€32m
2,Austria,Europe,2398762,€10m
3,Usa,North America,15000000,€1.2b
4,Iran,Middle East,54637263,€30k
5,Tasmania,Australia,309287,€420m
6,Angola,Africa,31092832,€88m
7,Chile,South America,27894562,€2.3b


In [37]:
frame3 = pd.DataFrame(frame2, columns = ['country', 'continent', 'population','debt','literacy rate'])

In [38]:
frame3

Unnamed: 0,country,continent,population,debt,literacy rate
1,India,Asia,120934523,€32m,
2,Austria,Europe,2398762,€10m,
3,Usa,North America,15000000,€1.2b,
4,Iran,Middle East,54637263,€30k,
5,Tasmania,Australia,309287,€420m,
6,Angola,Africa,31092832,€88m,
7,Chile,South America,27894562,€2.3b,


In [39]:
# can add a series into a dataframe , also specifying index values where data is inserted , leaving others NaN

literacy_vals = pd.Series(['62%','90%','42%','87%'], index = [1,2,6,3])

In [40]:
frame3['literacy rate'] = literacy_vals # add series data into specified dataframe column

In [41]:
frame3

Unnamed: 0,country,continent,population,debt,literacy rate
1,India,Asia,120934523,€32m,62%
2,Austria,Europe,2398762,€10m,90%
3,Usa,North America,15000000,€1.2b,87%
4,Iran,Middle East,54637263,€30k,
5,Tasmania,Australia,309287,€420m,
6,Angola,Africa,31092832,€88m,42%
7,Chile,South America,27894562,€2.3b,


In [42]:
# create a new column by assigning a list with one value per row;
# only frame3 is assigned to here, so frame2 and its index are left untouched
frame3['landlocked'] = ['no', 'yes', 'no', 'yes', 'no', 'yes', 'no']
frame3

Unnamed: 0,country,continent,population,debt,literacy rate,landlocked
1,India,Asia,120934523,€32m,62%,no
2,Austria,Europe,2398762,€10m,90%,yes
3,Usa,North America,15000000,€1.2b,87%,no
4,Iran,Middle East,54637263,€30k,,yes
5,Tasmania,Australia,309287,€420m,,no
6,Angola,Africa,31092832,€88m,42%,yes
7,Chile,South America,27894562,€2.3b,,no


In [43]:
# delete a column with the del statement; this removes it from frame3 in place

del frame3['landlocked']

frame3.columns

Index(['country', 'continent', 'population', 'debt', 'literacy rate'], dtype='object')

In [44]:
frame3

Unnamed: 0,country,continent,population,debt,literacy rate
1,India,Asia,120934523,€32m,62%
2,Austria,Europe,2398762,€10m,90%
3,Usa,North America,15000000,€1.2b,87%
4,Iran,Middle East,54637263,€30k,
5,Tasmania,Australia,309287,€420m,
6,Angola,Africa,31092832,€88m,42%
7,Chile,South America,27894562,€2.3b,


Another common form of input data for a DataFrame is a nested dict of dicts.

In [45]:
# Dict of dicts: the outer keys become the columns and the inner keys become the row index.
# A key missing from one inner dict (here 'nuke' for India) shows up as NaN in that column.
nested = dict([
    ('India', dict(leader='modi', gov='democratic')),
    ('North Korea', dict(leader='kim jung un', gov='dictator', nuke='active')),
])

nested_data = pd.DataFrame(data=nested)

nested_data

Unnamed: 0,India,North Korea
leader,modi,kim jung un
gov,democratic,dictator
nuke,,active


In [46]:
nested_data.T # .T returns the transpose: the rows become columns and the columns become rows

Unnamed: 0,leader,gov,nuke
India,modi,democratic,
North Korea,kim jung un,dictator,active


In [47]:
# Small multiplication table as a dict of dicts:
# column key c (1..3) maps each row key r (1..5) to r * (c + 9), i.e. multiples of 10, 11 and 12
new = {col: {row: row * (col + 9) for row in range(1, 6)} for col in (1, 2, 3)}

newer = pd.DataFrame(data=new)

newer

Unnamed: 0,1,2,3
1,10,11,12
2,20,22,24
3,30,33,36
4,40,44,48
5,50,55,60


In [48]:
# Slice off chunks of two columns of `newer` and combine them into a new DataFrame.
# .iloc makes the slicing explicitly positional: the index labels of `newer` are integers too,
# so a plain [:2] is easy to misread as a label-based slice.
pdata = {'x10': newer[1].iloc[:-1],  # every row of column 1 except the last
         'x12': newer[3].iloc[:2]}   # only the first two rows of column 3

# The two Series are aligned on their index labels; rows that 'x12' does not have become NaN
pd_frame = pd.DataFrame(pdata)

pd_frame

Unnamed: 0,x10,x12
1,10,12.0
2,20,24.0
3,30,
4,40,


In [49]:
# give the row index and the column index their own names (shown as headers when the frame is displayed)

pd_frame.index.name = 'nums'
pd_frame.columns.name = 'multiplied by'

In [50]:
pd_frame

multiplied by,x10,x12
nums,Unnamed: 1_level_1,Unnamed: 2_level_1
1,10,12.0
2,20,24.0
3,30,
4,40,


In [51]:
pd_frame.values # .values returns a 2d array of values 

array([[10., 12.],
       [20., 24.],
       [30., nan],
       [40., nan]])

In [52]:
type(pd_frame.values)

numpy.ndarray

In [53]:
frame3.values

array([['India', 'Asia', 120934523, '€32m', '62%'],
       ['Austria', 'Europe', 2398762, '€10m', '90%'],
       ['Usa', 'North America', 15000000, '€1.2b', '87%'],
       ['Iran', 'Middle East', 54637263, '€30k', nan],
       ['Tasmania', 'Australia', 309287, '€420m', nan],
       ['Angola', 'Africa', 31092832, '€88m', '42%'],
       ['Chile', 'South America', 27894562, '€2.3b', nan]], dtype=object)

### Index Objects 

Pandas Index objects are responsible for holding the axis labels and other metadata. Any array or sequence of labels used when constructing a Series or DataFrame is internally converted into an Index.

In [54]:
ss = pd.Series(range(3), index=['a','b','c'])

In [55]:
index = ss.index

In [56]:
index

Index(['a', 'b', 'c'], dtype='object')

In [57]:
index[-1]

'c'

In [58]:
index[:-1]

Index(['a', 'b'], dtype='object')

In [59]:
# index objects are immutable 