# PANDAS

#### pandas is designed to make data cleaning and analysis fast and easy in Python. It works with tabular and heterogeneous data, unlike NumPy, which is best suited to homogeneous numerical array data.


In [137]:
import numpy as np

import pandas as pd

from pandas import Series, DataFrame

pandas has two main data structures:
1. Series
2. DataFrame


### Series  

A Series is a one-dimensional, array-like object containing a sequence of values and an associated array of data labels, called its index.

In [138]:
obj = pd.Series([1,2,3,4,5,6,-7]) # the class name is case-sensitive: it is pd.Series, not pd.series
print(obj)

0    1
1    2
2    3
3    4
4    5
5    6
6   -7
dtype: int64


In [139]:
obj.values # .values gets array representation

array([ 1,  2,  3,  4,  5,  6, -7])

In [140]:
obj.index # .index gets the index object

RangeIndex(start=0, stop=7, step=1)

In [141]:
# Build a Series whose index labels are supplied explicitly instead of the default 0..n-1.
obj2 = pd.Series(
    data=[1, 2, 3, 4, 5],
    index=list('bdeac'),
)
obj2

b    1
d    2
e    3
a    4
c    5
dtype: int64

In [142]:
obj2.index

Index(['b', 'd', 'e', 'a', 'c'], dtype='object')

In [143]:
# get a single value by its index label

obj2['e']


3

In [144]:
obj2[['c', 'a', 'd']]

c    5
a    4
d    2
dtype: int64

In [145]:
# Arithmetic and boolean filtering on a Series keep every value attached to its index label

obj2[obj2 > 2]

e    3
a    4
c    5
dtype: int64

In [146]:
obj2 * 2 

b     2
d     4
e     6
a     8
c    10
dtype: int64

In [147]:
np.max(obj2)

5

In [148]:
np.exp(obj2)

b      2.718282
d      7.389056
e     20.085537
a     54.598150
c    148.413159
dtype: float64

In [149]:
'a' in obj2

True

In [150]:
2 in obj2

False

## Another way to think about a Series is as a fixed-length, ordered dictionary that maps index values to data values.

### A dict can also be passed directly to the Series constructor

In [151]:
# Map each player's name to their role in the team.
player_data = {
    'kohli': 'batsmen',
    'bumrah': 'bowler',
    'pant': 'keeper',
    'chahal': 'spinner',
    'hardik': 'all-rounder',
}

In [152]:
series_data = pd.Series(player_data)

In [153]:
series_data

kohli         batsmen
bumrah         bowler
pant           keeper
chahal        spinner
hardik    all-rounder
dtype: object

In [154]:
# Passing an explicit index together with a dict selects and orders the entries by those labels;
# a label with no matching key ('shami') gets NaN. (.reindex() can also be used, see the Index section.)

my_index = ['kohli', 'pant', 'hardik', 'chahal', 'bumrah', 'shami']

updated_data = pd.Series(player_data, index=my_index)

In [155]:
updated_data

kohli         batsmen
pant           keeper
hardik    all-rounder
chahal        spinner
bumrah         bowler
shami             NaN
dtype: object

In [156]:
pd.isnull(updated_data) #check for any missing values   

kohli     False
pant      False
hardik    False
chahal    False
bumrah    False
shami      True
dtype: bool

In [157]:
pd.notnull(updated_data) #check for no missing data 

kohli      True
pant       True
hardik     True
chahal     True
bumrah     True
shami     False
dtype: bool

In [158]:
# isnull and notnull are also available as methods on the Series itself

updated_data.isnull()

kohli     False
pant      False
hardik    False
chahal    False
bumrah    False
shami      True
dtype: bool

In [159]:
# Build a Series from a dict: the keys become the index, in insertion order.
sample_dict = dict(a=21, c=33, e=1, f=61, b=12, d=99)
sample_series = pd.Series(sample_dict)

sample_series

a    21
c    33
e     1
f    61
b    12
d    99
dtype: int64

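Both the Series object and its index have a `name` attribute, which is useful for identifying what the data represents. The index labels themselves can be replaced in place by assigning a new sequence of the same length, as shown below.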

In [160]:
sample_series.index = ['key1','key2','key3','key4','key5','key6']

sample_series

key1    21
key2    33
key3     1
key4    61
key5    12
key6    99
dtype: int64

### DataFrame

A DataFrame represents a rectangular table of data and contains an ordered collection of columns, each of which 
can be a different type (numeric, string, boolean). The DataFrame has both a row index and a column index.
A DataFrame can be thought of as a dict of Series that all share the same index.

In [161]:
# Column-oriented input: each key is a column name and each list holds that column's values.
data = dict(
    country=['India', 'Austria', 'Usa', 'Iran', 'Tasmania', 'Angola', 'Chile'],
    continent=['Asia', 'Europe', 'North America', 'Middle East', 'Australia', 'Africa', 'South America'],
    population=[120934523, 2398762, 15000000, 54637263, 309287, 31092832, 27894562],
)
dataframe = pd.DataFrame(data)

dataframe

Unnamed: 0,country,continent,population
0,India,Asia,120934523
1,Austria,Europe,2398762
2,Usa,North America,15000000
3,Iran,Middle East,54637263
4,Tasmania,Australia,309287
5,Angola,Africa,31092832
6,Chile,South America,27894562


In [162]:
# passing a sequence to `columns` fixes the order in which the columns are arranged

pd.DataFrame(data, columns =['continent', 'country', 'population'])

Unnamed: 0,continent,country,population
0,Asia,India,120934523
1,Europe,Austria,2398762
2,North America,Usa,15000000
3,Middle East,Iran,54637263
4,Australia,Tasmania,309287
5,Africa,Angola,31092832
6,South America,Chile,27894562


In [163]:
dataframe[['country', 'population']]  # double square brackets if require more than one column

Unnamed: 0,country,population
0,India,120934523
1,Austria,2398762
2,Usa,15000000
3,Iran,54637263
4,Tasmania,309287
5,Angola,31092832
6,Chile,27894562


In [164]:
# 'debt' is not a key of `data`, so that column is created empty and shows NaN in every row.
frame2 = pd.DataFrame(
    data,
    columns=['country', 'continent', 'population', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six', 'seven'],
)

frame2

Unnamed: 0,country,continent,population,debt
one,India,Asia,120934523,
two,Austria,Europe,2398762,
three,Usa,North America,15000000,
four,Iran,Middle East,54637263,
five,Tasmania,Australia,309287,
six,Angola,Africa,31092832,
seven,Chile,South America,27894562,


In [165]:
# column data can be retrieved either with dict-like notation or as an attribute

frame2['country']

one         India
two       Austria
three         Usa
four         Iran
five     Tasmania
six        Angola
seven       Chile
Name: country, dtype: object

In [166]:
frame2.country

one         India
two       Austria
three         Usa
four         Iran
five     Tasmania
six        Angola
seven       Chile
Name: country, dtype: object

In [167]:
pd.isnull(frame2)

Unnamed: 0,country,continent,population,debt
one,False,False,False,True
two,False,False,False,True
three,False,False,False,True
four,False,False,False,True
five,False,False,False,True
six,False,False,False,True
seven,False,False,False,True


In [168]:
# rows are obtained by using the special loc attribute 

frame2.loc['two']

country       Austria
continent      Europe
population    2398762
debt              NaN
Name: two, dtype: object

In [169]:
# empty columns can be assigned a value or an array of values 

frame2['debt'] = ['€32m', '€10m', '€1.2b', '€30k', '€420m', '€88m', '€2.3b']

In [170]:
frame2

Unnamed: 0,country,continent,population,debt
one,India,Asia,120934523,€32m
two,Austria,Europe,2398762,€10m
three,Usa,North America,15000000,€1.2b
four,Iran,Middle East,54637263,€30k
five,Tasmania,Australia,309287,€420m
six,Angola,Africa,31092832,€88m
seven,Chile,South America,27894562,€2.3b


In [171]:
frame2.index = [1,2,3,4,5,6,7] # changed index values 

In [172]:
frame2

Unnamed: 0,country,continent,population,debt
1,India,Asia,120934523,€32m
2,Austria,Europe,2398762,€10m
3,Usa,North America,15000000,€1.2b
4,Iran,Middle East,54637263,€30k
5,Tasmania,Australia,309287,€420m
6,Angola,Africa,31092832,€88m
7,Chile,South America,27894562,€2.3b


In [173]:
frame3 = pd.DataFrame(frame2, columns = ['country', 'continent', 'population','debt','literacy rate'])

In [174]:
frame3

Unnamed: 0,country,continent,population,debt,literacy rate
1,India,Asia,120934523,€32m,
2,Austria,Europe,2398762,€10m,
3,Usa,North America,15000000,€1.2b,
4,Iran,Middle East,54637263,€30k,
5,Tasmania,Australia,309287,€420m,
6,Angola,Africa,31092832,€88m,
7,Chile,South America,27894562,€2.3b,


In [175]:
# can add a series into a dataframe , also specifying index values where data is inserted , leaving others NaN

literacy_vals = pd.Series(['62%','90%','42%','87%'], index = [1,2,6,3])

In [176]:
frame3['literacy rate'] = literacy_vals # add series data into specified dataframe column

In [177]:
frame3

Unnamed: 0,country,continent,population,debt,literacy rate
1,India,Asia,120934523,€32m,62%
2,Austria,Europe,2398762,€10m,90%
3,Usa,North America,15000000,€1.2b,87%
4,Iran,Middle East,54637263,€30k,
5,Tasmania,Australia,309287,€420m,
6,Angola,Africa,31092832,€88m,42%
7,Chile,South America,27894562,€2.3b,


In [178]:
# create a new column by assigning a list with one value per row
# (the assignment targets frame3 only: chaining it through `frame2.country.index = ...`
#  would also overwrite the index labels of frame2's 'country' Series)
frame3['landlocked'] = ['no', 'yes', 'no', 'yes', 'no', 'yes', 'no']
frame3

Unnamed: 0,country,continent,population,debt,literacy rate,landlocked
1,India,Asia,120934523,€32m,62%,no
2,Austria,Europe,2398762,€10m,90%,yes
3,Usa,North America,15000000,€1.2b,87%,no
4,Iran,Middle East,54637263,€30k,,yes
5,Tasmania,Australia,309287,€420m,,no
6,Angola,Africa,31092832,€88m,42%,yes
7,Chile,South America,27894562,€2.3b,,no


In [179]:
# delete a column with the del statement (this modifies frame3 in place)

del frame3['landlocked']

frame3.columns

Index(['country', 'continent', 'population', 'debt', 'literacy rate'], dtype='object')

In [180]:
frame3

Unnamed: 0,country,continent,population,debt,literacy rate
1,India,Asia,120934523,€32m,62%
2,Austria,Europe,2398762,€10m,90%
3,Usa,North America,15000000,€1.2b,87%
4,Iran,Middle East,54637263,€30k,
5,Tasmania,Australia,309287,€420m,
6,Angola,Africa,31092832,€88m,42%
7,Chile,South America,27894562,€2.3b,


Another common form of input data for a DataFrame is a nested dict of dicts.

In [181]:
# Outer dict keys become the columns and inner dict keys become the row index;
# a key missing from one inner dict ('nuke' for India) shows up as NaN.
nested = {
    'India': dict(leader='modi', gov='democratic'),
    'North Korea': dict(leader='kim jung un', gov='dictator', nuke='active'),
}
nested_data = pd.DataFrame(nested)

nested_data

Unnamed: 0,India,North Korea
leader,modi,kim jung un
gov,democratic,dictator
nuke,,active


In [182]:
nested_data.T # .T transposes the frame: rows become columns and columns become rows

Unnamed: 0,leader,gov,nuke
India,modi,democratic,
North Korea,kim jung un,dictator,active


In [183]:
# Column c holds the first five multiples of (c + 9), i.e. of 10, 11 and 12, keyed by row 1..5.
new = {
    col: {row: row * (col + 9) for row in range(1, 6)}
    for col in range(1, 4)
}
newer = pd.DataFrame(new)

newer

Unnamed: 0,1,2,3
1,10,11,12
2,20,22,24
3,30,33,36
4,40,44,48
5,50,55,60


In [184]:
# Take positional slices of two columns of `newer` and combine them into a new frame.
# .iloc makes it explicit that the slices are by position (the index labels of `newer`
# are integers too, so a bare [] slice is easy to misread as a label slice).
# The shorter slice is aligned on the index and padded with NaN.
pdata = {'x10': newer[1].iloc[:-1],
        'x12': newer[3].iloc[:2]}

pd_frame = pd.DataFrame(pdata)

pd_frame

Unnamed: 0,x10,x12
1,10,12.0
2,20,24.0
3,30,
4,40,


In [185]:
# give a name to the row index and to the column index
pd_frame.index.name = 'nums'
pd_frame.columns.name = 'multiplied by'

In [186]:
pd_frame

multiplied by,x10,x12
nums,Unnamed: 1_level_1,Unnamed: 2_level_1
1,10,12.0
2,20,24.0
3,30,
4,40,


In [187]:
pd_frame.values # .values returns a 2d array of values 

array([[10., 12.],
       [20., 24.],
       [30., nan],
       [40., nan]])

In [188]:
type(pd_frame.values)

numpy.ndarray

In [189]:
frame3.values

array([['India', 'Asia', 120934523, '€32m', '62%'],
       ['Austria', 'Europe', 2398762, '€10m', '90%'],
       ['Usa', 'North America', 15000000, '€1.2b', '87%'],
       ['Iran', 'Middle East', 54637263, '€30k', nan],
       ['Tasmania', 'Australia', 309287, '€420m', nan],
       ['Angola', 'Africa', 31092832, '€88m', '42%'],
       ['Chile', 'South America', 27894562, '€2.3b', nan]], dtype=object)

### Index Objects 

pandas Index objects are responsible for holding the axis labels and other metadata. Any array or sequence of labels used when constructing a Series or DataFrame is internally converted to an Index.

In [190]:
ss = pd.Series(range(3), index=['a','b','c'])

In [191]:
index = ss.index

In [192]:
index

Index(['a', 'b', 'c'], dtype='object')

In [193]:
index[-1]

'c'

In [194]:
index[:-1]

Index(['a', 'b'], dtype='object')

In [195]:
# index objects are immutable 

In [196]:
labels = pd.Index(np.arange(5)) # an Index can be built on its own and then passed as the index of a Series

In [197]:
labels

Int64Index([0, 1, 2, 3, 4], dtype='int64')

In [198]:
labelled_data = pd.Series(['selector','director','management','coaching staff','captain'], index=labels)

labelled_data

0          selector
1          director
2        management
3    coaching staff
4           captain
dtype: object

In [199]:
frame3

Unnamed: 0,country,continent,population,debt,literacy rate
1,India,Asia,120934523,€32m,62%
2,Austria,Europe,2398762,€10m,90%
3,Usa,North America,15000000,€1.2b,87%
4,Iran,Middle East,54637263,€30k,
5,Tasmania,Australia,309287,€420m,
6,Angola,Africa,31092832,€88m,42%
7,Chile,South America,27894562,€2.3b,


In [200]:
frame3.columns

Index(['country', 'continent', 'population', 'debt', 'literacy rate'], dtype='object')

In [201]:
'debt' in frame3.columns

True

In [202]:
4 in frame3.index

True

In [None]:
# using the reindex() method, we can conform the data to a new index sequence, introducing missing values where necessary

In [214]:
team_structure = pd.Series(['senior','keeper','captain','gun player','vice'], index=[1,2,3,4,5])
team_structure

1        senior
2        keeper
3       captain
4    gun player
5          vice
dtype: object

In [218]:
# reorder the rows to the new label sequence; label 6 does not exist, so method='ffill'
# fills it from the closest preceding label (5 -> 'vice')
updated_structure = team_structure.reindex([3,5,1,4,2,6], method='ffill') 

updated_structure

3       captain
5          vice
1        senior
4    gun player
2        keeper
6          vice
dtype: object

In [222]:
# Six colours labelled 1..6; used to show how method='ffill' forward-fills missing labels.
colours = pd.Series(
    'orange yellow blue purple red green'.split(),
    index=list(range(1, 7)),
)

colours

1    orange
2    yellow
3      blue
4    purple
5       red
6     green
dtype: object

In [224]:
# labels 7, 8 and 9 are new, so each is forward-filled with the last existing value ('green')
colours_2 = colours.reindex([1,2,3,4,5,6,7,8,9], method='ffill')

colours_2

1    orange
2    yellow
3      blue
4    purple
5       red
6     green
7     green
8     green
9     green
dtype: object

In [225]:
# reindex() can alter the rows, the columns, or both; start from a 3x3 frame that has no row 'b'.
frame = pd.DataFrame(
    np.arange(9).reshape((3, 3)),
    index=list('acd'),
    columns=['delhi', 'pune', 'kota'],
)

frame

Unnamed: 0,delhi,pune,kota
a,0,1,2
c,3,4,5
d,6,7,8


In [238]:
framed = frame.reindex(['a','b','c','d'])
framed

Unnamed: 0,delhi,pune,kota
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [239]:
cities = ['delhi', 'kota', 'pune']

framed.reindex(columns = cities)

Unnamed: 0,delhi,kota,pune
a,0.0,2.0,1.0
b,,,
c,3.0,5.0,4.0
d,6.0,8.0,7.0


In [246]:
# rows and columns can also be selected in a chosen order with .loc[rows, columns];
# unlike reindex(), .loc raises a KeyError for labels that are not already present

framed.loc[['b','a','d','c'], cities]

Unnamed: 0,delhi,kota,pune
b,,,
a,0.0,2.0,1.0
d,6.0,8.0,7.0
c,3.0,5.0,4.0


In [291]:
# One row per European capital; every list has seven entries, one per row.
p_data = {
    'Capital': ['Dublin', 'London', 'Berlin', 'Paris', 'Madrid', 'Rome', 'Amsterdam'],
    'Country': ['Ireland', 'UK', 'Germany', 'France', 'Spain', 'Italy', 'Netherlands'],
    'Icon food': ['Spuds', 'Crumpets', 'Pretzels', 'Croissants', 'Paella', 'Pizza', 'Cheese'],
    'Language': ['English', 'English', 'German', 'French', 'Spanish', 'Italian', 'Dutch'],
    'Covid vaccine': ['Pfizer', 'Pfizer', 'Moderna', 'Moderna', 'Astra', 'Moderna', 'Pfizer'],
    'People vaccinated %': [75, 88, 73, 71, 80, 69, 73],
    'Average salary (k)': [47, 55, 50, 45, 44, 39, 47],
}

# Label the rows 1..7 when the frame is built, instead of the default 0..6.
p_frame = pd.DataFrame(p_data, index=list(range(1, 8)))

p_frame

Unnamed: 0,Capital,Country,Icon food,Language,Covid vaccine,People vaccinated %,Average salary (k)
1,Dublin,Ireland,Spuds,English,Pfizer,75,47
2,London,UK,Crumpets,English,Pfizer,88,55
3,Berlin,Germany,Pretzels,German,Moderna,73,50
4,Paris,France,Croissants,French,Moderna,71,45
5,Madrid,Spain,Paella,Spanish,Astra,80,44
6,Rome,Italy,Pizza,Italian,Moderna,69,39
7,Amsterdam,Netherlands,Cheese,Dutch,Pfizer,73,47


In [292]:
# reorder the rows and columns of p_frame with .loc and store the result;
# note that this rebinds the name pd_frame, which was used earlier for a different frame

pd_frame = p_frame.loc[[2,5,1,7,3,4,6],['Country','Capital','Language','Icon food','Average salary (k)',
                                        'Covid vaccine', 'People vaccinated %']]

pd_frame

Unnamed: 0,Country,Capital,Language,Icon food,Average salary (k),Covid vaccine,People vaccinated %
2,UK,London,English,Crumpets,55,Pfizer,88
5,Spain,Madrid,Spanish,Paella,44,Astra,80
1,Ireland,Dublin,English,Spuds,47,Pfizer,75
7,Netherlands,Amsterdam,Dutch,Cheese,47,Pfizer,73
3,Germany,Berlin,German,Pretzels,50,Moderna,73
4,France,Paris,French,Croissants,45,Moderna,71
6,Italy,Rome,Italian,Pizza,39,Moderna,69


In [293]:
# The same reordering can be done with reindex(): one call reorders the rows, another the columns.
# The two calls below are independent of each other, so neither result alone equals pd_frame
# (only the column-reordered frame is displayed; its rows keep the original 1..7 order).

reindex_index_frame = p_frame.reindex([2,5,1,7,3,4,6]) 
# uncomment to inspect the row-reordered frame:
#print(reindex_index_frame)

reorder_column_frame = p_frame.reindex(columns=['Country','Capital','Language','Icon food','Average salary (k)',
                                        'Covid vaccine', 'People vaccinated %'])
reorder_column_frame

Unnamed: 0,Country,Capital,Language,Icon food,Average salary (k),Covid vaccine,People vaccinated %
1,Ireland,Dublin,English,Spuds,47,Pfizer,75
2,UK,London,English,Crumpets,55,Pfizer,88
3,Germany,Berlin,German,Pretzels,50,Moderna,73
4,France,Paris,French,Croissants,45,Moderna,71
5,Spain,Madrid,Spanish,Paella,44,Astra,80
6,Italy,Rome,Italian,Pizza,39,Moderna,69
7,Netherlands,Amsterdam,Dutch,Cheese,47,Pfizer,73


In [294]:
# delete the 'Icon food' column
# The membership check makes this cell safe to re-run: a bare `del` raises a KeyError
# once the column has already been removed from the frame.
if 'Icon food' in reorder_column_frame.columns:
    del reorder_column_frame['Icon food']

reorder_column_frame

Unnamed: 0,Country,Capital,Language,Average salary (k),Covid vaccine,People vaccinated %
1,Ireland,Dublin,English,47,Pfizer,75
2,UK,London,English,55,Pfizer,88
3,Germany,Berlin,German,50,Moderna,73
4,France,Paris,French,45,Moderna,71
5,Spain,Madrid,Spanish,44,Astra,80
6,Italy,Rome,Italian,39,Moderna,69
7,Netherlands,Amsterdam,Dutch,47,Pfizer,73


### To select specific rows of a DataFrame by label, use the loc attribute

In [203]:
# .loc is label-based: this selects the row labelled 'leader'.
# With a default integer index the labels are 0, 1, ...; use .iloc to select by position.
nested_data.loc['leader']

India                 modi
North Korea    kim jung un
Name: leader, dtype: object

In [204]:
nested_data.loc[['leader','gov']]

Unnamed: 0,India,North Korea
leader,modi,kim jung un
gov,democratic,dictator


In [205]:
# .to_string() renders the entire DataFrame as a single plain-text string

nested_data.to_string()

'             India  North Korea\nleader        modi  kim jung un\ngov     democratic     dictator\nnuke           NaN       active'

### To get a quick overview of the data use .head( ) to view the top rows, or .tail( ) to view the bottom rows. By default, 5 rows are shown.

In [206]:
frame3.head()

Unnamed: 0,country,continent,population,debt,literacy rate
1,India,Asia,120934523,€32m,62%
2,Austria,Europe,2398762,€10m,90%
3,Usa,North America,15000000,€1.2b,87%
4,Iran,Middle East,54637263,€30k,
5,Tasmania,Australia,309287,€420m,


### To get general information about a DataFrame, use .info( )

In [207]:
nested_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, leader to nuke
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   India        2 non-null      object
 1   North Korea  3 non-null      object
dtypes: object(2)
memory usage: 180.0+ bytes


In [208]:
frame3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7 entries, 1 to 7
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   country        7 non-null      object
 1   continent      7 non-null      object
 2   population     7 non-null      int64 
 3   debt           7 non-null      object
 4   literacy rate  4 non-null      object
dtypes: int64(1), object(4)
memory usage: 636.0+ bytes


In [295]:
# Five floats 0.0..4.0 labelled 'a'..'e'.
example_data = pd.Series(np.arange(5, dtype=float), index=list('abcde'))

example_data

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [297]:
# .drop() returns a new object with the given index label(s) removed

new_obj = example_data.drop('c')

new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [298]:
newer_obj = new_obj.drop(['a','e'])

newer_obj

b    1.0
d    3.0
dtype: float64

In [300]:
# 4x4 grid of the integers 0..15: one row per city, one column per die.
state_data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Galway', 'Limerick', 'Cork', 'Dublin'],
    columns=[f'dice{i}' for i in range(1, 5)],
)

state_data

Unnamed: 0,dice1,dice2,dice3,dice4
Galway,0,1,2,3
Limerick,4,5,6,7
Cork,8,9,10,11
Dublin,12,13,14,15


In [301]:
# using .drop() can also remove columns by mentioning the relative axis
# in a 2d array of data , axis=1 or axis='columns'

removed_state_data = state_data.drop('dice3', axis=1)

removed_state_data

Unnamed: 0,dice1,dice2,dice4
Galway,0,1,3
Limerick,4,5,7
Cork,8,9,11
Dublin,12,13,15


In [302]:
removed = removed_state_data.drop(['dice1', 'dice2'], axis='columns')

removed

Unnamed: 0,dice4
Galway,3
Limerick,7
Cork,11
Dublin,15


### Indexing , Selection , Filtering
Series indexing works much like NumPy array indexing, except that you can use the Series's index labels instead of only integers.

In [307]:
# Indexing a Series: start from five floats 0.0..4.0 labelled 'a'..'e'.
nums_ray = pd.Series(np.arange(5, dtype=float), index=list('abcde'))

nums_ray

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [308]:
nums_ray['b'] #used the series index 

1.0

In [306]:
nums_ray.iloc[1] # select by integer position; a bare integer in [] on a labelled Series is deprecated in recent pandas

1.0

In [310]:
nums_ray[['b','e','a']]

b    1.0
e    4.0
a    0.0
dtype: float64

In [311]:
nums_ray[2:-1]

c    2.0
d    3.0
dtype: float64

In [312]:
nums_ray.iloc[[1,3]] # select several entries by integer position; .iloc keeps this unambiguous on a labelled Series

b    1.0
d    3.0
dtype: float64

In [314]:
nums_ray[nums_ray > 2]

d    3.0
e    4.0
dtype: float64

In [315]:
nums_ray[nums_ray %2 == 0]

a    0.0
c    2.0
e    4.0
dtype: float64

In [316]:
# slicing with labels includes the end label ('c'), unlike positional slicing
nums_ray['a':'c']

a    0.0
b    1.0
c    2.0
dtype: float64

In [317]:
nums_ray['a':'c'] = 4

In [318]:
nums_ray

a    4.0
b    4.0
c    4.0
d    3.0
e    4.0
dtype: float64

In [319]:
nums_ray[['a','b','c']] = 0.0, 1.0, 2.0

In [320]:
nums_ray

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [321]:
# Indexing a DataFrame with [] retrieves one or more columns, given a single name or a list of names.
df = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New york'],
    columns=['one', 'two', 'three', 'four'],
)

df

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New york,12,13,14,15


In [322]:
df['two']

Ohio         1
Colorado     5
Utah         9
New york    13
Name: two, dtype: int64

In [323]:
df[['one','three']]

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New york,12,14


In [328]:
# a slice inside [] selects rows by position (here the rows at positions 1 and 2), not columns
df[1:3]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11


In [330]:
df[df['four'] > 7]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New york,12,13,14,15


In [334]:
four = df['four']          #returns the column 'four'

four

Ohio         3
Colorado     7
Utah        11
New york    15
Name: four, dtype: int64

In [336]:
four[four > 7]            # able to slice the column based on certain value

Utah        11
New york    15
Name: four, dtype: int64

In [337]:
df > 3

Unnamed: 0,one,two,three,four
Ohio,False,False,False,False
Colorado,True,True,True,True
Utah,True,True,True,True
New york,True,True,True,True


In [338]:
# Assign through a boolean mask: every cell smaller than 2 is replaced in place.
# Writing a string into integer columns turns the affected columns into object dtype, and
# re-running this cell then fails because the string 'zero' cannot be compared with 2.
df[df < 2] = 'zero'

In [339]:
df

Unnamed: 0,one,two,three,four
Ohio,zero,zero,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New york,12,13,14,15


#### For this kind of selection, a DataFrame behaves syntactically much like a 2D NumPy array




## Selecting with loc and iloc 

For label-based or position-based indexing on the rows and columns of a DataFrame, use loc and iloc. They let you select a subset of the rows and columns with NumPy-like notation, using either axis labels (loc) or integer positions (iloc).

In [342]:
df

Unnamed: 0,one,two,three,four
Ohio,zero,zero,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New york,12,13,14,15


In [343]:
df.loc['Colorado',['two','three']]               # 'colorado' is the row , and ['two', 'three'] are columns 

two      5
three    6
Name: Colorado, dtype: object

In [350]:
# iloc is the same , except it uses the integer index values 

df.iloc[1,[1,2]]

two      5
three    6
Name: Colorado, dtype: object

In [351]:
df.iloc[2] #returns row 2 -----> 'Utah'

one       8
two       9
three    10
four     11
Name: Utah, dtype: object

In [357]:
df.loc[:,['three']] #returns all data from column 'three' 

Unnamed: 0,three
Ohio,2
Colorado,6
Utah,10
New york,14


In [358]:
df.iloc[:,2] # same column selected by integer position; a single position returns a Series, whereas the list ['three'] returned a DataFrame

Ohio         2
Colorado     6
Utah        10
New york    14
Name: three, dtype: int64

In [359]:
df.iloc[[1,2],[3,0,1]]

Unnamed: 0,four,one,two
Colorado,7,4,5
Utah,11,8,9


In [361]:
df.loc[:'Utah', ['one','two']]

Unnamed: 0,one,two
Ohio,zero,zero
Colorado,4,5
Utah,8,9


In [362]:
df.iloc[:, :3]

Unnamed: 0,one,two,three
Ohio,zero,zero,2
Colorado,4,5,6
Utah,8,9,10
New york,12,13,14


In [363]:
# rows where column 'three' exceeds 6, restricted to the first three columns, in one .loc call
df.loc[df['three'] > 6, df.columns[:3]]

Unnamed: 0,one,two,three
Utah,8,9,10
New york,12,13,14
