In [3]:
import pandas as pd

In [5]:
obj = pd.Series([4,7,-5,3])

obj

0    4
1    7
2   -5
3    3
dtype: int64

In [8]:
obj.values
print(obj.index)

RangeIndex(start=0, stop=4, step=1)


In [10]:
obj2 = pd.Series([4,7,-5,3], index = ['d','b','a','c'])
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [13]:
obj2[['a','d','c']]

a   -5
d    4
c    3
dtype: int64

In [15]:
'a' in obj2

True

In [28]:
sdata = {'Lagos': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
obj3

Lagos      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [29]:
states = ['California', 'Lagos', 'Oregon', 'Texas', 'Utah']
obj4 = pd.Series(sdata, index=states)
obj4


California        NaN
Lagos          35000.0
Oregon        16000.0
Texas         71000.0
Utah           5000.0
dtype: float64

In [34]:
obj4['Lagos'] = 65000

obj4

California        NaN
Lagos          65000.0
Oregon        16000.0
Texas         71000.0
Utah           5000.0
dtype: float64

In [27]:
# pd.isnull(obj3)

pd.notnull(obj3)


California    False
Lagos           True
Oregon         True
Texas          True
Utah           True
dtype: bool

In [25]:
#to find the NaN positions, we could use pd.isnull() and pd.notnull()

obj3.isnull() #this is a series instance method
# obj3.notnull()  

California     True
Lagos          False
Oregon        False
Texas         False
Utah          False
dtype: bool

Operations on Series
* Addition
* Subtraction
* Division

In [35]:
# Addition
obj3 + obj4

California         NaN
Lagos          100000.0
Oregon         32000.0
Texas         142000.0
Utah           10000.0
dtype: float64

In [36]:
# Division
obj3/obj4

California         NaN
Lagos          0.538462
Oregon        1.000000
Texas         1.000000
Utah          1.000000
dtype: float64

In [37]:
# integer division
obj3//obj4

California    NaN
Lagos          0.0
Oregon        1.0
Texas         1.0
Utah          1.0
dtype: float64

In [41]:
# Both the Series object itself and its index have a name attribute,
obj4.name = 'population'
obj4.index.name = 'state'

obj4

state
California        NaN
Lagos          65000.0
Oregon        16000.0
Texas         71000.0
Utah           5000.0
Name: population, dtype: float64

DataFrame

A DataFrame represents a tabular, spreadsheet-like data structure containing an ordered
collection of columns, each of which can be a different value type (numeric,
string, boolean, etc.). The DataFrame has both a row and column index; it can be
thought of as a dict of Series (one for all sharing the same index)

In [68]:
data = {'state': ['Lagos', 'Lagos', 'Lagos', 'Kano', 'Kano'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Lagos,2000,1.5
1,Lagos,2001,1.7
2,Lagos,2002,3.6
3,Kano,2001,2.4
4,Kano,2002,2.9


In [69]:
pd.DataFrame(data, columns = ['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Lagos,1.5
1,2001,Lagos,1.7
2,2002,Lagos,3.6
3,2001,Kano,2.4
4,2002,Kano,2.9


In [70]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Lagos,1.5,
two,2001,Lagos,1.7,
three,2002,Lagos,3.6,
four,2001,Kano,2.4,
five,2002,Kano,2.9,


In [71]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [72]:
# A column in a DataFrame can be retrieved as a Series either by dict-like notation or by
# attribute:
frame2.state
# print(frame2['year'])
# print(frame2.year)


one      Lagos
two      Lagos
three    Lagos
four      Kano
five      Kano
Name: state, dtype: object

In [73]:
# Columns can be modified by assignment. For example, the empty 'debt' column could
# be assigned a scalar value or an array of values:
import numpy as np
frame2['debt'] = np.arange(5)
frame2


Unnamed: 0,year,state,pop,debt
one,2000,Lagos,1.5,0
two,2001,Lagos,1.7,1
three,2002,Lagos,3.6,2
four,2001,Kano,2.4,3
five,2002,Kano,2.9,4


In [74]:
frame2['debt'] = pd.Series([900,210,322], index = ['two','three', 'four'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Lagos,1.5,
two,2001,Lagos,1.7,900.0
three,2002,Lagos,3.6,210.0
four,2001,Kano,2.4,322.0
five,2002,Kano,2.9,


In [75]:
frame2 = pd.DataFrame(data, index =([1,2,3,4,5]))
frame2

Unnamed: 0,state,year,pop
1,Lagos,2000,1.5
2,Lagos,2001,1.7
3,Lagos,2002,3.6
4,Kano,2001,2.4
5,Kano,2002,2.9


In [76]:
# Assigning a column that doesn’t exist will create a new column. The del keyword will delete columns as with a dict:
frame2['western'] = frame2.state =='Lagos'
frame2

Unnamed: 0,state,year,pop,western
1,Lagos,2000,1.5,True
2,Lagos,2001,1.7,True
3,Lagos,2002,3.6,True
4,Kano,2001,2.4,False
5,Kano,2002,2.9,False


In [77]:
del frame2['western']

frame2.columns

Index(['state', 'year', 'pop'], dtype='object')

In [81]:
pop = {'Kano' : {2001: 2.4, 2002 : 2.9}, 'Lagos' : 
{2000 : 1.5, 2001: 1.7, 2002 : 3.6}}

frame3 = pd.DataFrame(pop, index = [2000,2001,2002])
frame3

Unnamed: 0,Kano,Lagos
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [82]:
# transpose the result:
frame3.T

Unnamed: 0,2000,2001,2002
Kano,,2.4,2.9
Lagos,1.5,1.7,3.6


In [85]:
frame3.index.name = 'year'; frame3.columns.name = 'States'
frame3

States,Kano,Lagos
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [90]:
frame2.values 

array([['Lagos', 2000, 1.5],
       ['Lagos', 2001, 1.7],
       ['Lagos', 2002, 3.6],
       ['Kano', 2001, 2.4],
       ['Kano', 2002, 2.9]], dtype=object)

Index Objects

pandas’s Index objects are responsible for holding the axis labels and other metadata (like the axis name or names).

In [91]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])
obj


a    0
b    1
c    2
dtype: int64

In [93]:
a_index = obj.index
a_index

Index(['a', 'b', 'c'], dtype='object')

In [95]:
# Index objects are immutable and thus can’t be modified by the user:
# a_index[1] ='d'


In [96]:
# Immutability is important so that Index objects can be safely shared among data structures:
b_index = pd.Index(np.arange(3))

obj2 = pd.Series([1.5,-2.5,0], index = b_index)

obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [97]:
obj2.index is b_index

True

Reindexing

A critical method on pandas objects is reindex, which means to create a new object with the data conformed to a new index.

In [100]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [101]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [103]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

Dropping entries from an axis

the drop method will return a new object with the indicated value or values deleted from an axis:

In [105]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [106]:
new_obj = obj.drop('c')

new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [107]:
obj.drop(['d','c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [121]:
data = pd.DataFrame(np.arange(16).reshape((4,4)), index = ['Lagos, Abuja, Niger, Jigawa'.split(',')], columns= ['1', '2', '3', '4'])
data

Unnamed: 0,1,2,3,4
Lagos,0,1,2,3
Abuja,4,5,6,7
Niger,8,9,10,11
Jigawa,12,13,14,15


In [122]:
data.drop(['Lagos'])

  data.drop(['Lagos'])


Unnamed: 0,1,2,3,4
Abuja,4,5,6,7
Niger,8,9,10,11
Jigawa,12,13,14,15


In [123]:
data

Unnamed: 0,1,2,3,4
Lagos,0,1,2,3
Abuja,4,5,6,7
Niger,8,9,10,11
Jigawa,12,13,14,15


In [127]:
# data.drop(columns= '2')
data.drop(['2'], axis = 1)

Unnamed: 0,1,3,4
Lagos,0,2,3
Abuja,4,6,7
Niger,8,10,11
Jigawa,12,14,15


Indexing, selection, and filtering

In [132]:
# Indexing
print(data['2'])
data[['2','4']]

Lagos       1
 Abuja      5
 Niger      9
 Jigawa    13
Name: 2, dtype: int32


Unnamed: 0,2,4
Lagos,1,3
Abuja,5,7
Niger,9,11
Jigawa,13,15


In [133]:
data[:2]

Unnamed: 0,1,2,3,4
Lagos,0,1,2,3
Abuja,4,5,6,7


In [135]:
data

Unnamed: 0,1,2,3,4
Lagos,0,1,2,3
Abuja,4,5,6,7
Niger,8,9,10,11
Jigawa,12,13,14,15


In [146]:
data[data > 5] = 0
data

Unnamed: 0,1,2,3,4
Lagos,0,1,2,3
Abuja,4,5,0,0
Niger,0,0,0,0
Jigawa,0,0,0,0


In [144]:
data[data['2'] >= 5]

Unnamed: 0,1,2,3,4
Abuja,4,5,6,7
Niger,8,9,10,11
Jigawa,12,13,14,15


In [2]:
import pandas as pd
pd.read_csv('survey.csv')


Unnamed: 0,Year,Industry_aggregation_NZSIOC,Industry_code_NZSIOC,Industry_name_NZSIOC,Units,Variable_code,Variable_name,Variable_category,Value,Industry_code_ANZSIC06
0,2020,Level 1,99999,All industries,Dollars (millions),H01,Total income,Financial performance,733258,ANZSIC06 divisions A-S (excluding classes K633...
1,2020,Level 1,99999,All industries,Dollars (millions),H04,"Sales, government funding, grants and subsidies",Financial performance,660630,ANZSIC06 divisions A-S (excluding classes K633...
2,2020,Level 1,99999,All industries,Dollars (millions),H05,"Interest, dividends and donations",Financial performance,54342,ANZSIC06 divisions A-S (excluding classes K633...
3,2020,Level 1,99999,All industries,Dollars (millions),H07,Non-operating income,Financial performance,18285,ANZSIC06 divisions A-S (excluding classes K633...
4,2020,Level 1,99999,All industries,Dollars (millions),H08,Total expenditure,Financial performance,654872,ANZSIC06 divisions A-S (excluding classes K633...
...,...,...,...,...,...,...,...,...,...,...
37075,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H37,Quick ratio,Financial ratios,52,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."
37076,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H38,Margin on sales of goods for resale,Financial ratios,40,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."
37077,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H39,Return on equity,Financial ratios,12,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."
37078,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H40,Return on total assets,Financial ratios,5,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."


In [3]:
pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/exam_scores.csv')


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,male,group B,bachelor's degree,standard,none,74,68,67
1,female,group C,some college,standard,completed,58,68,66
2,male,group C,some college,free/reduced,none,66,65,65
3,female,group D,bachelor's degree,free/reduced,none,74,75,73
4,male,group D,some college,standard,none,78,77,71
...,...,...,...,...,...,...,...,...
995,female,group C,some high school,standard,none,68,77,72
996,female,group E,some college,standard,none,98,81,94
997,female,group E,associate's degree,free/reduced,none,67,67,67
998,female,group C,high school,standard,none,63,68,70


In [4]:
exam_scores.shape

NameError: name 'exam_scores' is not defined