## Understanding NumPy Array

In [1]:
# Creating an array
import numpy as np
a = np.array([2,4,6,8,10])
print(a)

[ 2  4  6  8 10]


In [2]:
# Creating an array using arange()
import numpy as np
a = np.arange(1,11)
print(a)

[ 1  2  3  4  5  6  7  8  9 10]


In [3]:
import numpy as np

p = np.zeros((3,3))   # Create a 3x3 array of all zeros
print(p) 

q = np.ones((2,2))    # Create a 2x2 array of all ones
print(q)

r = np.full((2,2), 4)  # Create a 2x2 constant array filled with the value 4
print(r) 

s = np.eye(4)         # Create a 4x4 identity matrix (the argument is the number of rows)
print(s) 

t = np.random.random((3,3))  # Create a 3x3 array of random values; no seed is set here, so they change on each run
print(t)

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[[1. 1.]
 [1. 1.]]
[[4 4]
 [4 4]]
[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]
[[0.59054856 0.78109647 0.66402512]
 [0.86504473 0.45696972 0.02128842]
 [0.79086473 0.14487524 0.71841093]]


In [4]:
# Creating an array using arange()
import numpy as np
a = np.arange(1,11)
print(type(a))
print(a.dtype)

<class 'numpy.ndarray'>
int64


In [5]:
# Check the shape of the array (a tuple with the size along each dimension)
print(a.shape)

(10,)


In [6]:
a = np.array([[5,6],[7,8]])
print(a)

[[5 6]
 [7 8]]


In [7]:
print(a[0,0])

5


In [8]:
print(a[0,1])

6


In [9]:
print(a[1,0])

7


In [10]:
print(a[1,1])

8


## NumPy Array Numerical Data Types

In [11]:
print(np.float64(21))

21.0


In [12]:
print(np.int8(21.0)) 

21


In [13]:
# np.bool_ is NumPy's boolean scalar type and exists in every NumPy release;
# the bare `np.bool` alias is unavailable in NumPy 1.24-1.26.
print(np.bool_(21))

True


In [14]:
# Zero converts to False. np.bool_ is used because the bare `np.bool` alias
# is unavailable in NumPy 1.24-1.26.
print(np.bool_(0))

False


In [15]:
# Any non-zero float converts to True. np.bool_ is used because the bare `np.bool`
# alias is unavailable in NumPy 1.24-1.26.
print(np.bool_(21.0))

True


In [16]:
# True converts to 1.0. `np.float` (an alias of the builtin float) was removed in
# NumPy 1.24, so the explicit np.float64 type is used instead.
print(np.float64(True))

1.0


In [17]:
# False converts to 0.0. `np.float` (an alias of the builtin float) was removed in
# NumPy 1.24, so the explicit np.float64 type is used instead.
print(np.float64(False))

0.0


In [18]:
arr=np.arange(1,11, dtype= np.float32)

print(arr)

[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]


In [19]:
# A complex number cannot be converted to an integer. `np.int` was only an alias of the
# builtin int and was removed in NumPy 1.24, so int() is called directly. The TypeError is
# the point of this cell; it is caught and printed so the notebook still runs top to bottom.
try:
    int(42.0 + 1.j)
except TypeError as err:
    print("TypeError:", err)

TypeError: can't convert complex to int

In [23]:
c= complex(42, 1)
print(c)

(42+1j)


In [24]:
print(c.real,c.imag)

42.0 1.0


In [21]:
# Creating an array
import numpy as np
a = np.array([2,4,6,8,10])

print(a.dtype)

int64


In [22]:
print(a.dtype.itemsize)

8


In [27]:
# Create numpy array using arange() function
var1=np.arange(1,11, dtype='f')

print(var1)

[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]


In [28]:
print(np.arange(1,6, dtype='D'))

[1.+0.j 2.+0.j 3.+0.j 4.+0.j 5.+0.j]


In [29]:
print(np.dtype(float))

float64


In [30]:
print(np.dtype('f'))

float32


In [31]:
print(np.dtype('d')) 

float64


In [32]:
print(np.dtype('f8'))

float64


In [36]:
var2=np.array([1,2,3],dtype='float64')

print(var2.dtype.char)

d


In [39]:
print(var2.dtype.type)

<class 'numpy.float64'>


## Manipulating Shape of NumPy Array

In [43]:
# Create an array
arr = np.arange(12)

In [45]:
# Reshape the array dimension
new_arr=arr.reshape(4,3)

print(new_arr)

[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]


In [46]:
# Reshape the array dimension
new_arr2=arr.reshape(3,4)

print(new_arr2)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


In [47]:
# Create an array
arr=np.arange(1,10).reshape(3,3)
print(arr)

[[1 2 3]
 [4 5 6]
 [7 8 9]]


In [48]:
# flatten the array
print(arr.flatten())

[1 2 3 4 5 6 7 8 9]


In [49]:
# ravel() function 
print(arr.ravel())

[1 2 3 4 5 6 7 8 9]


In [50]:
# Transpose the matrix
print(arr.transpose())

[[1 4 7]
 [2 5 8]
 [3 6 9]]


In [53]:
# resize the matrix
arr.resize(1,9)
print(arr)

[[1 2 3 4 5 6 7 8 9]]


## Stacking of NumPy Arrays

In [54]:
arr1 = np.arange(1,10).reshape(3,3)
print(arr1)

[[1 2 3]
 [4 5 6]
 [7 8 9]]


In [55]:
arr2 = 2*arr1
print(arr2)

[[ 2  4  6]
 [ 8 10 12]
 [14 16 18]]


In [56]:
arr3=np.hstack((arr1, arr2))

print(arr3)

[[ 1  2  3  2  4  6]
 [ 4  5  6  8 10 12]
 [ 7  8  9 14 16 18]]


In [59]:
# Horizontal stacking using concatenate() function
arr4=np.concatenate((arr1, arr2), axis=1)
print(arr4)

[[ 1  2  3  2  4  6]
 [ 4  5  6  8 10 12]
 [ 7  8  9 14 16 18]]


In [60]:
arr5=np.vstack((arr1, arr2))
print(arr5)

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [ 2  4  6]
 [ 8 10 12]
 [14 16 18]]


In [61]:
arr6=np.concatenate((arr1, arr2), axis=0) 
print(arr6)

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [ 2  4  6]
 [ 8 10 12]
 [14 16 18]]


In [62]:
arr7=np.dstack((arr1, arr2))
print(arr7)

[[[ 1  2]
  [ 2  4]
  [ 3  6]]

 [[ 4  8]
  [ 5 10]
  [ 6 12]]

 [[ 7 14]
  [ 8 16]
  [ 9 18]]]


In [63]:
# Create 1-D array
arr1 = np.arange(4,7) 
print(arr1)

[4 5 6]


In [65]:
# Create 1-D array
arr2 = 2 * arr1
print(arr2)

[ 8 10 12]


In [67]:
# Create column stack
arr_col_stack = np.column_stack((arr1,arr2))
print(arr_col_stack)

[[ 4  8]
 [ 5 10]
 [ 6 12]]


In [68]:
# Create row stack: each 1-D input becomes one row of the result.
# np.vstack is used because np.row_stack is a deprecated alias of it as of NumPy 2.0.
arr_row_stack = np.vstack((arr1,arr2))
print(arr_row_stack)

[[ 4  5  6]
 [ 8 10 12]]


## Partitioning NumPy Arrays

In [69]:
# Create an array
arr=np.arange(1,10).reshape(3,3)
print(arr)

[[1 2 3]
 [4 5 6]
 [7 8 9]]


In [71]:
# Perform horizontal splitting: divide the array column-wise into 3 equal parts
arr_hor_split=np.hsplit(arr, 3)

print(arr_hor_split)

[array([[1],
       [4],
       [7]]), array([[2],
       [5],
       [8]]), array([[3],
       [6],
       [9]])]


In [72]:
# vertical split
arr_ver_split=np.vsplit(arr, 3)

print(arr_ver_split)

[array([[1, 2, 3]]), array([[4, 5, 6]]), array([[7, 8, 9]])]


In [73]:
# split with axis=0
arr_split=np.split(arr,3,axis=0)

print(arr_split)

[array([[1, 2, 3]]), array([[4, 5, 6]]), array([[7, 8, 9]])]


In [30]:
# split with axis=1
np.split(arr,3,axis=1)

[array([[1],
        [4],
        [7]]), array([[2],
        [5],
        [8]]), array([[3],
        [6],
        [9]])]

## Changing Datatype of NumPy Arrays

In [16]:
# Build a 3x3 integer array holding the numbers 1 through 9
arr = np.arange(1, 10).reshape(3, 3)
print("Integer Array:", arr)

# Cast the elements to float; the converted array replaces the integer one
arr = arr.astype(np.float64)

# Show the converted array
print("Float Array:", arr)

# Confirm the dtype after the conversion
print("Changed Datatype:", arr.dtype)

Integer Array: [[1 2 3]
 [4 5 6]
 [7 8 9]]
Float Array: [[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]
Changed Datatype: float64


In [74]:
# Change datatype of array
arr=arr.astype(float)

# Check new data type of array
print(arr.dtype)

float64


In [75]:
# Create an array
arr=np.arange(1,10)

# Convert NumPy array to Python List
list1=arr.tolist()
print(list1)

[1, 2, 3, 4, 5, 6, 7, 8, 9]


## Creating NumPy views and copies

In [18]:
# Build a 2x2 array to experiment with
arr = np.arange(1, 5).reshape(2, 2)
print(arr)

# Plain assignment: a second name bound to the very same object
arr_no_copy = arr

# Deep copy: a separate array object
arr_copy = arr.copy()

# View (shallow copy): another separate array object
arr_view = arr.view()

# Print the identity of every object; only the plain assignment shares its id with the original
for label, obj in (
    ("Original Array: ", arr),
    ("Assignment: ", arr_no_copy),
    ("Deep Copy: ", arr_copy),
    ("Shallow Copy(View): ", arr_view),
):
    print(label, id(obj))

[[1 2]
 [3 4]]
Original Array:  140586368570352
Assignment:  140586368570352
Deep Copy:  140586368570752
Shallow Copy(View):  140586368570832


In [23]:
# Update the values of original array
arr[1]=[99,89]

# Check values of array view
print("View Array:\n", arr_view)

# Check values of array copy
print("Copied Array:\n", arr_copy)

View Array:
 [[ 1  2]
 [99 89]]
Copied Array:
 [[1 2]
 [3 4]]


## Slicing NumPy Array

In [76]:
# Create NumPy Array
arr = np.arange(10) 
print(arr)

[0 1 2 3 4 5 6 7 8 9]


In [77]:
print(arr[3:6])

[3 4 5]


In [78]:
print(arr[3:])

[3 4 5 6 7 8 9]


In [79]:
print(arr[-3:])

[7 8 9]


In [80]:
print(arr[2:7:2])

[2 4 6]


## Boolean and Fancy Indexing

In [82]:
# Create NumPy Array: the odd numbers from 21 to 39
arr = np.arange(21,41,2)
print("Original Array:\n",arr)

# Boolean indexing: `arr>30` yields a boolean mask and only the elements where it is True are kept
print("After Boolean Condition:",arr[arr>30])

Orignial Array:
 [21 23 25 27 29 31 33 35 37 39]
After Boolean Condition: [31 33 35 37 39]


In [32]:
# Create NumPy Array: the numbers 1..20 arranged as 5 rows x 4 columns
arr = np.arange(1,21).reshape(5,4)
print("Original Array:\n",arr)

# Selecting 2nd and 3rd row (positions 1 and 2, because indexing is zero-based)
indices = [1,2]
print("Selected 2nd and 3rd Row:\n", arr[indices])

# Selecting 3rd and 4th row (positions 2 and 3)
indices = [2,3]
print("Selected 3rd and 4th Row:\n", arr[indices])

Orignial Array:
 [[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]
 [13 14 15 16]
 [17 18 19 20]]
Selected 1st and 2nd Row:
 [[ 5  6  7  8]
 [ 9 10 11 12]]
Selected 3rd and 4th Row:
 [[ 9 10 11 12]
 [13 14 15 16]]


In [32]:
# Create row and column indices
row = np.array([1, 2])
col = np.array([2, 3])

# The two index arrays are paired element-wise, so this picks arr[1, 2] and arr[2, 3]
# (two individual elements), not the 2x2 block where those rows and columns cross.
print("Selected Sub-Array:", arr[row, col])

Selected Sub-Array: [ 7 12]


## Broadcasting arrays

In [83]:
# Create NumPy Array
arr1 = np.arange(1,5).reshape(2,2) 
print(arr1)

[[1 2]
 [3 4]]


In [84]:
# Create another NumPy Array
arr2 = np.arange(5,9).reshape(2,2) 
print(arr2)

[[5 6]
 [7 8]]


In [85]:
# Add two matrices
print(arr1+arr2)

[[ 6  8]
 [10 12]]


In [86]:
# Multiply two matrices
print(arr1*arr2)

[[ 5 12]
 [21 32]]


In [87]:
# Add a scalar value (the scalar is added to every element of the array)
print(arr1 + 3)

[[4 5]
 [6 7]]


In [88]:
# Multiply with a scalar value
print(arr1 * 3)

[[ 3  6]
 [ 9 12]]


## Create DataFrame

In [33]:
# Import pandas library 
import pandas as pd 
# Create empty DataFrame
df = pd.DataFrame() 

# Header of dataframe. 
df.head()

In [None]:
df

In [122]:
# Column-oriented data: each key is a column name and each list holds that column's values
data = {
    'Name': ['Vijay', 'Sundar', 'Satyam', 'Indira'],
    'Age': [23, 45, 46, 52],
}

# Build the pandas DataFrame from the dictionary
df = pd.DataFrame(data=data)

# Display the first rows of the DataFrame
df.head()

Unnamed: 0,Name,Age
0,Vijay,23
1,Sundar,45
2,Satyam,46
3,Indira,52


In [125]:
# Pandas DataFrame by lists of dicts. 
# Initialise data to lists. 
data =[ {'Name': 'Vijay',  'Age': 23},{'Name': 'Sundar',  'Age': 25},{'Name': 'Shankar',  'Age': 26}]
# Creates DataFrame. 
df = pd.DataFrame(data,columns=['Name','Age']) 
# Print dataframe header 
df.head()  

Unnamed: 0,Name,Age
0,Vijay,23
1,Sundar,25
2,Shankar,26


In [124]:
# Creating DataFrame using list of tuples.
data = [('Vijay', 23),( 'Sundar', 45), ('Satyam', 46), ('Indira',52)] 
# Create dataframe
df = pd.DataFrame(data, columns=['Name','Age'])
# Print dataframe header 
df.head()  

Unnamed: 0,Name,Age
0,Vijay,23
1,Sundar,45
2,Satyam,46
3,Indira,52


## Pandas Series

In [130]:
# Creating Pandas Series using Dictionary
dict1 = {0 : 'Ajay', 1 : 'Jay', 2 : 'Vijay'}
# Create Pandas Series
series = pd.Series(dict1)
# Show series
series

0     Ajay
1      Jay
2    Vijay
dtype: object

In [134]:
# load Pandas and NumPy
import pandas as pd
import numpy as np
# Create NumPy array
arr = np.array([51,65,48,59, 68])
# Create Pandas Series
series = pd.Series(arr)
series

0    51
1    65
2    48
3    59
4    68
dtype: int64

In [135]:
# load Pandas and NumPy
import pandas as pd
import numpy as np
# Create Pandas Series
series = pd.Series(10, index=[0, 1, 2, 3, 4, 5])
series

0    10
1    10
2    10
3    10
4    10
5    10
dtype: int64

In [99]:
# Import pandas 
import pandas as pd

# Load data using read_csv() 
df = pd.read_csv("WHO_first9cols.csv")

# Show initial 5 records
df.head()

Unnamed: 0,Country,CountryID,Continent,Adolescent fertility rate (%),Adult literacy rate (%),Gross national income per capita (PPP international $),Net primary school enrolment ratio female (%),Net primary school enrolment ratio male (%),Population (in thousands) total
0,Afghanistan,1,1,151.0,28.0,,,,26088.0
1,Albania,2,2,27.0,98.7,6000.0,93.0,94.0,3172.0
2,Algeria,3,3,6.0,69.9,5940.0,94.0,96.0,33351.0
3,Andorra,4,2,,,,83.0,83.0,74.0
4,Angola,5,3,146.0,67.4,3890.0,49.0,51.0,16557.0


In [100]:
# Show last 5 records
df.tail()

Unnamed: 0,Country,CountryID,Continent,Adolescent fertility rate (%),Adult literacy rate (%),Gross national income per capita (PPP international $),Net primary school enrolment ratio female (%),Net primary school enrolment ratio male (%),Population (in thousands) total
197,Vietnam,198,6,25.0,90.3,2310.0,91.0,96.0,86206.0
198,West Bank and Gaza,199,1,,,,,,
199,Yemen,200,1,83.0,54.1,2090.0,65.0,85.0,21732.0
200,Zambia,201,3,161.0,68.0,1140.0,94.0,90.0,11696.0
201,Zimbabwe,202,3,101.0,89.5,,88.0,87.0,13228.0


In [101]:
# Show the shape of DataFrame
print("Shape:", df.shape)

Shape: (202, 9)


In [102]:
# Check the column list of DataFrame
print("List of Columns:", df.columns)

List of Columns: Index(['Country', 'CountryID', 'Continent', 'Adolescent fertility rate (%)',
       'Adult literacy rate (%)',
       'Gross national income per capita (PPP international $)',
       'Net primary school enrolment ratio female (%)',
       'Net primary school enrolment ratio male (%)',
       'Population (in thousands) total'],
      dtype='object')


In [103]:
# Show the datatypes of columns
print("Data types:", df.dtypes)

Data types: Country                                                    object
CountryID                                                   int64
Continent                                                   int64
Adolescent fertility rate (%)                             float64
Adult literacy rate (%)                                   float64
Gross national income per capita (PPP international $)    float64
Net primary school enrolment ratio female (%)             float64
Net primary school enrolment ratio male (%)               float64
Population (in thousands) total                           float64
dtype: object


In [110]:
# Select a series
country_series=df['Country']

In [111]:
# check datatype of series
type(country_series)

pandas.core.series.Series

In [112]:
print(country_series.index)

RangeIndex(start=0, stop=202, step=1)


In [113]:
# Show the values held by the Series. Note: `.values` yields a NumPy array (see the
# output below), not a Python list; `.tolist()` would produce an actual list.
print(country_series.values)

['Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Australia' 'Austria'
 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus'
 'Belgium' 'Belize' 'Benin' 'Bermuda' 'Bhutan' 'Bolivia'
 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'Brunei Darussalam'
 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cambodia' 'Cameroon' 'Canada'
 'Cape Verde' 'Central African Republic' 'Chad' 'Chile' 'China' 'Colombia'
 'Comoros' 'Congo, Dem. Rep.' 'Congo, Rep.' 'Cook Islands' 'Costa Rica'
 "Cote d'Ivoire" 'Croatia' 'Cuba' 'Cyprus' 'Czech Republic' 'Denmark'
 'Djibouti' 'Dominica' 'Dominican Republic' 'Ecuador' 'Egypt'
 'El Salvador' 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Ethiopia' 'Fiji'
 'Finland' 'France' 'French Polynesia' 'Gabon' 'Gambia' 'Georgia'
 'Germany' 'Ghana' 'Greece' 'Grenada' 'Guatemala' 'Guinea' 'Guinea-Bissau'
 'Guyana' 'Haiti' 'Honduras' 'Hong Kong, China' 'Hungary' 'Iceland'
 'India' 'Indonesia' 'Iran (Islamic Republic of)' 'Iraq' 'I

In [114]:
# Name of the Series, i.e. the label of the DataFrame column it was selected from
print(country_series.name)

'Country'

In [116]:
# Pandas Series Slicing
country_series[-5:]

197               Vietnam
198    West Bank and Gaza
199                 Yemen
200                Zambia
201              Zimbabwe
Name: Country, dtype: object

In [130]:
# Creating Pandas Series using Dictionary
dict1 = {0 : 'Ajay', 1 : 'Jay', 2 : 'Vijay'}
# Create Pandas Series
series = pd.Series(dict1)
# Show series
series

0     Ajay
1      Jay
2    Vijay
dtype: object

In [134]:
# load Pandas and NumPy
import pandas as pd
import numpy as np
# Create NumPy array
arr = np.array([51,65,48,59, 68])
# Create Pandas Series
series = pd.Series(arr)
series

0    51
1    65
2    48
3    59
4    68
dtype: int64

In [135]:
# load Pandas and NumPy
import pandas as pd
import numpy as np
# Create Pandas Series
series = pd.Series(10, index=[0, 1, 2, 3, 4, 5])
series

0    10
1    10
2    10
3    10
4    10
5    10
dtype: int64

## Querying Data

In [199]:
!pip install quandl

Collecting quandl
  Downloading https://files.pythonhosted.org/packages/07/ab/8cd479fba8a9b197a43a0d55dd534b066fb8e5a0a04b5c0384cbc5d663aa/Quandl-3.5.0-py2.py3-none-any.whl
Collecting inflection>=0.3.1 (from quandl)
  Downloading https://files.pythonhosted.org/packages/52/c1/36be286d85dbd76527fb613527222a795d7c071da195fa916e7bf3cb03cb/inflection-0.4.0-py2.py3-none-any.whl
Installing collected packages: inflection, quandl
Successfully installed inflection-0.4.0 quandl-3.5.0


In [204]:
import os

import quandl

# Anonymous access is capped at 50 calls per day (the LimitExceededError recorded for this
# cell). Read a personal API key from the environment so it is never hard-coded in the
# notebook; when QUANDL_API_KEY is unset the request stays anonymous, as before.
quandl.ApiConfig.api_key = os.environ.get("QUANDL_API_KEY")

# Download the yearly sunspot dataset as a DataFrame
sunspots = quandl.get("SIDC/SUNSPOTS_A")

# Show the first rows
sunspots.head()

LimitExceededError: (Status 429) (Quandl Error QELx01) You have exceeded the anonymous user limit of 50 calls per day. To make more calls today, please register for a free Quandl account and then include your API key with your requests.

In [207]:
sunspots.head()

Unnamed: 0_level_0,Yearly Mean Total Sunspot Number,Yearly Mean Standard Deviation,Number of Observations,Definitive/Provisional Indicator
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1700-12-31,8.3,,,1.0
1701-12-31,18.3,,,1.0
1702-12-31,26.7,,,1.0
1703-12-31,38.3,,,1.0
1704-12-31,60.0,,,1.0


In [208]:
sunspots.tail()

Unnamed: 0_level_0,Yearly Mean Total Sunspot Number,Yearly Mean Standard Deviation,Number of Observations,Definitive/Provisional Indicator
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-12-31,69.8,6.4,8903.0,1.0
2016-12-31,39.8,3.9,9940.0,1.0
2017-12-31,21.7,2.5,11444.0,1.0
2018-12-31,7.0,1.1,12611.0,1.0
2019-12-31,3.6,0.5,12401.0,0.0


In [210]:
sunspots.columns

Index(['Yearly Mean Total Sunspot Number', 'Yearly Mean Standard Deviation',
       'Number of Observations', 'Definitive/Provisional Indicator'],
      dtype='object')

In [212]:
# Select columns
sunspots_filtered=sunspots[['Yearly Mean Total Sunspot Number','Definitive/Provisional Indicator']]

# Show top 5 records
sunspots_filtered.head()

Unnamed: 0_level_0,Yearly Mean Total Sunspot Number,Definitive/Provisional Indicator
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1700-12-31,8.3,1.0
1701-12-31,18.3,1.0
1702-12-31,26.7,1.0
1703-12-31,38.3,1.0
1704-12-31,60.0,1.0


In [213]:
# Select the rows whose index label lies between the two dates (label-based slice on the index)
sunspots.loc["20020101":"20131231"]

Unnamed: 0_level_0,Yearly Mean Total Sunspot Number,Yearly Mean Standard Deviation,Number of Observations,Definitive/Provisional Indicator
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2002-12-31,163.6,9.8,6588.0,1.0
2003-12-31,99.3,7.1,7087.0,1.0
2004-12-31,65.3,5.9,6882.0,1.0
2005-12-31,45.8,4.7,7084.0,1.0
2006-12-31,24.7,3.5,6370.0,1.0
2007-12-31,12.6,2.7,6841.0,1.0
2008-12-31,4.2,2.5,6644.0,1.0
2009-12-31,4.8,2.5,6465.0,1.0
2010-12-31,24.9,3.4,6328.0,1.0
2011-12-31,80.8,6.7,6077.0,1.0


In [215]:
# Boolean filter: keep only the rows whose sunspot number is above the column's mean
yearly_sunspots = sunspots['Yearly Mean Total Sunspot Number']
sunspots[yearly_sunspots > yearly_sunspots.mean()]



Unnamed: 0_level_0,Yearly Mean Total Sunspot Number,Yearly Mean Standard Deviation,Number of Observations,Definitive/Provisional Indicator
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1705-12-31,96.7,,,1.0
1717-12-31,105.0,,,1.0
1718-12-31,100.0,,,1.0
1726-12-31,130.0,,,1.0
1727-12-31,203.3,,,1.0
1728-12-31,171.7,,,1.0
1729-12-31,121.7,,,1.0
1736-12-31,116.7,,,1.0
1737-12-31,135.0,,,1.0
1738-12-31,185.0,,,1.0


## Statistics

In [136]:
# Import pandas 
import pandas as pd

# Load data using read_csv() 
df = pd.read_csv("WHO_first9cols.csv")

# Show initial 5 records
df.head()

Unnamed: 0,Country,CountryID,Continent,Adolescent fertility rate (%),Adult literacy rate (%),Gross national income per capita (PPP international $),Net primary school enrolment ratio female (%),Net primary school enrolment ratio male (%),Population (in thousands) total
0,Afghanistan,1,1,151.0,28.0,,,,26088.0
1,Albania,2,2,27.0,98.7,6000.0,93.0,94.0,3172.0
2,Algeria,3,3,6.0,69.9,5940.0,94.0,96.0,33351.0
3,Andorra,4,2,,,,83.0,83.0,74.0
4,Angola,5,3,146.0,67.4,3890.0,49.0,51.0,16557.0


In [152]:
df.shape

(202, 9)

In [137]:
# Describe the dataset
df.describe()

Unnamed: 0,CountryID,Continent,Adolescent fertility rate (%),Adult literacy rate (%),Gross national income per capita (PPP international $),Net primary school enrolment ratio female (%),Net primary school enrolment ratio male (%),Population (in thousands) total
count,202.0,202.0,177.0,131.0,178.0,179.0,179.0,189.0
mean,101.5,3.579208,59.457627,78.871756,11250.11236,84.03352,85.698324,34099.64
std,58.456537,1.808263,49.105286,20.41576,12586.753417,17.788047,15.451212,131837.7
min,1.0,1.0,0.0,23.6,260.0,6.0,11.0,2.0
25%,51.25,2.0,19.0,68.4,2112.5,79.0,79.5,1328.0
50%,101.5,3.0,46.0,86.5,6175.0,90.0,90.0,6640.0
75%,151.75,5.0,91.0,95.3,14502.5,96.0,96.0,20971.0
max,202.0,7.0,199.0,99.8,60870.0,100.0,100.0,1328474.0


In [138]:
# Count number of observation
df.count()

Country                                                   202
CountryID                                                 202
Continent                                                 202
Adolescent fertility rate (%)                             177
Adult literacy rate (%)                                   131
Gross national income per capita (PPP international $)    178
Net primary school enrolment ratio female (%)             179
Net primary school enrolment ratio male (%)               179
Population (in thousands) total                           189
dtype: int64

In [142]:
# Compute median of all the numeric columns.
# numeric_only=True skips the text column 'Country'; pandas >= 2.0 no longer drops
# non-numeric columns silently and raises a TypeError without it.
df.median(numeric_only=True)

CountryID                                                  101.5
Continent                                                    3.0
Adolescent fertility rate (%)                               46.0
Adult literacy rate (%)                                     86.5
Gross national income per capita (PPP international $)    6175.0
Net primary school enrolment ratio female (%)               90.0
Net primary school enrolment ratio male (%)                 90.0
Population (in thousands) total                           6640.0
dtype: float64

In [143]:
# Compute minimum of all the columns
df.min()

Country                                                   Afghanistan
CountryID                                                           1
Continent                                                           1
Adolescent fertility rate (%)                                       0
Adult literacy rate (%)                                          23.6
Gross national income per capita (PPP international $)            260
Net primary school enrolment ratio female (%)                       6
Net primary school enrolment ratio male (%)                        11
Population (in thousands) total                                     2
dtype: object

In [151]:
# Compute maximum of all the columns
df.max()

Country                                                      Zimbabwe
CountryID                                                         202
Continent                                                           7
Adolescent fertility rate (%)                                     199
Adult literacy rate (%)                                          99.8
Gross national income per capita (PPP international $)          60870
Net primary school enrolment ratio female (%)                     100
Net primary school enrolment ratio male (%)                       100
Population (in thousands) total                           1.32847e+06
dtype: object

In [146]:
# Compute standard deviation of all the numeric columns.
# numeric_only=True skips the text column 'Country'; pandas >= 2.0 no longer drops
# non-numeric columns silently and raises a TypeError without it.
df.std(numeric_only=True)

CountryID                                                     58.456537
Continent                                                      1.808263
Adolescent fertility rate (%)                                 49.105286
Adult literacy rate (%)                                       20.415760
Gross national income per capita (PPP international $)     12586.753417
Net primary school enrolment ratio female (%)                 17.788047
Net primary school enrolment ratio male (%)                   15.451212
Population (in thousands) total                           131837.708677
dtype: float64

## Grouping Pandas DataFrames

In [157]:
df.head()

Unnamed: 0,Country,CountryID,Continent,Adolescent fertility rate (%),Adult literacy rate (%),Gross national income per capita (PPP international $),Net primary school enrolment ratio female (%),Net primary school enrolment ratio male (%),Population (in thousands) total
0,Afghanistan,1,1,151.0,28.0,,,,26088.0
1,Albania,2,2,27.0,98.7,6000.0,93.0,94.0,3172.0
2,Algeria,3,3,6.0,69.9,5940.0,94.0,96.0,33351.0
3,Andorra,4,2,,,,83.0,83.0,74.0
4,Angola,5,3,146.0,67.4,3890.0,49.0,51.0,16557.0


In [156]:
# Group the DataFrame by the Continent column and average every numeric column per group.
# numeric_only=True leaves out the text column 'Country', which pandas >= 2.0 would
# otherwise try (and fail) to average.
df.groupby('Continent').mean(numeric_only=True)

Unnamed: 0_level_0,CountryID,Adolescent fertility rate (%),Adult literacy rate (%),Gross national income per capita (PPP international $),Net primary school enrolment ratio female (%),Net primary school enrolment ratio male (%),Population (in thousands) total
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,110.238095,37.3,76.9,14893.529412,85.789474,88.315789,16843.35
2,100.333333,20.5,97.911538,19777.083333,92.911111,93.088889,17259.627451
3,99.354167,111.644444,61.690476,3050.434783,67.574468,72.021277,16503.195652
4,56.285714,49.6,91.6,24524.0,95.0,94.4,73577.333333
5,94.774194,77.888889,87.940909,7397.142857,89.137931,88.517241,15637.241379
6,121.228571,39.26087,87.607143,12167.2,89.04,89.96,25517.142857
7,80.777778,57.333333,69.8125,2865.555556,85.444444,88.888889,317683.666667


In [158]:
# Mean adult literacy rate per continent. Selecting the column before aggregating averages
# only that column, so the text column 'Country' is never touched (averaging every column
# first fails on pandas >= 2.0).
df.groupby('Continent')['Adult literacy rate (%)'].mean()

Continent
1    76.900000
2    97.911538
3    61.690476
4    91.600000
5    87.940909
6    87.607143
7    69.812500
Name: Adult literacy rate (%), dtype: float64

## Joins

In [3]:
# Import pandas 
import pandas as pd

# Load data using read_csv() 
dest = pd.read_csv("dest.csv")

# Show DataFrame
dest.head()

Unnamed: 0,EmpNr,Dest
0,5,The Hague
1,3,Amsterdam
2,9,Rotterdam


In [4]:
# Load data using read_csv() 
tips = pd.read_csv("tips.csv")

# Show DataFrame
tips.head()

Unnamed: 0,EmpNr,Amount
0,5,10.0
1,9,5.0
2,7,2.5


In [5]:
# Join DataFrames using Inner Join
df_inner= pd.merge(dest, tips, on='EmpNr', how='inner')
df_inner.head()

Unnamed: 0,EmpNr,Dest,Amount
0,5,The Hague,10.0
1,9,Rotterdam,5.0


In [6]:
# Join DataFrames using Outer Join
df_outer= pd.merge(dest, tips, on='EmpNr', how='outer')
df_outer.head()

Unnamed: 0,EmpNr,Dest,Amount
0,5,The Hague,10.0
1,3,Amsterdam,
2,9,Rotterdam,5.0
3,7,,2.5


In [172]:
# Join DataFrames using Right Outer Join
df_right= pd.merge(dest, tips, on='EmpNr', how='right')
df_right

Unnamed: 0,EmpNr,Dest,Amount
0,5,The Hague,10.0
1,9,Rotterdam,5.0
2,7,,2.5


In [174]:
# Join DataFrames using Left Outer Join
df_left= pd.merge(dest, tips, on='EmpNr', how='left')
df_left

Unnamed: 0,EmpNr,Dest,Amount
0,5,The Hague,10.0
1,3,Amsterdam,
2,9,Rotterdam,5.0


## Missing Values

In [175]:
# Import pandas 
import pandas as pd

# Load data using read_csv() 
df = pd.read_csv("WHO_first9cols.csv")

# Show initial 5 records
df.head()

Unnamed: 0,Country,CountryID,Continent,Adolescent fertility rate (%),Adult literacy rate (%),Gross national income per capita (PPP international $),Net primary school enrolment ratio female (%),Net primary school enrolment ratio male (%),Population (in thousands) total
0,Afghanistan,1,1,151.0,28.0,,,,26088.0
1,Albania,2,2,27.0,98.7,6000.0,93.0,94.0,3172.0
2,Algeria,3,3,6.0,69.9,5940.0,94.0,96.0,33351.0
3,Andorra,4,2,,,,83.0,83.0,74.0
4,Angola,5,3,146.0,67.4,3890.0,49.0,51.0,16557.0


In [177]:
# Count missing values in DataFrame
pd.isnull(df).sum()

Country                                                    0
CountryID                                                  0
Continent                                                  0
Adolescent fertility rate (%)                             25
Adult literacy rate (%)                                   71
Gross national income per capita (PPP international $)    24
Net primary school enrolment ratio female (%)             23
Net primary school enrolment ratio male (%)               23
Population (in thousands) total                           13
dtype: int64

In [178]:
# Count missing values in DataFrame
df.isnull().sum()

Country                                                    0
CountryID                                                  0
Continent                                                  0
Adolescent fertility rate (%)                             25
Adult literacy rate (%)                                   71
Gross national income per capita (PPP international $)    24
Net primary school enrolment ratio female (%)             23
Net primary school enrolment ratio male (%)               23
Population (in thousands) total                           13
dtype: int64

In [179]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202 entries, 0 to 201
Data columns (total 9 columns):
Country                                                   202 non-null object
CountryID                                                 202 non-null int64
Continent                                                 202 non-null int64
Adolescent fertility rate (%)                             177 non-null float64
Adult literacy rate (%)                                   131 non-null float64
Gross national income per capita (PPP international $)    178 non-null float64
Net primary school enrolment ratio female (%)             179 non-null float64
Net primary school enrolment ratio male (%)               179 non-null float64
Population (in thousands) total                           189 non-null float64
dtypes: float64(6), int64(2), object(1)
memory usage: 14.3+ KB


In [180]:
# Drop every row that contains at least one missing value.
# The result is re-bound to `df` instead of using inplace=True, which pandas discourages;
# later cells that read `df` see the same data either way.
df = df.dropna()

In [181]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118 entries, 1 to 200
Data columns (total 9 columns):
Country                                                   118 non-null object
CountryID                                                 118 non-null int64
Continent                                                 118 non-null int64
Adolescent fertility rate (%)                             118 non-null float64
Adult literacy rate (%)                                   118 non-null float64
Gross national income per capita (PPP international $)    118 non-null float64
Net primary school enrolment ratio female (%)             118 non-null float64
Net primary school enrolment ratio male (%)               118 non-null float64
Population (in thousands) total                           118 non-null float64
dtypes: float64(6), int64(2), object(1)
memory usage: 9.2+ KB


In [182]:
# Load data using read_csv() 
df = pd.read_csv("WHO_first9cols.csv")

# Show initial 5 records
df.head()

Unnamed: 0,Country,CountryID,Continent,Adolescent fertility rate (%),Adult literacy rate (%),Gross national income per capita (PPP international $),Net primary school enrolment ratio female (%),Net primary school enrolment ratio male (%),Population (in thousands) total
0,Afghanistan,1,1,151.0,28.0,,,,26088.0
1,Albania,2,2,27.0,98.7,6000.0,93.0,94.0,3172.0
2,Algeria,3,3,6.0,69.9,5940.0,94.0,96.0,33351.0
3,Andorra,4,2,,,,83.0,83.0,74.0
4,Angola,5,3,146.0,67.4,3890.0,49.0,51.0,16557.0


In [184]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202 entries, 0 to 201
Data columns (total 9 columns):
Country                                                   202 non-null object
CountryID                                                 202 non-null int64
Continent                                                 202 non-null int64
Adolescent fertility rate (%)                             177 non-null float64
Adult literacy rate (%)                                   131 non-null float64
Gross national income per capita (PPP international $)    178 non-null float64
Net primary school enrolment ratio female (%)             179 non-null float64
Net primary school enrolment ratio male (%)               179 non-null float64
Population (in thousands) total                           189 non-null float64
dtypes: float64(6), int64(2), object(1)
memory usage: 14.3+ KB


In [185]:
# Fill missing values with 0.
# The result is re-bound to `df` instead of using inplace=True, which pandas discourages;
# later cells that read `df` see the same data either way.
df = df.fillna(0)

In [186]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202 entries, 0 to 201
Data columns (total 9 columns):
Country                                                   202 non-null object
CountryID                                                 202 non-null int64
Continent                                                 202 non-null int64
Adolescent fertility rate (%)                             202 non-null float64
Adult literacy rate (%)                                   202 non-null float64
Gross national income per capita (PPP international $)    202 non-null float64
Net primary school enrolment ratio female (%)             202 non-null float64
Net primary school enrolment ratio male (%)               202 non-null float64
Population (in thousands) total                           202 non-null float64
dtypes: float64(6), int64(2), object(1)
memory usage: 14.3+ KB


## Pivot Table

In [7]:
# Import pandas 
import pandas as pd

# Load data using read_csv() 
purchase = pd.read_csv("purchase.csv")

# Show initial 10 records
purchase.head(10)

Unnamed: 0,Weather,Food,Price,Number
0,cold,soup,3.745401,8
1,hot,soup,9.507143,8
2,cold,icecream,7.319939,8
3,hot,chocolate,5.986585,8
4,cold,icecream,1.560186,8
5,hot,icecream,1.559945,8
6,cold,soup,0.580836,8


In [197]:
# Summarise dataframe using pivot table: total 'Number' for every Weather/Food combination.
# aggfunc='sum' replaces np.sum: pandas >= 2.1 warns when handed the NumPy callable, and the
# string form does not depend on numpy having been imported in an earlier cell.
pd.pivot_table(purchase, values='Number', index=['Weather'],
               columns=['Food'], aggfunc='sum')

Food,chocolate,icecream,soup
Weather,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cold,,16.0,16.0
hot,8.0,8.0,8.0


## Dealing with dates

In [217]:
# Date range function
pd.date_range('01-01-2000', periods=45, freq='D')

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08',
               '2000-01-09', '2000-01-10', '2000-01-11', '2000-01-12',
               '2000-01-13', '2000-01-14', '2000-01-15', '2000-01-16',
               '2000-01-17', '2000-01-18', '2000-01-19', '2000-01-20',
               '2000-01-21', '2000-01-22', '2000-01-23', '2000-01-24',
               '2000-01-25', '2000-01-26', '2000-01-27', '2000-01-28',
               '2000-01-29', '2000-01-30', '2000-01-31', '2000-02-01',
               '2000-02-02', '2000-02-03', '2000-02-04', '2000-02-05',
               '2000-02-06', '2000-02-07', '2000-02-08', '2000-02-09',
               '2000-02-10', '2000-02-11', '2000-02-12', '2000-02-13',
               '2000-02-14'],
              dtype='datetime64[ns]', freq='D')

In [218]:
# Convert argument to datetime
pd.to_datetime('1/1/1970')

Timestamp('1970-01-01 00:00:00')

In [223]:
# Convert argument to datetime in specified format
pd.to_datetime(['20200101', '20200102'], format='%Y%m%d')

DatetimeIndex(['2020-01-01', '2020-01-02'], dtype='datetime64[ns]', freq=None)

In [224]:
# A string that cannot be parsed as a date raises ValueError. The error is the point of this
# cell; it is caught and printed so the notebook still runs top to bottom.
try:
    pd.to_datetime(['20200101', 'not a date'])
except ValueError as err:
    print("ValueError:", err)

ValueError: ('Unknown string format:', 'not a date')

In [225]:
# Handle value error
pd.to_datetime(['20200101', 'not a date'], errors='coerce')

DatetimeIndex(['2020-01-01', 'NaT'], dtype='datetime64[ns]', freq=None)