Starting with Numpy

In [1]:
#load the library and check its version, just to make sure we aren't using an older version
import numpy as np
np.__version__
'1.12.1'

'1.12.1'

In [2]:
#create a list comprising numbers from 0 to 9
L = list(range(10))

In [3]:
#converting integers to string - this style of handling lists is known as list comprehension.
#List comprehensions offer a versatile way to handle list manipulation tasks easily. We'll learn about them in future tutorials. Here's an example.

[str(c) for c in L]
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

[type(item) for item in L]
[int, int, int, int, int, int, int, int, int, int]

[int, int, int, int, int, int, int, int, int, int]

Creating Arrays

Numpy arrays are homogeneous in nature, i.e., they comprise one data type (integer, float, double, etc.) unlike lists.

In [5]:
#creating arrays
np.zeros(10, dtype='int')
np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


#creating a 3 row x 5 column matrix
np.ones((3,5), dtype=float)
np.array([[ 1.,  1.,  1.,  1.,  1.],
      [ 1.,  1.,  1.,  1.,  1.],
      [ 1.,  1.,  1.,  1.,  1.]])


#creating a matrix with a predefined value
np.full((3,5),1.23)
np.array([[ 1.23,  1.23,  1.23,  1.23,  1.23],
      [ 1.23,  1.23,  1.23,  1.23,  1.23],
      [ 1.23,  1.23,  1.23,  1.23,  1.23]])


#create an array with a set sequence
np.arange(0, 20, 2)
np.array([0, 2, 4, 6, 8,10,12,14,16,18])


#create an array of evenly spaced values over the given range (both endpoints included)
np.linspace(0, 1, 5)
np.array([ 0., 0.25, 0.5 , 0.75, 1.])


#create a 3x3 array of random values drawn from a normal distribution with mean 0 and standard deviation 1
np.random.normal(0, 1, (3,3))
np.array([[ 0.72432142, -0.90024075,  0.27363808],
      [ 0.88426129,  1.45096856, -1.03547109],
      [-0.42930994, -1.02284441, -1.59753603]])


#create an identity matrix
np.eye(3)
np.array([[ 1.,  0.,  0.],
      [ 0.,  1.,  0.],
      [ 0.,  0.,  1.]])


#set a random seed
np.random.seed(0)


x1 = np.random.randint(10, size=6) #one dimension
x2 = np.random.randint(10, size=(3,4)) #two dimension
x3 = np.random.randint(10, size=(3,4,5)) #three dimension


print("x3 ndim:", x3.ndim)
print("x3 shape:", x3.shape)
print("x3 size: ", x3.size)
('x3 ndim:', 3)
('x3 shape:', (3, 4, 5))
('x3 size: ', 60)

x3 ndim: 3
x3 shape: (3, 4, 5)
x3 size:  60


('x3 size: ', 60)

Array Indexing

The important thing to remember is that indexing in Python starts at zero.

In [None]:
x1 = np.array([4, 3, 4, 4, 8, 4])
x1
np.array([4, 3, 4, 4, 8, 4])

#access the value at index zero
x1[0]
4

#access the fifth value (index 4)
x1[4]
8

#get the last value
x1[-1]
4

#get the second last value
x1[-2]
8

#in a multidimensional array, we need to specify row and column index
x2
np.array([[3, 7, 5, 5],
      [0, 1, 5, 9],
      [3, 0, 5, 0]])


#3rd row and 4th column value (indices 2 and 3)
x2[2,3]
0

#last value from the 3rd row (row index 2, last column)
x2[2,-1]
0


#replace value at 0,0 index
x2[0,0] = 12
x2
np.array([[12,  7,  5,  5],
      [ 0,  1,  5,  9],
      [ 3,  0,  5,  0]])

array([[12,  7,  5,  5],
       [ 0,  1,  5,  9],
       [ 3,  0,  5,  0]])

Array Slicing

Now, we'll learn to access multiple or a range of elements from an array.

In [8]:
x = np.arange(10)
x
np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])


#from start to 4th position
x[:5]
np.array([0, 1, 2, 3, 4])


#from 4th position to end
x[4:]
np.array([4, 5, 6, 7, 8, 9])


#from 4th to 6th position
x[4:7]
np.array([4, 5, 6])


#return elements at even indices (every second element, starting from index 0)
x[ : : 2]
np.array([0, 2, 4, 6, 8])


#return every second element, starting from index 1
x[1::2]
np.array([1, 3, 5, 7, 9])


#reverse the array
x[::-1]
np.array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

Array Concatenation

Many a time, we are required to combine different arrays. So, instead of typing each of their elements manually, you can use array concatenation to handle such tasks easily.

In [9]:
#You can concatenate two or more arrays at once.
x = np.array([1, 2, 3])
y = np.array([3, 2, 1])
z = [21,21,21]
np.concatenate([x, y,z])
np.array([ 1,  2,  3,  3,  2,  1, 21, 21, 21])


#You can also use this function to create 2-dimensional arrays.
grid = np.array([[1,2,3],[4,5,6]])
np.concatenate([grid,grid])
np.array([[1, 2, 3],
      [4, 5, 6],
      [1, 2, 3],
      [4, 5, 6]])


#Using the axis parameter, you can choose whether the arrays are joined row-wise (the default, as above) or column-wise (axis=1)
np.concatenate([grid,grid],axis=1)
np.array([[1, 2, 3, 1, 2, 3],
      [4, 5, 6, 4, 5, 6]])

array([[1, 2, 3, 1, 2, 3],
       [4, 5, 6, 4, 5, 6]])

Until now, we used the concatenation function on arrays of equal dimension. But what if you are required to combine a 2D array with a 1D array? In such situations, np.concatenate might not be the best option to use. Instead, you can use np.vstack or np.hstack to do the task. Let's see how!

In [10]:
x = np.array([3,4,5])
grid = np.array([[1,2,3],[17,18,19]])
np.vstack([x,grid])
np.array([[ 3,  4,  5],
      [ 1,  2,  3],
      [17, 18, 19]])


#Similarly, you can add an array using np.hstack
z = np.array([[9],[9]])
np.hstack([grid,z])
np.array([[ 1,  2,  3,  9],
      [17, 18, 19,  9]])

array([[ 1,  2,  3,  9],
       [17, 18, 19,  9]])

Also, we can split the arrays based on pre-defined positions. Let's see how!

In [19]:
x = np.arange(10)
x
np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])


x1,x2,x3 = np.split(x,[3,6])
print(x1,x2,x3)
([0, 1,2],[3, 4, 5],[6, 7, 8, 9])

grid = np.arange(16).reshape((4,4))
grid
upper,lower = np.vsplit(grid,[2])
print (upper, lower)
(np.array([[0, 1, 2, 3],
       [4, 5, 6, 7]]),np.array([[ 8,  9, 10, 11],
       [12, 13, 14, 15]]))

[0 1 2] [3 4 5] [6 7 8 9]
[[0 1 2 3]
 [4 5 6 7]] [[ 8  9 10 11]
 [12 13 14 15]]


(array([[0, 1, 2, 3],
        [4, 5, 6, 7]]),
 array([[ 8,  9, 10, 11],
        [12, 13, 14, 15]]))

In addition to the functions we learned above, there are several other mathematical functions available in the numpy library such as sum, divide, multiply, abs, power, mod, sin, cos, tan, log, var, min, mean, max, etc., which can be used to perform basic arithmetic calculations. Feel free to refer to the numpy documentation for more information on such functions.

Let's move on to pandas now. Make sure you follow each line below because it'll help you in doing data manipulation using pandas.

Let's start with Pandas

In [20]:
#load library - pd is just an alias. I used pd because it's short and literally abbreviates pandas.
#You can use any name as an alias. 
import pandas as pd

In [21]:
#create a data frame - dictionary is used here where keys get converted to column names and values to row values.
data = pd.DataFrame({'Country': ['Russia','Colombia','Chile','Equador','Nigeria'],
                    'Rank':[121,40,100,130,11]})
data

Unnamed: 0,Country,Rank
0,Russia,121
1,Colombia,40
2,Chile,100
3,Equador,130
4,Nigeria,11


In [22]:
#We can do a quick analysis of any data set using:
data.describe()

Unnamed: 0,Rank
count,5.0
mean,80.4
std,52.300096
min,11.0
25%,40.0
50%,100.0
75%,121.0
max,130.0


Remember, the describe() method computes summary statistics of numeric (integer / float) variables. To get complete information about the data set, we can use the info() method.

We can sort the data by not just one column but multiple columns as well.

Often, we get data sets with duplicate rows, which is nothing but noise. Therefore, before training the model, we need to make sure we get rid of such inconsistencies in the data set. Let's see how we can remove duplicate rows.

In [56]:
#create another data with duplicated rows
data = pd.DataFrame({'k1':['one']*3 + ['two']*4, 'k2':[3,2,1,3,3,4,4]})
data

Unnamed: 0,k1,k2
0,one,3
1,one,2
2,one,1
3,two,3
4,two,3
5,two,4
6,two,4


In [57]:
#sort values 
data.sort_values(by='k2')

Unnamed: 0,k1,k2
2,one,1
1,one,2
0,one,3
3,two,3
4,two,3
5,two,4
6,two,4


In [58]:
#remove duplicates - ta da! 
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,3
1,one,2
2,one,1
3,two,3
5,two,4


In [59]:
data.drop_duplicates(subset='k1')

Unnamed: 0,k1,k2
0,one,3
3,two,3


Now, we will learn to categorize rows based on predefined criteria. This comes up a lot during data processing, where you need to categorize a variable. For example, say we have a column with country names and we want to create a new variable 'continent' based on these country names. In such situations, we will require the steps below:

In [60]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami','corned beef', 'Bacon', 'pastrami', 'honey ham','nova lox'],
                 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [61]:
# Lookup table used with Series.map: each key is a food name written in
# lower case, each value is the animal that food comes from.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

def meat_2_animal(series):
    """Return the animal that the food in ``series['food']`` comes from.

    Intended for row-wise use, e.g.
    ``data.apply(meat_2_animal, axis='columns')``.

    Parameters
    ----------
    series : mapping-like
        Any object supporting ``series['food']`` (a DataFrame row, a dict).
        The food name is matched exactly, so it must already be lower case.

    Returns
    -------
    str or None
        'pig', 'cow' or 'salmon' for the six known foods; None for any
        other value, so unrecognised foods are not mislabelled.
    """
    food = series['food']
    if food in ('bacon', 'pulled pork', 'honey ham'):
        return 'pig'
    if food in ('pastrami', 'corned beef'):
        return 'cow'
    if food == 'nova lox':
        return 'salmon'
    # Previously a catch-all ``else`` returned 'salmon' here, which silently
    # tagged every unknown (or not-yet-lower-cased) food as salmon.
    return None


#create a new variable
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [62]:
#another way of doing it is: convert the food values to the lower case and apply the function
lower = lambda x: x.lower()
data['food'] = data['food'].apply(lower)
data['animal2'] = data.apply(meat_2_animal, axis='columns')
data

Unnamed: 0,food,ounces,animal,animal2
0,bacon,4.0,pig,pig
1,pulled pork,3.0,pig,pig
2,bacon,12.0,pig,pig
3,pastrami,6.0,cow,cow
4,corned beef,7.5,cow,cow
5,bacon,8.0,pig,pig
6,pastrami,3.0,cow,cow
7,honey ham,5.0,pig,pig
8,nova lox,6.0,salmon,salmon


Another way to create a new variable is by using the assign function. With this tutorial, as you keep discovering the new functions, you'll realize how powerful pandas is.

In [63]:
data.assign(new_variable = data['ounces']*10)

Unnamed: 0,food,ounces,animal,animal2,new_variable
0,bacon,4.0,pig,pig,40.0
1,pulled pork,3.0,pig,pig,30.0
2,bacon,12.0,pig,pig,120.0
3,pastrami,6.0,cow,cow,60.0
4,corned beef,7.5,cow,cow,75.0
5,bacon,8.0,pig,pig,80.0
6,pastrami,3.0,cow,cow,30.0
7,honey ham,5.0,pig,pig,50.0
8,nova lox,6.0,salmon,salmon,60.0


Let's remove the column animal2 from our data frame.

In [64]:
data.drop('animal2',axis='columns',inplace=True)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


We frequently find missing values in our data set. A quick method for imputing missing values is to fill them with an arbitrary number. Besides missing values, you may also find lots of outliers in your data set, which might require replacing. Let's see how we can replace values.

Now, let's learn how to rename column names and axis (row names).

In [66]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),index=['Ohio', 'Colorado', 'New York'],columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [67]:
#Using rename function
data.rename(index = {'Ohio':'SanF'}, columns={'one':'one_p','two':'two_p'},inplace=True)
data

Unnamed: 0,one_p,two_p,three,four
SanF,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [68]:
#You can also use string functions
data.rename(index = str.upper, columns=str.title,inplace=True)
data

Unnamed: 0,One_P,Two_P,Three,Four
SANF,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


Next, we'll learn to categorize (bin) continuous variables.

In [69]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

We'll divide the ages into bins such as 18-25, 26-35,36-60 and 60 and above.

Also, we can pass a unique name to each label.

Let's proceed and learn about grouping data and creating pivots in pandas. It's an immensely important data analysis method which you'd probably have to use on every data set you work with.

In [73]:
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1' : np.random.randn(5),
                   'data2' : np.random.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.254414,1.149076
1,a,two,1.419102,-1.193578
2,b,one,-0.743856,1.141042
3,b,two,-2.517437,1.509445
4,a,one,-1.507096,1.067775


Now, let's see how to slice the data frame.

In [75]:
dates = pd.date_range('20130101',periods=6)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.686589,0.014873,-0.375666,-0.038224
2013-01-02,0.367974,-0.044724,-0.302375,-2.224404
2013-01-03,0.724006,0.359003,1.076121,0.192141
2013-01-04,0.852926,0.018357,0.428304,0.996278
2013-01-05,-0.49115,0.712678,1.11334,-2.153675
2013-01-06,-0.416111,-1.070897,0.221139,-1.123057


In [76]:
#get first n rows from the data frame
df[:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.686589,0.014873,-0.375666,-0.038224
2013-01-02,0.367974,-0.044724,-0.302375,-2.224404
2013-01-03,0.724006,0.359003,1.076121,0.192141


In [77]:
#slice based on date range
df['20130101':'20130104']

Unnamed: 0,A,B,C,D
2013-01-01,-0.686589,0.014873,-0.375666,-0.038224
2013-01-02,0.367974,-0.044724,-0.302375,-2.224404
2013-01-03,0.724006,0.359003,1.076121,0.192141
2013-01-04,0.852926,0.018357,0.428304,0.996278


In [78]:
#slicing based on column names
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,-0.686589,0.014873
2013-01-02,0.367974,-0.044724
2013-01-03,0.724006,0.359003
2013-01-04,0.852926,0.018357
2013-01-05,-0.49115,0.712678
2013-01-06,-0.416111,-1.070897


In [79]:
#slicing based on both row index labels and column names
df.loc['20130102':'20130103',['A','B']]

Unnamed: 0,A,B
2013-01-02,0.367974,-0.044724
2013-01-03,0.724006,0.359003


In [81]:
#returns specific rows and columns using lists containing columns or row indexes
df.iloc[[1,5],[0,2]] 

Unnamed: 0,A,C
2013-01-02,0.367974,-0.302375
2013-01-06,-0.416111,0.221139


Similarly, we can do Boolean indexing based on column values as well. This helps in filtering a data set based on a pre-defined condition.

In [82]:
df[df.A > 1]

Unnamed: 0,A,B,C,D


In [83]:
#we can copy the data set
df2 = df.copy()
df2['E']=['one', 'one','two','three','four','three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.686589,0.014873,-0.375666,-0.038224,one
2013-01-02,0.367974,-0.044724,-0.302375,-2.224404,one
2013-01-03,0.724006,0.359003,1.076121,0.192141,two
2013-01-04,0.852926,0.018357,0.428304,0.996278,three
2013-01-05,-0.49115,0.712678,1.11334,-2.153675,four
2013-01-06,-0.416111,-1.070897,0.221139,-1.123057,three


In [84]:
#select rows based on column values
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.724006,0.359003,1.076121,0.192141,two
2013-01-05,-0.49115,0.712678,1.11334,-2.153675,four


In [85]:
#select all rows except those with two and four
df2[~df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.686589,0.014873,-0.375666,-0.038224,one
2013-01-02,0.367974,-0.044724,-0.302375,-2.224404,one
2013-01-04,0.852926,0.018357,0.428304,0.996278,three
2013-01-06,-0.416111,-1.070897,0.221139,-1.123057,three


We can also use the query method to select rows based on a criterion. Let's see how!

In [86]:
#list all rows where A is greater than C
df.query('A > C')

Unnamed: 0,A,B,C,D
2013-01-02,0.367974,-0.044724,-0.302375,-2.224404
2013-01-04,0.852926,0.018357,0.428304,0.996278


In [87]:
#using OR condition
df.query('A < B | C > A')

Unnamed: 0,A,B,C,D
2013-01-01,-0.686589,0.014873,-0.375666,-0.038224
2013-01-03,0.724006,0.359003,1.076121,0.192141
2013-01-05,-0.49115,0.712678,1.11334,-2.153675
2013-01-06,-0.416111,-1.070897,0.221139,-1.123057


Pivot tables are extremely useful in analyzing data using a customized tabular format. I think, among other things, Excel is popular because of the pivot table option. It offers a super-quick way to analyze data.

In [88]:
#create a data frame
data = pd.DataFrame({'group': ['a', 'a', 'a', 'b','b', 'b', 'c', 'c','c'],
                 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,group,ounces
0,a,4.0
1,a,3.0
2,a,12.0
3,b,6.0
4,b,7.5
5,b,8.0
6,c,3.0
7,c,5.0
8,c,6.0


Up till now, we've become familiar with the basics of pandas library using toy examples.