# NumPy

* NumPy is a fundamental library for scientific computing in Python
* It provides support for arrays and matrices, along with a collection of mathematical functions that operate on these data structures


In [1]:
# install numpy
!pip install numpy

Defaulting to user installation because normal site-packages is not writeable


In [1]:
# import numpy
import numpy as np

## Single-dimensional array

In [7]:
# Create a 1D NumPy array from a Python list
npArray1 = np.array([1,2,3,4,5])

In [8]:
# Printing an ndarray shows its elements space-separated inside brackets
print(npArray1)

[1 2 3 4 5]


In [9]:
# Show the Python type of the array object
print(type(npArray1))

<class 'numpy.ndarray'>


In [10]:
# Shape of the array: a tuple holding the length of each dimension
npArray1.shape

(5,)

## 2D Array

In [11]:
# Reshape the 5-element 1D array into a 2D array with 1 row and 5 columns
npArray2 = npArray1.reshape(1,5)
print(npArray2)


[[1 2 3 4 5]]


In [12]:
# Build a 2D array (2 rows, 3 columns) from a nested list
npArray3 = np.array([[1,2,3],[4,5,6]])
print(npArray3)

[[1 2 3]
 [4 5 6]]


## Create NumPy arrays in various ways

In [15]:
# arange(start, stop, step): count down from 20 in steps of -1; the stop value 10 is excluded
npArray4 = np.arange(20,10,-1)
print(npArray4)

[20 19 18 17 16 15 14 13 12 11]


In [17]:
# 3x4 array filled with ones
npArray5 = np.ones((3,4))
print(npArray5)

[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]


In [18]:
# 3x3 identity matrix: ones on the main diagonal, zeros elsewhere
npArray6 = np.eye(3)
print(npArray6)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


## NumPy array attributes

In [20]:
# Inspect the basic attributes of a 2x3 array.
arr = np.array([[1, 2, 3], [4, 5, 6]])
print("Array \n" , arr)

# Each entry pairs the printed label with the attribute it describes.
for attr_label, attr_value in (
    ("shape : ", arr.shape),
    ("Number of dimensions : ", arr.ndim),
    ("Number of elements : ", arr.size),
    ("Data type : ", arr.dtype),
    ("Item size (in bytes): ", arr.itemsize),
):
    print(attr_label, attr_value)

Array 
 [[1 2 3]
 [4 5 6]]
shape :  (2, 3)
Number of dimensions :  2
Number of elements :  6
Data type :  int32
Item size (in bytes):  4


## NumPy Vectorized Operations

In [21]:
# Two 1D arrays of equal length, used as operands in the element-wise examples that follow
npArray7 = np.array([1,2 , 3, 4, 5])
npArray8 = np.array([10,15,20,25,30])

In [24]:
# Element-wise addition: values at the same position are added
npAddition = npArray7 + npArray8
print(npAddition)

[11 17 23 29 35]


In [25]:
# Element-wise subtraction: each value of npArray8 is subtracted from the value at the same position in npArray7
npSubtraction = npArray7 - npArray8
print(npSubtraction)


[ -9 -13 -17 -21 -25]


In [26]:
# Element-wise multiplication (not a matrix product)
npMultiplication = npArray7*npArray8
print(npMultiplication)

[ 10  30  60 100 150]


In [27]:
# Element-wise division; `/` is true division, so the result is a float array
npDivision = npArray7/npArray8
print(npDivision)

[0.1        0.13333333 0.15       0.16       0.16666667]


### Universal Functions

In [35]:
# Sample input: 1, 11, 21, ..., 91 (arange excludes the stop value 100)
npArray9 = np.arange(1,100,10)

# Square root of every element
npSqRoot = np.sqrt(npArray9)
print(npSqRoot)

[1.         3.31662479 4.58257569 5.56776436 6.40312424 7.14142843
 7.81024968 8.42614977 9.         9.53939201]


In [36]:
# Exponential: e raised to the power of each element
npExponen = np.exp(npArray9)
print(npExponen)

[2.71828183e+00 5.98741417e+04 1.31881573e+09 2.90488497e+13
 6.39843494e+17 1.40934908e+22 3.10429794e+26 6.83767123e+30
 1.50609731e+35 3.31740010e+39]


In [37]:
# Sine of each element (np.sin interprets its input as radians)
npSine = np.sin(npArray9)
print(npSine)

[ 0.84147098 -0.99999021  0.83665564 -0.40403765 -0.15862267  0.67022918
 -0.96611777  0.95105465 -0.62988799  0.10598751]


In [39]:
# Natural logarithm (base e) of each element
npLog = np.log(npArray9)
print(npLog)

[0.         2.39789527 3.04452244 3.4339872  3.71357207 3.93182563
 4.11087386 4.26267988 4.39444915 4.51085951]


### Slicing and indexing

In [40]:
# 3x4 array used for the indexing and slicing examples
npArray10 = np.array([[1, 2 , 3 , 4 ], [5, 6, 7, 8], [9, 10, 11, 12]])
print(npArray10)

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]


In [44]:
# Element at row 0, column 1, selected with a single (row, column) index
print(npArray10[0, 1])

2


In [54]:
# Rows from index 1 onward and columns from index 2 onward
print(npArray10[1:,2:])

[[ 7  8]
 [11 12]]


In [55]:
# Overwrite the element at row 0, column 0 in place
npArray10[0,0] = 1000
print(npArray10)

[[1000    2    3    4]
 [   5    6    7    8]
 [   9   10   11   12]]


### Statistical concepts


In [63]:
# Standardise the first five cubes (0, 1, 8, 27, 64) to zero mean and unit standard deviation.
npArray11 = np.arange(5) ** 3

mean = npArray11.mean()
std_dev = npArray11.std()  # population standard deviation (ddof=0, the NumPy default)
print("mean : " , mean )
print("Standard deviation : ", std_dev)

# z-score: shift by the mean, then scale by the standard deviation
normalized_array = (npArray11 - mean)/std_dev
print(normalized_array)

# After standardising, the mean should be 0 and the standard deviation 1
print("New Mean = " , np.mean(normalized_array))
print("New Standard deviation = ", np.std(normalized_array))

mean :  20.0
Standard deviation :  24.041630560342615
[-0.83189033 -0.79029581 -0.4991342   0.29116162  1.83015873]
New Mean =  0.0
New Standard deviation =  1.0


### Logical operations

In [64]:
# Even numbers from 0 to 18 (the stop value 20 is excluded)
npArray12 = np.arange(0,20,2)

# A comparison is applied element-wise and yields a boolean array
print(npArray12 > 7)

[False False False False  True  True  True  True  True  True]


In [65]:
# Boolean-mask indexing: keep only the elements for which the condition is True
print(npArray12[npArray12 < 9])

[0 2 4 6 8]


In [75]:
# Combine conditions with & (element-wise AND); each comparison needs its own
# parentheses because & binds more tightly than >= and <=
print(npArray12[(npArray12 >= 5) & (npArray12 <= 9)])

[6 8]


# Pandas

* Pandas is a powerful data manipulation library in Python
* It provides two primary data structures
    1. Series
        * A Series is a one-dimensional, array-like object
    2. DataFrame
        * A DataFrame is a two-dimensional, size-mutable, and potentially heterogeneous tabular data structure with labeled axes (rows and columns)


In [1]:
!pip install pandas


Defaulting to user installation because normal site-packages is not writeable


In [2]:
#import pandas
import pandas as pd

## Series
* A pandas Series is a one-dimensional, array-like object that can hold any data type
* It is similar to a column in a table


In [5]:
# Build a Series from a plain Python list; pandas supplies a default integer index.
data = list(range(10))
print("data \n" , data)

pd_series_data = pd.Series(data)
print("Series data \n", pd_series_data)
print(type(pd_series_data))


data 
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Series data 
 0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64
<class 'pandas.core.series.Series'>


In [11]:
# Create a Series from a dictionary: the keys become the index labels
data = dict(a=1, b=2, c=3)
print("data \n: ", data)
series_dict = pd.Series(data)
print(series_dict)


data 
:  {'a': 1, 'b': 2, 'c': 3}
a    1
b    2
c    3
dtype: int64


In [12]:
# Create a Series from a list of values with explicit index labels
data = [10, 20, 30]
index = list("abc")

print(pd.Series(data=data, index=index))

a    10
b    20
c    30
dtype: int64


## DataFrame 

In [14]:
# Create a DataFrame from a dictionary of lists:
# each key becomes a column name and each list holds that column's values
data = {
    'Name' : ["Jegan", "Venujan" , "Dino"],
    'Age' : [10,15,20],
    'City': ["Badulla", "Batti" , "Vavuniya"]
}

In [15]:
# Inspect the raw dictionary before converting it to a DataFrame
print(data)

{'Name': ['Jegan', 'Venujan', 'Dino'], 'Age': [10, 15, 20], 'City': ['Badulla', 'Batti', 'Vavuniya']}


In [16]:
# Convert the dictionary to a DataFrame; the rows receive a default integer index
df = pd.DataFrame(data)
print(df)

      Name  Age      City
0    Jegan   10   Badulla
1  Venujan   15     Batti
2     Dino   20  Vavuniya


In [18]:
# Create a DataFrame from a list of dictionaries:
# each dictionary is one row and its keys are the column names
data = [
    {"Name":"Jegan", "Age":10 , "City":"Badulla"},
    {"Name":"Venujan", "Age":15 , "City":"Batti"},
    {"Name":"Dino", "Age":20 , "City":"Vavuniya"}
]
df = pd.DataFrame(data)
print(df)

      Name  Age      City
0    Jegan   10   Badulla
1  Venujan   15     Batti
2     Dino   20  Vavuniya


In [20]:
# Load the sales dataset (path is relative to the current working directory)
# and preview the first 5 rows
df = pd.read_csv("./sales.csv")
df.head(5)

Unnamed: 0,Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
0,1,EB54EF1154C3A78,Heather,Callahan,Mosley-David,Lake Jeffborough,Norway,043-797-5229,915.112.1727,urangel@espinoza-francis.net,2020-08-26,http://www.escobar.org/
1,2,10dAcafEBbA5FcA,Kristina,Ferrell,"Horn, Shepard and Watson",Aaronville,Andorra,932-062-1802,(209)172-7124x3651,xreese@hall-donovan.com,2020-04-27,https://tyler-pugh.info/
2,3,67DAB15Ebe4BE4a,Briana,Andersen,Irwin-Oneal,East Jordan,Nepal,8352752061,(567)135-1918,haleybraun@blevins-sexton.com,2022-03-22,https://www.mack-bell.net/
3,4,6d350C5E5eDB4EE,Patty,Ponce,Richardson Group,East Kristintown,Northern Mariana Islands,302.398.3833,196-189-7767x770,hohailey@anthony.com,2020-07-02,https://delacruz-freeman.org/
4,5,5820deAdCF23EFe,Kathleen,Mccormick,Carson-Burch,Andresmouth,Macao,001-184-153-9683x1497,552.051.2979x342,alvaradojesse@rangel-shields.com,2021-01-17,https://welch.info/


In [21]:
# Select a single column by its label; the result is a Series
df['First Name']

0        Heather
1       Kristina
2         Briana
3          Patty
4       Kathleen
          ...   
9995      Meghan
9996    Jeremiah
9997       Peggy
9998        Evan
9999     Melissa
Name: First Name, Length: 10000, dtype: object

In [22]:
# Label-based row selection: the row whose index label is 3
df.loc[3]

Index                                            4
Customer Id                        6d350C5E5eDB4EE
First Name                                   Patty
Last Name                                    Ponce
Company                           Richardson Group
City                              East Kristintown
Country                   Northern Mariana Islands
Phone 1                               302.398.3833
Phone 2                           196-189-7767x770
Email                         hohailey@anthony.com
Subscription Date                       2020-07-02
Website              https://delacruz-freeman.org/
Name: 3, dtype: object

In [24]:
# Position-based row selection: the row at 0-based position 7
df.iloc[7]

Index                                          8
Customer Id                      47C5cEE243c9A7b
First Name                                 Glenn
Last Name                                Wiggins
Company                             Glenn-Harvey
City                                  Ambershire
Country              Falkland Islands (Malvinas)
Phone 1                         245-207-5608x563
Phone 2                               8806867785
Email                     changkellie@howell.com
Subscription Date                     2021-04-02
Website                      http://carlson.com/
Name: 7, dtype: object

In [25]:
# Row with index label 10, then the value at 0-based position 5 of that row (the "City" column).
# Explicit `.iloc` is required here: passing an integer key to `Series[...]` on a
# label-indexed Series is a deprecated positional lookup and emits a FutureWarning.
df.loc[10].iloc[5]

  df.loc[10][5]


'Lake Bobton'

In [32]:
# Access a single element by row label and column label
df.at[2, "First Name"]

'Briana'

In [34]:
# Access a single element by integer position (row 2, column 3)
df.iat[2,3]

'Andersen'

### Data Manipulation with DataFrame

In [None]:
## Adding a column
# A new column needs exactly one value per row, so the range is derived from
# len(df) rather than a hard-coded row count. i starts at 1 to avoid dividing by zero.
df["Random"] = [(2*i**2 - 4*i**3)/i**2 for i in range(1, len(df) + 1)]
print(df)

      Index      Customer Id First Name   Last Name                   Company  \
0         1  EB54EF1154C3A78    Heather    Callahan              Mosley-David   
1         2  10dAcafEBbA5FcA   Kristina     Ferrell  Horn, Shepard and Watson   
2         3  67DAB15Ebe4BE4a     Briana    Andersen               Irwin-Oneal   
3         4  6d350C5E5eDB4EE      Patty       Ponce          Richardson Group   
4         5  5820deAdCF23EFe   Kathleen   Mccormick              Carson-Burch   
...     ...              ...        ...         ...                       ...   
9995   9996  DD0caa06ebf217c     Meghan       Cline            Richardson LLC   
9996   9997  C1C62ff90D0AfED   Jeremiah        Pena               Hodge-Nixon   
9997   9998  2EfAa6Fe435A7DA      Peggy         Key                  Vang LLC   
9998   9999  5dB06C3Fab5CCCb       Evan    Humphrey                 Gould Ltd   
9999  10000  9903e8fF4D8Ff0F    Melissa  Montgomery              Burns-Holden   

                  City     

In [42]:
# Remove the column.
# Rebinding the result (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns="Random", errors="ignore")

In [43]:
# Display the frame to confirm the "Random" column is gone
df

Unnamed: 0,Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
0,1,EB54EF1154C3A78,Heather,Callahan,Mosley-David,Lake Jeffborough,Norway,043-797-5229,915.112.1727,urangel@espinoza-francis.net,2020-08-26,http://www.escobar.org/
1,2,10dAcafEBbA5FcA,Kristina,Ferrell,"Horn, Shepard and Watson",Aaronville,Andorra,932-062-1802,(209)172-7124x3651,xreese@hall-donovan.com,2020-04-27,https://tyler-pugh.info/
2,3,67DAB15Ebe4BE4a,Briana,Andersen,Irwin-Oneal,East Jordan,Nepal,8352752061,(567)135-1918,haleybraun@blevins-sexton.com,2022-03-22,https://www.mack-bell.net/
3,4,6d350C5E5eDB4EE,Patty,Ponce,Richardson Group,East Kristintown,Northern Mariana Islands,302.398.3833,196-189-7767x770,hohailey@anthony.com,2020-07-02,https://delacruz-freeman.org/
4,5,5820deAdCF23EFe,Kathleen,Mccormick,Carson-Burch,Andresmouth,Macao,001-184-153-9683x1497,552.051.2979x342,alvaradojesse@rangel-shields.com,2021-01-17,https://welch.info/
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,DD0caa06ebf217c,Meghan,Cline,Richardson LLC,Petersport,Cote d'Ivoire,001-270-136-4738x419,001-349-692-1755x055,nicolasandrade@bates.info,2022-04-27,http://thomas-ramos.com/
9996,9997,C1C62ff90D0AfED,Jeremiah,Pena,Hodge-Nixon,Briggsport,Namibia,+1-531-494-7645x4844,001-295-324-5502x5411,tracy68@wade.com,2022-01-17,https://www.campbell.com/
9997,9998,2EfAa6Fe435A7DA,Peggy,Key,Vang LLC,West Joyceport,United States Virgin Islands,520.109.2482x26052,+1-602-934-5901x95106,mclaughlinjulia@barnett-dorsey.com,2022-01-13,http://www.george-scott.com/
9998,9999,5dB06C3Fab5CCCb,Evan,Humphrey,Gould Ltd,Camachochester,Niger,001-465-242-9979,001-639-955-2116x751,velazquezjessica@villegas-wilcox.com,2020-04-14,https://www.payne.info/


In [44]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Index
count,10000.0
mean,5000.5
std,2886.89568
min,1.0
25%,2500.75
50%,5000.5
75%,7500.25
max,10000.0


# Data manipulation with Pandas and NumPy



In [48]:
# Load the second dataset (path is relative to the current working directory).
# NOTE: this rebinds `df`, replacing the sales frame loaded earlier in the notebook.
df = pd.read_csv("./data.csv")


In [49]:
# Preview the first 10 rows
df.head(10)

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,2023-01-01,A,28.0,Product1,754.0,East
1,2023-01-02,B,39.0,Product3,110.0,North
2,2023-01-03,C,32.0,Product2,398.0,East
3,2023-01-04,B,8.0,Product1,522.0,East
4,2023-01-05,B,26.0,Product3,869.0,North
5,2023-01-06,B,54.0,Product3,192.0,West
6,2023-01-07,A,16.0,Product1,936.0,East
7,2023-01-08,C,89.0,Product1,488.0,West
8,2023-01-09,C,37.0,Product3,772.0,West
9,2023-01-10,A,22.0,Product2,834.0,West


In [50]:
# Preview the last 10 rows
df.tail(10)

Unnamed: 0,Date,Category,Value,Product,Sales,Region
40,2023-02-10,B,15.0,Product1,578.0,East
41,2023-02-11,C,97.0,Product1,256.0,East
42,2023-02-12,A,93.0,Product3,164.0,West
43,2023-02-13,A,43.0,Product3,949.0,East
44,2023-02-14,A,96.0,Product3,830.0,East
45,2023-02-15,B,99.0,Product2,599.0,West
46,2023-02-16,B,6.0,Product1,938.0,South
47,2023-02-17,B,69.0,Product3,143.0,West
48,2023-02-18,C,65.0,Product3,182.0,North
49,2023-02-19,C,11.0,Product3,708.0,North


In [51]:
# Summary statistics for the numeric columns; `count` excludes missing values
df.describe()

Unnamed: 0,Value,Sales
count,47.0,46.0
mean,51.744681,557.130435
std,29.050532,274.598584
min,2.0,108.0
25%,27.5,339.0
50%,54.0,591.5
75%,70.0,767.5
max,99.0,992.0


In [54]:
# Data type of each column
df.dtypes

Date         object
Category     object
Value       float64
Product      object
Sales       float64
Region       object
dtype: object

## Handling missing values

In [55]:
# Boolean frame of the same shape as df: True wherever a value is missing
df.isnull()

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False


In [56]:
# Per column: does it contain at least one missing value?
df.isnull().any()

Date        False
Category    False
Value        True
Product     False
Sales        True
Region      False
dtype: bool

In [57]:
# Per row (axis=1): does it contain at least one missing value?
df.isnull().any(axis = 1)

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11     True
12    False
13    False
14    False
15     True
16    False
17     True
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28     True
29    False
30    False
31    False
32    False
33     True
34    False
35     True
36    False
37     True
38    False
39    False
40    False
41    False
42    False
43    False
44    False
45    False
46    False
47    False
48    False
49    False
dtype: bool

In [58]:
# Number of missing values in each column (True counts as 1 when summed)
df.isnull().sum()

Date        0
Category    0
Value       3
Product     0
Sales       4
Region      0
dtype: int64

In [61]:
# New frame with every missing value replaced by 0; df itself is left unchanged
df_filed_0 = df.fillna(0)

In [63]:
# Fill missing Sales values with the column mean (mean() skips NaN by default).
# The result goes into a new column, so the original Sales column is preserved.
df['sales_fill_NA'] = df['Sales'].fillna(df['Sales'].mean())
df

Unnamed: 0,Date,Category,Value,Product,Sales,Region,sales_fill_NA
0,2023-01-01,A,28.0,Product1,754.0,East,754.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0
5,2023-01-06,B,54.0,Product3,192.0,West,192.0
6,2023-01-07,A,16.0,Product1,936.0,East,936.0
7,2023-01-08,C,89.0,Product1,488.0,West,488.0
8,2023-01-09,C,37.0,Product3,772.0,West,772.0
9,2023-01-10,A,22.0,Product2,834.0,West,834.0
