###  ⚓️ Working with Numpy
<hr/>

In [1]:
import numpy as np

Objective: to convert a sample Python List into ND Array

In [2]:
sampleList=[10,20,30,40,50]

np.array() <---- this function converts the given list into ND Array

In [3]:
myarr = np.array ( sampleList )

In [4]:
type(myarr)

numpy.ndarray

In [5]:
print(sampleList) # this is List
print(myarr) # this is 1D Array

[10, 20, 30, 40, 50]
[10 20 30 40 50]


In [6]:
myarr.std()

14.142135623730951

In [8]:
np.median(myarr)

30.0

In [9]:
myarr.mean()

30.0

In [10]:
sampleList

[10, 20, 30, 40, 50]

In [11]:
myarr

array([10, 20, 30, 40, 50])

In [None]:
# ease of computing.

In [12]:
# Adding a scalar to a plain Python list is not element-wise arithmetic: `+` on a
# list means concatenation, so this raises TypeError. The error is the point of the
# demonstration, so catch it and show the message instead of letting the cell fail
# (an uncaught exception would stop "Restart Kernel -> Run All" at this cell).
try:
    sampleList + 1
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: can only concatenate list (not "int") to list

In [13]:
myarr + 1

array([11, 21, 31, 41, 51])

## High Performance 

<hr/>

In [14]:
# Memory Consumption Test

# Objective : to find how much memory is consumed by 1 integer in python vs numpy

In [15]:
# Python Test

import sys

print(sys.getsizeof(5),' bytes')

28  bytes


In [16]:
# Numpy Test

import numpy as np

print(np.array([1,2,3]).itemsize,' bytes')

8  bytes


In [None]:
# Speed Test

# Objective: add 1 million numbers element-wise using plain Python and NumPy.
# Whichever takes less time is the winner.

import numpy as np

import time

SIZE = 1000000

# Plain-Python operands: lazy `range` objects, consumed pairwise by zip() below.
A1 = range(SIZE)
A2 = range(SIZE)

# NumPy operands: ndarrays holding the same values 0 .. SIZE-1.
L1 = np.arange(SIZE)
L2 = np.arange(SIZE)

# time.perf_counter() is the clock intended for measuring short durations: it is
# monotonic and uses the highest-resolution timer available, whereas time.time()
# is wall-clock time that can jump if the system clock is adjusted and can be too
# coarse to resolve the millisecond-scale NumPy addition.
start = time.perf_counter()
result = [x + y for x, y in zip(A1, A2)]
stop = time.perf_counter()

print('Python took ', (stop-start) * 1000,' ms')


start = time.perf_counter()

result = L1 + L2  # a single array expression adds every pair of elements

stop = time.perf_counter()

print('Numpy Took ',(stop-start) * 1000,' ms')

### Creating 1D , 2D and 3D Array
<hr/>

In [21]:
# One-dimensional case: a flat list of numbers turns into an ndarray with one axis.
a = [10, 20, 30, 40]
arr = np.array(a)

# First the values, then the number of axes reported by `ndim`.
print(arr)
print(arr.ndim, ' dimension(s)')

[10 20 30 40]
1  dimension(s)


In [22]:
# Two-dimensional case: a list of equal-length 1D rows becomes a matrix.
b = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
b_arr = np.array(b)

# Show the matrix, then confirm it has two axes (rows and columns).
print(b_arr)
print(b_arr.ndim, ' dimension(s)')

[[1 2 3]
 [4 5 6]
 [7 8 9]]
2  dimension(s)


In [23]:
b_arr[2,0]

7

In [24]:
b_arr[-1,0]

7

In [None]:
## 3D Array : Group of 2D Array Forms 3D Array

In [25]:

# Three-dimensional case: three 2x3 blocks nested in one list -> shape (3, 2, 3).
c = [
    [[1, 2, 3], [4, 5, 6]],        # block 0
    [[7, 8, 9], [10, 11, 12]],     # block 1
    [[6, 5, 7], [9, 10, 11]],      # block 2
]

c_arr = np.array(c)

# Show the stacked blocks, then confirm the array has three axes.
print(c_arr)
print(c_arr.ndim, ' Dimension(s)')

[[[ 1  2  3]
  [ 4  5  6]]

 [[ 7  8  9]
  [10 11 12]]

 [[ 6  5  7]
  [ 9 10 11]]]
3  Dimension(s)


In [26]:
c_arr[-1,-1,-1]

11

In [27]:
c_arr[0,1,2]

6

### Slicing in Matrix
<hr/>

In [28]:
# A 4x5 matrix holding the numbers 1..20 row by row.
b = [
    [ 1,  2,  3,  4,  5],
    [ 6,  7,  8,  9, 10],
    [11, 12, 13, 14, 15],
    [16, 17, 18, 19, 20],
]
b_arr = np.array(b)

# Last expression of the cell: display the matrix.
b_arr

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15],
       [16, 17, 18, 19, 20]])

Let us Find some statistical Facts from Data.

In [34]:
print('Sum ',b_arr.sum())
print('Min ',b_arr.min())
print('Max ',b_arr.max())
print('Mean ',b_arr.mean())

Sum  210
Min  1
Max  20
Mean  10.5


In [35]:
print('Column wise Sum ',b_arr.sum(axis=0))

Column wise Sum  [34 38 42 46 50]


In [36]:
print('Row Wise Sum ',b_arr.sum(axis=1))

Row Wise Sum  [15 40 65 90]


In [38]:
b_arr.min(axis=0)

array([1, 2, 3, 4, 5])

In [39]:
b_arr.min(axis=1)

array([ 1,  6, 11, 16])

In [None]:
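# What is axis?
# axis names the dimension that the operation collapses:
# axis=0 runs down the rows, giving one result per column (column-wise)
# axis=1 runs across the columns, giving one result per row (row-wise)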

In [None]:
# Row indexes targeted : 1 and 2     -> slice 1:3 (the stop index is excluded)
#
# Col indexes targeted : 1, 2 and 3  -> slice 1:4

In [33]:
b_arr[1:3,1:4]

array([[ 7,  8,  9],
       [12, 13, 14]])

In [None]:
# Row indexes targeted : 0 and 1     -> slice :2
# Col indexes targeted : 2, 3 and 4  -> slice 2:5

In [32]:
b_arr[:2 , 2:5]

array([[ 3,  4,  5],
       [ 8,  9, 10]])

In [31]:
b_arr[2,:]

array([11, 12, 13, 14, 15])

In [None]:
# Syntax (slicing):
#
# arr[rowIndex, colIndex]

In [None]:
# Row indexes targeted : all rows -> slice :
#
# Col index targeted   : 3

In [30]:
b_arr [ :, 3]

array([ 4,  9, 14, 19])

In [40]:
## Quick Activity:
# Data of riders (presumably paired by position: entry i of each list is rider i).
# Objective: using numpy, find the speed of each rider.

distance = [
    190, 180, 140, 160,
    175, 182, 146, 178,
]

# NOTE(review): this list rebinds the name `time`, hiding the `time` module that the
# speed-test cell imports; that cell re-imports it, but consider a distinct name.
time = [
    9, 8, 4, 6,
    3, 7, 4, 6,
]

In [41]:
d_arr  = np.array(distance)

t_arr  = np.array(time)

In [42]:
d_arr

array([190, 180, 140, 160, 175, 182, 146, 178])

In [43]:
t_arr

array([9, 8, 4, 6, 3, 7, 4, 6])

In [46]:
speed_arr = np.round(d_arr / t_arr, 2)

In [47]:
speed_arr

array([21.11, 22.5 , 35.  , 26.67, 58.33, 26.  , 36.5 , 29.67])

In [48]:
# filtering in Numpy

In [49]:
speed_arr[  speed_arr<30 ]

array([21.11, 22.5 , 26.67, 26.  , 29.67])

In [51]:
# Keep only the speeds inside the closed interval [25, 30]; `&` combines the two
# boolean masks element-wise, as np.logical_and does for boolean arrays.
speed_arr[(speed_arr >= 25) & (speed_arr <= 30)]

array([26.67, 26.  , 29.67])

In [None]:
# Post-break topics:
#
#     Pandas (Introduction)
#     Series & DataFrames

## Pandas
<hr/>

In [None]:
# How does Pandas provide its analytical functionality?
#
# Pandas exposes its functionality through its data structures:
#
#         - Series
#         - DataFrame

In [None]:
# Series    : used to store/analyse/manipulate 1D (linear) data, e.g. a list.
# DataFrame : used to store/analyse/manipulate 2D (tabular) data, e.g. an Excel sheet.

# Working with Series
<hr/>

In [64]:
import pandas as pd

Objective: to convert Python list into Series

In [65]:
ice_creme = ['vanilla','strawberry','rum-raisin','chocolate']

pd.Series() <---- this function converts the list into Series 

In [66]:
myseries = pd.Series(ice_creme)

In [67]:
type(myseries)

pandas.core.series.Series

In [68]:
myseries

0       vanilla
1    strawberry
2    rum-raisin
3     chocolate
dtype: object

## Working with DataFrame
<hr/>

In [69]:
import pandas as pd

pd.read_csv() # this function reads csv file content and returns data as DataFrame.

In [70]:
nba = pd.read_csv('nba.csv')

In [71]:
type(nba)

pandas.core.frame.DataFrame

In [72]:
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


# Data Inspection

In [73]:
nba.shape

(458, 9)

In [74]:
nba.columns

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [75]:
nba.dtypes

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [76]:
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


In [77]:
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [78]:
nba.tail()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [79]:
nba.head(8)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
5,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0


In [81]:
nba.isnull().sum()

Name         1
Team         1
Number       1
Position     1
Age          1
Height       1
Weight       1
College     85
Salary      12
dtype: int64

In [None]:
# What if I want rows from the middle of the DataFrame? Select them by position with .iloc.

In [82]:
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [83]:
nba.iloc[2]

Name             John Holland
Team           Boston Celtics
Number                   30.0
Position                   SG
Age                      27.0
Height                    6-5
Weight                  205.0
College     Boston University
Salary                    NaN
Name: 2, dtype: object

In [84]:
nba.iloc[3:6]

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
5,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0


In [85]:
nba.iloc[[2,6,9]]

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
9,Marcus Smart,Boston Celtics,36.0,PG,22.0,6-4,220.0,Oklahoma State,3431040.0


## Selecting a Column from a DataFrame
<hr/>

In [86]:
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [89]:
nba.Name.head(3)

0    Avery Bradley
1      Jae Crowder
2     John Holland
Name: Name, dtype: object

In [90]:
nba['Name'].head(3)

0    Avery Bradley
1      Jae Crowder
2     John Holland
Name: Name, dtype: object

In [None]:
# Let us suppose we have a column named 'my college':
#
# nba.my college     ❎  (not valid Python: an attribute name cannot contain a space)
#
# nba['my college']  ✅

In [None]:
# Dot access cannot reach columns whose names contain spaces, e.g. 'my college'.
#
# For such columns use the square-bracket expression:
#
# df['my column']

## Selecting Multiple Columns
<hr/>

In [91]:
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [93]:
myColumns = ['Name','Position','College']

nba[myColumns].head(3)

Unnamed: 0,Name,Position,College
0,Avery Bradley,PG,Texas
1,Jae Crowder,SF,Marquette
2,John Holland,SG,Boston University


In [95]:
# Directly

nba[['Name','Position','College']].head(3)

Unnamed: 0,Name,Position,College
0,Avery Bradley,PG,Texas
1,Jae Crowder,SF,Marquette
2,John Holland,SG,Boston University


In [102]:
# Rows at positions 2-4 and columns at positions 1-2, selected in a single .iloc
# call: one indexer takes both the row slice and the column list, so there is no
# need to build an intermediate DataFrame with a second, chained .iloc.
nba.iloc[2:5, [1, 2]]

Unnamed: 0,Team,Number
2,Boston Celtics,30.0
3,Boston Celtics,28.0
4,Boston Celtics,8.0


In [97]:
nba.head(1)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0


In [104]:
# Filtering Records

In [103]:
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [None]:
# Find the players from 'Texas' college.

In [106]:
# Boolean mask: True for every row whose College value is exactly 'Texas'.
texasPlayers = nba['College'] == 'Texas'

# Indexing with the mask keeps only the True rows; preview the first three.
nba.loc[texasPlayers].head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
66,Cory Joseph,Toronto Raptors,6.0,PG,24.0,6-3,190.0,Texas,7000000.0
133,P.J. Tucker,Phoenix Suns,17.0,SF,31.0,6-6,245.0,Texas,5500000.0


In [107]:
# Direct Expression

nba[nba.College == 'Texas']

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
66,Cory Joseph,Toronto Raptors,6.0,PG,24.0,6-3,190.0,Texas,7000000.0
133,P.J. Tucker,Phoenix Suns,17.0,SF,31.0,6-6,245.0,Texas,5500000.0
179,Tristan Thompson,Cleveland Cavaliers,13.0,C,25.0,6-9,238.0,Texas,14260870.0
208,Myles Turner,Indiana Pacers,33.0,PF,20.0,6-11,243.0,Texas,2357760.0
289,Jordan Hamilton,New Orleans Pelicans,25.0,SG,25.0,6-7,220.0,Texas,1015421.0
294,LaMarcus Aldridge,San Antonio Spurs,12.0,PF,30.0,6-11,240.0,Texas,19689000.0
384,D.J. Augustin,Denver Nuggets,12.0,PG,28.0,6-0,183.0,Texas,3000000.0
414,Kevin Durant,Oklahoma City Thunder,35.0,SF,27.0,6-9,240.0,Texas,20158622.0
