# 1.7 NUMPY

In [40]:
import numpy as np

In [41]:
np

<module 'numpy' from '/home/codespace/.local/lib/python3.12/site-packages/numpy/__init__.py'>

## Creating arrays with NUMPY

In [42]:
np.ones(5)

array([1., 1., 1., 1., 1.])

In [43]:
np.full(10, 38)

array([38, 38, 38, 38, 38, 38, 38, 38, 38, 38])

In [44]:
a=np.array([1,2,3,4,5,7,12])
a

array([ 1,  2,  3,  4,  5,  7, 12])

In [45]:
# NumPy indexing is zero-based, so index 3 refers to the fourth element
a[3] = 100

In [46]:
a

array([  1,   2,   3, 100,   5,   7,  12])

In [47]:
# creating an array with a range of numbers
np.arange(15)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [48]:
# creating an array with a range of numbers from 3 up to (but not including) 10
np.arange(3, 10)

array([3, 4, 5, 6, 7, 8, 9])

In [49]:
# creating an array with a range of numbers starting from 0 to 100 with 11 numbers
# the third argument is how many numbers we want in the range
# np.linspace(start, stop, num) 
np.linspace(0,100,11)

array([  0.,  10.,  20.,  30.,  40.,  50.,  60.,  70.,  80.,  90., 100.])

In [50]:
# the third argument is step size
# np.arange doesn't include 'stop' value, so it will stop before reaching 101, if we want to include 100, we set stop to 101 (or any value greater than 100)
np.arange(0,101, 10)

array([  0,  10,  20,  30,  40,  50,  60,  70,  80,  90, 100])

## Multi-dimensional arrays

In [51]:
np.zeros((3, 4))  # 3 rows, 4 columns

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [52]:
n = np.array([
    [1, 2, 3], 
    [4, 5, 6], 
    [7, 8, 9]
    ])

n

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [53]:
n[0,1] = 100 # changing value at first row, second column

n

array([[  1, 100,   3],
       [  4,   5,   6],
       [  7,   8,   9]])

In [54]:
n[2] # accessing the third row

array([7, 8, 9])

In [55]:
n[2] = [111, 222, 333]

n

array([[  1, 100,   3],
       [  4,   5,   6],
       [111, 222, 333]])

In [56]:
n[:, 2] # accessing the third column

array([  3,   6, 333])

In [57]:
n[1, :] # accessing the second row

array([4, 5, 6])

## Randomly generated arrays

In [58]:
np.random.rand(5,2)

array([[0.20464863, 0.61927097],
       [0.29965467, 0.26682728],
       [0.62113383, 0.52914209],
       [0.13457995, 0.51357812],
       [0.18443987, 0.78533515]])

In [59]:
np.random.seed(2) # setting seed for reproducibility, so that we get the same random numbers every time we run the code
np.random.rand(5,2)

array([[0.4359949 , 0.02592623],
       [0.54966248, 0.43532239],
       [0.4203678 , 0.33033482],
       [0.20464863, 0.61927097],
       [0.29965467, 0.26682728]])

In [60]:
np.random.seed(2)
np.random.randn(5,2) # generating random numbers from a normal distribution (mean=0, std=1)

array([[-0.41675785, -0.05626683],
       [-2.1361961 ,  1.64027081],
       [-1.79343559, -0.84174737],
       [ 0.50288142, -1.24528809],
       [-1.05795222, -0.90900761]])

In [61]:
np.random.seed(2)
100* np.random.randn(5,2)

array([[ -41.67578474,   -5.62668272],
       [-213.61960957,  164.02708084],
       [-179.34355852,  -84.17473657],
       [  50.28814172, -124.52880866],
       [-105.79522189,  -90.90076149]])

In [62]:
np.random.seed(2)
np.random.randint(low=0, high=100, size = (5,2)) # generating random integers from 0 (inclusive) up to 100 (exclusive), i.e. 0-99; size is 5 rows and 2 columns

array([[40, 15],
       [72, 22],
       [43, 82],
       [75,  7],
       [34, 49]])

## Element-wise operations

In [63]:
a= np.arange(5)
a

array([0, 1, 2, 3, 4])

In [64]:
a+2


array([2, 3, 4, 5, 6])

In [65]:
b = 10*(a+2)
b

array([20, 30, 40, 50, 60])

In [66]:
11*(a+b)

array([220, 341, 462, 583, 704])

## Comparison operations

In [67]:
a

array([0, 1, 2, 3, 4])

In [68]:
a >= 2

array([False, False,  True,  True,  True])

In [69]:
b

array([20, 30, 40, 50, 60])

In [70]:
a>b

array([False, False, False, False, False])

In [71]:
a[a>b-22] # element-wise comparison: keeps each a[i] for which a[i] > b[i] - 22

array([0])

In [72]:
a[a>b] # accessing elements of a that are greater than b

array([], dtype=int64)

## Summarizing operations

In [73]:
a

array([0, 1, 2, 3, 4])

In [74]:
a.mean()

np.float64(2.0)

In [75]:
a.std()

np.float64(1.4142135623730951)

In [76]:
n.min() # minimum value in the array

np.int64(1)

# 1.8 Linear Algebra

## Vector operations

![image.png](attachment:image.png)

In linear algebra, vectors are typically written as column vectors, not rows,

but in NumPy a vector will usually look like this:

![image.png](attachment:image.png)

## Multiplication



### vector-vector multiplication (dot product)

We multiply the two vectors element by element and then sum the products.

![image.png](attachment:image.png)

In linear algebra the dot product is usually written in the form 'row vector × column vector' --- we can use the transpose to convert between rows and columns.

![image.png](attachment:image.png)

In [2]:
import numpy as np

In [3]:
u = np.array([1, 2, 3])
v = np.array([4, 5, 6])

In [4]:
def vector_vectore_multiplication(u, v):
    """Compute the dot product of two vectors with an explicit loop.

    Parameters
    ----------
    u, v
        Objects exposing ``.shape`` and integer indexing (e.g. NumPy arrays).
        Both must have the same size along their first axis.

    Returns
    -------
    The sum of ``u[i] * v[i]`` over every index ``i``. Accumulation starts
    from ``0.0``, so integer inputs still produce a floating-point result.

    Raises
    ------
    AssertionError
        If the first dimensions of ``u`` and ``v`` differ.
    """
    size = u.shape[0]
    assert size == v.shape[0]  # both operands need the same number of entries

    total = 0.0  # float seed keeps the accumulated value a float
    for idx in range(size):
        total = total + u[idx] * v[idx]

    return total
   

In [6]:
vector_vectore_multiplication(u, v)

np.float64(32.0)

In [None]:
np.dot(u, v) # dot product of u and v ----- same as vector_vectore_multiplication(u, v)

np.int64(32)

### Matrix - vector multiplication

![image-2.png](attachment:image-2.png)

In [13]:
U = np.array([
    [2, 4, 5, 6],
    [1, 2, 1, 2],
    [3, 1, 2, 1]
])

V = np.array([1, 0.5, 2, 1])

In [14]:
U.shape # shape of the array U, returns (3, 4) meaning 3 rows and 4 columns

(3, 4)

In [15]:
U.shape[0] # number of rows in U

3

![image.png](attachment:image.png)

In [17]:
U.dot(V)

array([20. ,  6. ,  8.5])

### Matrix - matrix multiplication

![image.png](attachment:image.png)

In [19]:
U = np.array([
    [2, 4, 5, 6],
    [1, 2, 1, 2],
    [3, 1, 2, 1]
])

X = np.array([
    [1, 1, 2],
    [0, 0.5, 1],
    [0, 2, 1],
    [2, 1, 0]
])

![image-2.png](attachment:image-2.png)

In [20]:
U.dot(X)

array([[14. , 20. , 13. ],
       [ 5. ,  6. ,  5. ],
       [ 5. ,  8.5,  9. ]])

## Identity Matrix

In [21]:
np.eye(10)

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

An identity matrix is a square matrix with ones on the main diagonal and zeros everywhere else, acting as the multiplicative identity in matrix multiplication.

## Matrix inverse

The inverse of a square matrix $A$ is another matrix $A^{-1}$ such that when you multiply them, you get the identity matrix: $A \cdot A^{-1} = I$. Not all matrices have an inverse; only square, non-singular matrices do.

矩阵的逆（逆矩阵）是指：对于一个方阵 $A$，存在另一个矩阵 $A^{-1}$，使得 $A \cdot A^{-1} = I$，其中 $I$ 是单位矩阵。不是所有矩阵都有逆，只有方阵且非奇异（可逆）的矩阵才有逆矩阵。

To compute the inverse of a square matrix in NumPy, use `np.linalg.inv()`. The matrix must be square and invertible.



In [22]:
A = np.array([
    [1, 2],
    [3, 4]
])

A_inv = np.linalg.inv(A)
A_inv

array([[-2. ,  1. ],
       [ 1.5, -0.5]])



If you multiply a matrix by its inverse, you get the identity matrix.

# 1.9 Pandas 

https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/appendix-d-pandas.ipynb

In [23]:
import numpy as np
import pandas as pd

## Data frames

In [24]:
data = [
    ['Nissan', 'Stanza', 1991, 138, 4, 'MANUAL', 'sedan', 2000],
    ['Hyundai', 'Sonata', 2017, None, 4, 'AUTOMATIC', 'Sedan', 27150],
    ['Lotus', 'Elise', 2010, 218, 4, 'MANUAL', 'convertible', 54990],
    ['GMC', 'Acadia',  2017, 194, 4, 'AUTOMATIC', '4dr SUV', 34450],
    ['Nissan', 'Frontier', 2017, 261, 6, 'MANUAL', 'Pickup', 32340],
]

columns = [
    'Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders',
    'Transmission Type', 'Vehicle_Style', 'MSRP'
]

In [26]:
df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle_Style,MSRP
0,Nissan,Stanza,1991,138.0,4,MANUAL,sedan,2000
1,Hyundai,Sonata,2017,,4,AUTOMATIC,Sedan,27150
2,Lotus,Elise,2010,218.0,4,MANUAL,convertible,54990
3,GMC,Acadia,2017,194.0,4,AUTOMATIC,4dr SUV,34450
4,Nissan,Frontier,2017,261.0,6,MANUAL,Pickup,32340


In [29]:
df.head(n=3)

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle_Style,MSRP
0,Nissan,Stanza,1991,138.0,4,MANUAL,sedan,2000
1,Hyundai,Sonata,2017,,4,AUTOMATIC,Sedan,27150
2,Lotus,Elise,2010,218.0,4,MANUAL,convertible,54990


In [31]:
df['id'] = ['nis1', 'hyu1', 'lot2', 'gmc1', 'nis2']
df

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle_Style,MSRP,id
0,Nissan,Stanza,1991,138.0,4,MANUAL,sedan,2000,nis1
1,Hyundai,Sonata,2017,,4,AUTOMATIC,Sedan,27150,hyu1
2,Lotus,Elise,2010,218.0,4,MANUAL,convertible,54990,lot2
3,GMC,Acadia,2017,194.0,4,AUTOMATIC,4dr SUV,34450,gmc1
4,Nissan,Frontier,2017,261.0,6,MANUAL,Pickup,32340,nis2


In [32]:
del df['id']
df

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle_Style,MSRP
0,Nissan,Stanza,1991,138.0,4,MANUAL,sedan,2000
1,Hyundai,Sonata,2017,,4,AUTOMATIC,Sedan,27150
2,Lotus,Elise,2010,218.0,4,MANUAL,convertible,54990
3,GMC,Acadia,2017,194.0,4,AUTOMATIC,4dr SUV,34450
4,Nissan,Frontier,2017,261.0,6,MANUAL,Pickup,32340


In Python, `del` is a keyword used to delete objects.  
When you write `del df['id']`, it deletes (removes) the `'id'` column from the DataFrame `df`.

In [40]:
df.loc[[2, 4]]

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle_Style,MSRP
2,Lotus,Elise,2010,218.0,4,MANUAL,convertible,54990
4,Nissan,Frontier,2017,261.0,6,MANUAL,Pickup,32340


In [39]:
df.loc[2]

Make                       Lotus
Model                      Elise
Year                        2010
Engine HP                  218.0
Engine Cylinders               4
Transmission Type         MANUAL
Vehicle_Style        convertible
MSRP                       54990
Name: 2, dtype: object

- `df.loc[[2]]` returns a **DataFrame** containing the row(s) with index label 2 (as a list, so the result is still a DataFrame).
- `df.loc[2]` returns a **Series** for the row with index label 2 (just that single row).

Use `df.loc[[2]]` if you want to keep the result as a DataFrame, and `df.loc[2]` if you want a Series.

In [34]:
df.iloc[2]

Make                       Lotus
Model                      Elise
Year                        2010
Engine HP                  218.0
Engine Cylinders               4
Transmission Type         MANUAL
Vehicle_Style        convertible
MSRP                       54990
Name: 2, dtype: object

`iloc` is used to select rows and columns by **integer position** (like row 0, 1, 2), while `loc` is used to select rows and columns by **label** (like the index name or column name).  
- `df.iloc[2]` gets the third row by position.  
- `df.loc[2]` gets the row with index label 2 (which may not be the third row if the index is customized).

In [43]:
df.index = ['aa', 'bb', 'cc', 'dd', 'ee']  
df

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle_Style,MSRP
aa,Nissan,Stanza,1991,138.0,4,MANUAL,sedan,2000
bb,Hyundai,Sonata,2017,,4,AUTOMATIC,Sedan,27150
cc,Lotus,Elise,2010,218.0,4,MANUAL,convertible,54990
dd,GMC,Acadia,2017,194.0,4,AUTOMATIC,4dr SUV,34450
ee,Nissan,Frontier,2017,261.0,6,MANUAL,Pickup,32340


In [None]:
df.reset_index() # resetting the index, it will create a new column with the old index values and set a new default index (0, 1, 2, ...)

Unnamed: 0,index,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle_Style,MSRP
0,aa,Nissan,Stanza,1991,138.0,4,MANUAL,sedan,2000
1,bb,Hyundai,Sonata,2017,,4,AUTOMATIC,Sedan,27150
2,cc,Lotus,Elise,2010,218.0,4,MANUAL,convertible,54990
3,dd,GMC,Acadia,2017,194.0,4,AUTOMATIC,4dr SUV,34450
4,ee,Nissan,Frontier,2017,261.0,6,MANUAL,Pickup,32340


## Element-wise operations

In [46]:
df['Engine HP'] *2

aa    276.0
bb      NaN
cc    436.0
dd    388.0
ee    522.0
Name: Engine HP, dtype: float64

In [50]:
df['Year'] >= 2015

aa    False
bb     True
cc    False
dd     True
ee     True
Name: Year, dtype: bool

In [51]:
df.index = [1, 2, 3, 4, 5]  # setting index to a new list of values
df['Year'] >= 2015


1    False
2     True
3    False
4     True
5     True
Name: Year, dtype: bool

## Filtering

The pattern `df[df['column'] <condition>]` is used in pandas to **filter rows** of a DataFrame based on a condition.

- Inside the brackets, you put a condition that returns a boolean Series (True/False for each row).
- Only the rows where the condition is `True` are kept.



For example, `df[df['Year'] >= 2015]` keeps only the rows where the `'Year'` column is greater than or equal to 2015.  
The part `df['Year'] >= 2015` creates a boolean Series, and `df[...]` uses it to filter the DataFrame.

In [53]:
df[
   df['Year'] >= 2015
]

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle_Style,MSRP
2,Hyundai,Sonata,2017,,4,AUTOMATIC,Sedan,27150
4,GMC,Acadia,2017,194.0,4,AUTOMATIC,4dr SUV,34450
5,Nissan,Frontier,2017,261.0,6,MANUAL,Pickup,32340


In [54]:
df[
    (df['Year'] >= 2015) & (df['Engine HP'] > 200)
]

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle_Style,MSRP
5,Nissan,Frontier,2017,261.0,6,MANUAL,Pickup,32340


## String operations

In [None]:
# str.lower() converts all characters in the string to lowercase
df['Vehicle_Style'].str.lower()

1          sedan
2          sedan
3    convertible
4        4dr suv
5         pickup
Name: Vehicle_Style, dtype: object

In [None]:
# str.replace() replaces all occurrences of a substring with another substring
df['Vehicle_Style'].str.replace(' ', '_').str.lower()

1          sedan
2          sedan
3    convertible
4        4dr_suv
5         pickup
Name: Vehicle_Style, dtype: object

In [58]:
df['Vehicle_Style'] = df['Vehicle_Style'].str.replace(' ', '_').str.lower()

df


Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle_Style,MSRP
1,Nissan,Stanza,1991,138.0,4,MANUAL,sedan,2000
2,Hyundai,Sonata,2017,,4,AUTOMATIC,sedan,27150
3,Lotus,Elise,2010,218.0,4,MANUAL,convertible,54990
4,GMC,Acadia,2017,194.0,4,AUTOMATIC,4dr_suv,34450
5,Nissan,Frontier,2017,261.0,6,MANUAL,pickup,32340


## Summarizing operations

In [61]:
df.MSRP.mean()

np.float64(30186.0)

In [62]:
df.MSRP.describe()

count        5.000000
mean     30186.000000
std      18985.044904
min       2000.000000
25%      27150.000000
50%      32340.000000
75%      34450.000000
max      54990.000000
Name: MSRP, dtype: float64

In [64]:
df.describe() # numeric columns only

Unnamed: 0,Year,Engine HP,Engine Cylinders,MSRP
count,5.0,4.0,5.0,5.0
mean,2010.4,202.75,4.4,30186.0
std,11.260551,51.29896,0.894427,18985.044904
min,1991.0,138.0,4.0,2000.0
25%,2010.0,180.0,4.0,27150.0
50%,2017.0,206.0,4.0,32340.0
75%,2017.0,228.75,4.0,34450.0
max,2017.0,261.0,6.0,54990.0


In [65]:
df.describe().round(2) # rounding the values in the DataFrame to 2 decimal places

Unnamed: 0,Year,Engine HP,Engine Cylinders,MSRP
count,5.0,4.0,5.0,5.0
mean,2010.4,202.75,4.4,30186.0
std,11.26,51.3,0.89,18985.04
min,1991.0,138.0,4.0,2000.0
25%,2010.0,180.0,4.0,27150.0
50%,2017.0,206.0,4.0,32340.0
75%,2017.0,228.75,4.0,34450.0
max,2017.0,261.0,6.0,54990.0


In [66]:
df.Make.nunique()  # number of unique values in the 'Make' column

4

In [68]:
df.nunique()  # number of unique values in each column of the DataFrame

Make                 4
Model                5
Year                 3
Engine HP            4
Engine Cylinders     2
Transmission Type    2
Vehicle_Style        4
MSRP                 5
dtype: int64

## Missing values

In [70]:
df.isnull()

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle_Style,MSRP
1,False,False,False,False,False,False,False,False
2,False,False,False,True,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False


In [71]:
df.isnull().sum()  # counting the number of missing values in each column

Make                 0
Model                0
Year                 0
Engine HP            1
Engine Cylinders     0
Transmission Type    0
Vehicle_Style        0
MSRP                 0
dtype: int64

## Grouping

![image.png](attachment:image.png)

The query above shows this grouping written in SQL; the pandas equivalent uses `groupby`:

In [75]:
df.groupby('Transmission Type').MSRP.mean().round(2)  # grouping by 'Transmission Type' and calculating the average MSRP for each group

Transmission Type
AUTOMATIC    30800.00
MANUAL       29776.67
Name: MSRP, dtype: float64

## Getting the numpy arrays

`iloc` is used to select rows and columns by **integer position** (like row 0, 1, 2), while `loc` is used to select rows and columns by **label** (like the index name or column name).  
- `df.iloc[2]` gets the third row by position.  
- `df.loc[2]` gets the row with index label 2 (which may not be the third row if the index is customized).

To get the underlying NumPy array from a pandas Series or DataFrame, use the `.values` attribute (or the `.to_numpy()` method), e.g. `df.MSRP.values`.