In [1]:
w1, w2, w3 = 0.3, 0.2, 0.5

In [2]:
kanto_temp = 73
kanto_rainfall = 67
kanto_humidity = 43

In [3]:
kanto_yield_apples = kanto_temp * w1 + kanto_rainfall * w2 + kanto_humidity * w3

In [4]:
kanto_yield_apples

56.8

In [5]:
print("the expected yield of apples in kanto region is {} tons per hectare".format(kanto_yield_apples))

the expected yield of apples in kanto region is 56.8 tons per hectare


In [6]:
kanto = [73, 67, 43]
johto = [91, 88, 64]
hoenn = [87, 134, 58]
sinnoh = [102, 43, 37]
unova = [69, 96, 70]

In [7]:
weights = [w1, w2, w3]

##### DOT PRODUCT OF TWO VECTORS

In [8]:
def crop_yield(region, weights):
    result = 0
    for x, w in zip(region, weights):
        result += x * w
    return result

In [9]:
crop_yield(kanto, weights)

56.8

In [10]:
crop_yield(johto, weights)

76.9

In [11]:
crop_yield(unova, weights)

74.9

#### GOING FROM PYTHON LISTS TO NUMPY ARRAYS

In [12]:
import numpy as np

In [13]:
kanto = np.array([73, 67, 43])

In [14]:
kanto

array([73, 67, 43])

In [15]:
weights = np.array([w1, w2, w3])

In [16]:
weights

array([0.3, 0.2, 0.5])

In [17]:
type(kanto)

numpy.ndarray

In [18]:
type(weights)

numpy.ndarray

In [19]:
weights[0]

0.3

In [20]:
kanto[2]

43

#### OPERATING ON NUMPY ARRAYS

In [22]:
np.dot(kanto, weights)

56.8

In [24]:
kanto

array([73, 67, 43])

In [25]:
weights

array([0.3, 0.2, 0.5])

In [26]:
kanto * weights

array([21.9, 13.4, 21.5])

In [23]:
(kanto * weights).sum()

56.8

In [27]:
arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])

In [28]:
arr1 * arr2

array([ 4, 10, 18])

In [29]:
arr2.sum()

15

#### BENEFITS OF NUMPY ARRAYS

In [30]:
#PYTHON LISTS
arr1 = list(range(1000000))
arr2 = list(range(1000000, 2000000))

# numpy arrays
arr1_np = np.array(arr1)
arr2_np = np.array(arr2)

In [31]:
%%time
result = 0
for x1, x2 in zip(arr1, arr2):
    result += x1*x2
result

CPU times: total: 297 ms
Wall time: 312 ms


833332333333500000

In [32]:
%%time
np.dot(arr1_np, arr2_np)

CPU times: total: 0 ns
Wall time: 2 ms


-1942957984

In [None]:
# so np.dot is 100 times fster than using a for loop..useful in datasets with millions of data points

#### MULTIPLE DIMENSIONAL NUMPY ARRAYS

In [33]:
climate_data = np.array([[73, 67, 43],
                        [91, 88, 64],
                        [87, 134, 58],
                        [102, 43, 37],
                        [69, 96, 70]])

In [34]:
climate_data

array([[ 73,  67,  43],
       [ 91,  88,  64],
       [ 87, 134,  58],
       [102,  43,  37],
       [ 69,  96,  70]])

In [35]:
# 2d array (matrix)
climate_data.shape

(5, 3)

In [36]:
# 1d array (vector)
weights.shape

(3,)

In [37]:
# 3d array
arr3 = np.array([[[11, 12, 13],
                 [13, 14, 15]],
                [[15, 16, 17],
                [17, 18, 19.5]]])

In [38]:
arr3.shape

(2, 2, 3)

In [39]:
weights.dtype

dtype('float64')

In [40]:
climate_data.dtype

dtype('int32')

In [41]:
arr3.dtype

dtype('float64')

In [None]:
# to perform a matrix multiplication we can use np.matmul function or @ operator

In [42]:
climate_data

array([[ 73,  67,  43],
       [ 91,  88,  64],
       [ 87, 134,  58],
       [102,  43,  37],
       [ 69,  96,  70]])

In [43]:
weights

array([0.3, 0.2, 0.5])

In [44]:
np.matmul(climate_data, weights)

array([56.8, 76.9, 81.9, 57.7, 74.9])

In [45]:
climate_data @ weights

array([56.8, 76.9, 81.9, 57.7, 74.9])

### Arithmetic operations and broadcasting

In [2]:
import numpy as np

In [3]:
arr2 = np.array([[1, 2, 3, 4],
                [5, 6, 7, 8],
                [9, 1, 2, 3]])

In [4]:
arr3 = np.array([[12, 14, 15, 18],
                [20, 22, 24, 26],
                [28, 12, 14, 16]])

In [5]:
arr2 + arr3

array([[13, 16, 18, 22],
       [25, 28, 31, 34],
       [37, 13, 16, 19]])

In [6]:
#adding a scalar
arr2 + 3

array([[ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12,  4,  5,  6]])

In [7]:
# element wise substration
arr3 - arr2

array([[11, 12, 12, 14],
       [15, 16, 17, 18],
       [19, 11, 12, 13]])

In [8]:
#division by scalar
arr2 / 2

array([[0.5, 1. , 1.5, 2. ],
       [2.5, 3. , 3.5, 4. ],
       [4.5, 0.5, 1. , 1.5]])

In [9]:
# element wise multiplication
arr2 * arr3

array([[ 12,  28,  45,  72],
       [100, 132, 168, 208],
       [252,  12,  28,  48]])

In [10]:
# modulus with scalar
arr2 % 4

array([[1, 2, 3, 0],
       [1, 2, 3, 0],
       [1, 1, 2, 3]], dtype=int32)

In [None]:
# now broadcasting

In [11]:
arr2 = np.array([[1, 2, 3, 4],
                [5, 6, 7, 8],
                [9, 1, 2, 3]])

In [12]:
arr2.shape

(3, 4)

In [13]:
arr4 = np.array([4, 5, 6, 7])

In [14]:
arr4.shape

(4,)

In [15]:
arr2 + arr4

array([[ 5,  7,  9, 11],
       [ 9, 11, 13, 15],
       [13,  6,  8, 10]])

In [16]:
arr5 = np.array([7, 8])

In [17]:
arr2

array([[1, 2, 3, 4],
       [5, 6, 7, 8],
       [9, 1, 2, 3]])

In [18]:
arr2 + arr5

ValueError: operands could not be broadcast together with shapes (3,4) (2,) 

In [None]:
# numpy arrays also support comparison operations

In [20]:
arr1 = np.array([[1, 2, 3], [3, 4, 5]])
arr2 = np.array([[2, 2, 3], [1, 2, 5]])

In [21]:
arr1 == arr2

array([[False,  True,  True],
       [False, False,  True]])

In [22]:
arr1 != arr2

array([[ True, False, False],
       [ True,  True, False]])

In [23]:
arr1 >= arr2

array([[False,  True,  True],
       [ True,  True,  True]])

In [24]:
arr1 < arr2

array([[ True, False, False],
       [False, False, False]])

In [25]:
# it is used to count the number of equal elements in 2 arrays using the sum method
(arr1 == arr2).sum()

3

In [26]:
import os

In [27]:
os.getcwd()

'C:\\Users\\EMMANUEL\\PYTHON'

In [28]:
os.listdir('.')

['.ipynb_checkpoints',
 'Boston House Prices.csv',
 'Data analysis with python zero to pandas1.ipynb',
 'Data analysis with python zero to pandas2.ipynb',
 'Data analysis with python zero to pandas3.ipynb',
 'gdp.csv',
 'my_test.png',
 'new folder',
 'pivot_table.xlsx',
 'population_total.csv',
 'Python for data analysis excel users.ipynb',
 'StudentsPerformance.csv',
 'supermarket_sales.csv.xlsx',
 'supermarket_sales.xlsx']

In [30]:
#create a directory
os.makedirs('./data', exist_ok=True)

In [31]:
#to verify if the directory named data is created 
os.listdir('.')

['.ipynb_checkpoints',
 'Boston House Prices.csv',
 'data',
 'Data analysis with python zero to pandas1.ipynb',
 'Data analysis with python zero to pandas2.ipynb',
 'Data analysis with python zero to pandas3.ipynb',
 'gdp.csv',
 'my_test.png',
 'new folder',
 'pivot_table.xlsx',
 'population_total.csv',
 'Python for data analysis excel users.ipynb',
 'StudentsPerformance.csv',
 'supermarket_sales.csv.xlsx',
 'supermarket_sales.xlsx']

In [32]:
#method 2 to check if data is there
'data' in os.listdir('.')

True

In [33]:
#to check if the there is data in data;
os.listdir('./data')

[]

### FOR PARSING AND PROCESSING DATA FROM FILES

In [1]:
def parse_headers(header_line):
    return header_line.strip().split(',')

def parse_values(data_line):
    values = []
    for item in data_line.strip().split(','):
        if item == '':
            values.append(0.0)
        else:
            values.append(float(item))
    return values

def create_item_dict(values, headers):
    result = {}
    for value, header in zip(values, headers):
        result[header] = value
    return result