In [1]:
# Read the red-wine dataset with the standard-library csv module.
import csv

# 'winequality-red.csv' comes from the UCI ML repository and is ';'-separated.
# newline='' is what the csv documentation requires for file objects handed to
# csv.reader, so that newlines embedded in quoted fields are parsed correctly.
with open('winequality-red.csv', 'r', newline='') as f:
    # csv.reader splits each line on the delimiter; list() pulls every row
    # into memory as a list of strings (the header line is the first row).
    wines = list(csv.reader(f, delimiter=';'))
print(wines[:5])

[['fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"'], ['7.4', '0.7', '0', '1.9', '0.076', '11', '34', '0.9978', '3.51', '0.56', '9.4', '5'], ['7.8', '0.88', '0', '2.6', '0.098', '25', '67', '0.9968', '3.2', '0.68', '9.8', '5'], ['7.8', '0.76', '0.04', '2.3', '0.092', '15', '54', '0.997', '3.26', '0.65', '9.8', '5'], ['11.2', '0.28', '0.56', '1.9', '0.075', '17', '60', '0.998', '3.16', '0.58', '9.8', '6']]


In [2]:
# Average wine quality: the rating is the last field of every data row
# (the header row at index 0 is skipped).
qualities = [float(record[-1]) for record in wines[1:]]
qual_mean = sum(qualities) / len(qualities)
round(qual_mean, 1)

5.6

In [3]:
# NumPy makes numeric work on this data much easier and more flexible.
import numpy as np

# Drop the header row and convert the remaining string fields to floats.
# The isinstance guard keeps the cell idempotent: `wines` is rebound from a
# list of rows to an ndarray here, so an unguarded re-run would slice the
# array again with wines[1:] and silently discard the first data row.
if isinstance(wines, list):
    wines = np.array(wines[1:], dtype='float')
wines[:2]
wines.shape

array([[ 7.4   ,  0.7   ,  0.    ,  1.9   ,  0.076 , 11.    , 34.    ,
         0.9978,  3.51  ,  0.56  ,  9.4   ,  5.    ],
       [ 7.8   ,  0.88  ,  0.    ,  2.6   ,  0.098 , 25.    , 67.    ,
         0.9968,  3.2   ,  0.68  ,  9.8   ,  5.    ]])

(1599, 12)

In [4]:
# Show floats with three decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=3)

In [5]:
# Arrays can also be created directly from a shape:
# filled with zeros ...
empty_array = np.zeros(shape=(3, 4))
empty_array
# ... or filled with uniform random samples from [0, 1).
random_array = np.random.random(size=(3, 4))
random_array

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

array([[0.263, 0.176, 0.96 , 0.007],
       [0.266, 0.879, 0.978, 0.675],
       [0.651, 0.55 , 0.952, 0.907]])

In [6]:
# NumPy can parse the file itself: skip the header line, split on ';'
# and get a float array back directly.
wines = np.genfromtxt(
    'winequality-red.csv',
    delimiter=';',
    skip_header=1,
)
wines[:1]

array([[ 7.4  ,  0.7  ,  0.   ,  1.9  ,  0.076, 11.   , 34.   ,  0.998,
         3.51 ,  0.56 ,  9.4  ,  5.   ]])

In [7]:
# Single element: row index 2, column index 3 (both 0-based).
wines[2, 3]

2.3

In [8]:
# Column 3 of the first three rows.
wines[0:3, 3]

array([1.9, 2.6, 2.3])

In [9]:
# The whole of column 3 (every row).
wines[:, 3]

array([1.9, 2.6, 2.3, ..., 2.3, 2. , 3.6])

In [10]:
# The whole row at index 3 (every column).
wines[3]

array([11.2  ,  0.28 ,  0.56 ,  1.9  ,  0.075, 17.   , 60.   ,  0.998,
        3.16 ,  0.58 ,  9.8  ,  6.   ])

In [11]:
# The complete array: all rows and all columns.
wines[:, :]  # equivalent to wines[::]

array([[ 7.4  ,  0.7  ,  0.   , ...,  0.56 ,  9.4  ,  5.   ],
       [ 7.8  ,  0.88 ,  0.   , ...,  0.68 ,  9.8  ,  5.   ],
       [ 7.8  ,  0.76 ,  0.04 , ...,  0.65 ,  9.8  ,  5.   ],
       ...,
       [ 6.3  ,  0.51 ,  0.13 , ...,  0.75 , 11.   ,  6.   ],
       [ 5.9  ,  0.645,  0.12 , ...,  0.71 , 10.2  ,  5.   ],
       [ 6.   ,  0.31 ,  0.47 , ...,  0.66 , 11.   ,  6.   ]])

In [12]:
# Item assignment overwrites a single element; it works the same way for
# any row/column position.
wines[0, 0] = 40
wines[0]
# Put the original value back.
wines[0, 0] = 7.4
wines[0]

array([40.   ,  0.7  ,  0.   ,  1.9  ,  0.076, 11.   , 34.   ,  0.998,
        3.51 ,  0.56 ,  9.4  ,  5.   ])

array([ 7.4  ,  0.7  ,  0.   ,  1.9  ,  0.076, 11.   , 34.   ,  0.998,
        3.51 ,  0.56 ,  9.4  ,  5.   ])

In [13]:
# Take the row at index 3 as a 1-D array. Indexing is 0-based, so despite
# the variable name this is the fourth row of the data.
third_wine = wines[3, :]
print(third_wine)   # print() shows the str() form (values without commas)
third_wine          # a bare expression shows the repr() form: array([...])
third_wine.shape

[11.2    0.28   0.56   1.9    0.075 17.    60.     0.998  3.16   0.58
  9.8    6.   ]


array([11.2  ,  0.28 ,  0.56 ,  1.9  ,  0.075, 17.   , 60.   ,  0.998,
        3.16 ,  0.58 ,  9.8  ,  6.   ])

(12,)

In [14]:
# A 1-D array takes a single index: the element at position 1.
third_wine[1]

0.28

In [15]:
# Three uniform random samples from [0, 1) as a 1-D array.
np.random.random(3)

array([0.165, 0.863, 0.818])

In [16]:
# An ndarray holds elements of one single data type; the numeric suffix
# (e.g. 64) is the number of bits each element occupies in memory.
wines.dtype

dtype('float64')

In [17]:
# astype() returns a converted array; the results are not assigned here,
# so `wines` itself is left untouched.
wines.astype(int)    # values truncated to integers
wines.astype(float)  # float version of the (already float) array
wines.dtype          # the dtype of `wines` has not changed

array([[ 7,  0,  0, ...,  0,  9,  5],
       [ 7,  0,  0, ...,  0,  9,  5],
       [ 7,  0,  0, ...,  0,  9,  5],
       ...,
       [ 6,  0,  0, ...,  0, 11,  6],
       [ 5,  0,  0, ...,  0, 10,  5],
       [ 6,  0,  0, ...,  0, 11,  6]])

array([[ 7.4  ,  0.7  ,  0.   , ...,  0.56 ,  9.4  ,  5.   ],
       [ 7.8  ,  0.88 ,  0.   , ...,  0.68 ,  9.8  ,  5.   ],
       [ 7.8  ,  0.76 ,  0.04 , ...,  0.65 ,  9.8  ,  5.   ],
       ...,
       [ 6.3  ,  0.51 ,  0.13 , ...,  0.75 , 11.   ,  6.   ],
       [ 5.9  ,  0.645,  0.12 , ...,  0.71 , 10.2  ,  5.   ],
       [ 6.   ,  0.31 ,  0.47 , ...,  0.66 , 11.   ,  6.   ]])

dtype('float64')

In [18]:
# The `name` attribute of a dtype gives the type name as a plain string.
int_wines = wines.astype('int64')
int_wines.dtype.name

'int64'

In [19]:
# Arithmetic operators (+, -, *, /, **) work element-wise on arrays.
# The expression below builds a new array; column index 11 is not modified.
# An in-place update would be written as: wines[:, 11] += 10
wines[:, 11] + 10

array([15., 15., 15., ..., 16., 15., 16.])

In [20]:
# First row, to confirm the data was not changed by the previous cell.
wines[0, :]

array([ 7.4  ,  0.7  ,  0.   ,  1.9  ,  0.076, 11.   , 34.   ,  0.998,
        3.51 ,  0.56 ,  9.4  ,  5.   ])

In [21]:
# Arithmetic between two arrays of the same shape is element-wise;
# adding a column to itself gives the same result as wines[:, 11] * 2.
wines[:, 11] + wines[:, 11]

array([10., 10., 10., ..., 12., 10., 12.])

In [22]:
# Multiply two columns element-wise to relate their values:
# alcohol (column 10) times quality (column 11).
new_val = wines[:, 10] * wines[:, 11]
new_val
# Largest product, and the row indices at which it occurs.
peak = np.max(new_val)
print(peak, np.where(new_val == peak))

array([47., 49., 49., ..., 66., 51., 66.])

112.0 (array([ 588, 1269], dtype=int32),)


In [23]:
# Broadcasting: before doing element-wise math NumPy checks that the two
# shapes are compatible, comparing dimensions starting from the last one:
#   - equal lengths, or one of them equal to 1  -> compatible
#   - different lengths and neither equal to 1  -> error
# The comparison continues until the shorter shape runs out of dimensions.
# Example: wines * np.array([1, 2]) raises an error because the trailing
# dimensions (12 and 2) do not match.
wines.shape, np.array([1, 2]).shape

((1599, 12), (2,))

In [24]:
# Shapes (2, 2) and (2,) are compatible, so array_two is added to every
# row of array_one.
array_one = np.array([[1, 2], [3, 4]])
array_two = np.array([4, 5])
array_one + array_two

array([[5, 7],
       [7, 9]])

In [25]:
# Broadcasting with the wine data: one random offset per column.
rand_array = np.random.rand(12)
# Each of the 12 values is added to every entry of its matching column.
wines + rand_array

array([[ 7.616,  0.76 ,  0.17 , ...,  1.43 ,  9.999,  5.98 ],
       [ 8.016,  0.94 ,  0.17 , ...,  1.55 , 10.399,  5.98 ],
       [ 8.016,  0.82 ,  0.21 , ...,  1.52 , 10.399,  5.98 ],
       ...,
       [ 6.516,  0.57 ,  0.3  , ...,  1.62 , 11.599,  6.98 ],
       [ 6.116,  0.705,  0.29 , ...,  1.58 , 10.799,  5.98 ],
       [ 6.216,  0.37 ,  0.64 , ...,  1.53 , 11.599,  6.98 ]])

In [26]:
# ndarray methods (dir(wines) lists all of them).
wines[:, 11].sum()   # total of all quality ratings
wines.sum(axis=0)    # one sum per column
wines.sum(axis=1)    # one sum per row

9012.0

array([13303.1  ,   843.985,   433.29 ,  4059.55 ,   139.859, 25384.   ,
       74302.   ,  1593.798,  5294.47 ,  1052.38 , 16666.35 ,  9012.   ])

array([ 74.544, 123.055,  99.699, ..., 100.482, 105.215,  92.492])

In [27]:
# Other reductions take the same axis argument as sum(); axis=0 gives one
# result per column.
wines.min(axis=0)    # smallest value
wines.max(axis=0)    # largest value
wines.mean(axis=0)   # average
wines.std(axis=0)    # standard deviation

array([4.6  , 0.12 , 0.   , 0.9  , 0.012, 1.   , 6.   , 0.99 , 2.74 ,
       0.33 , 8.4  , 3.   ])

array([ 15.9  ,   1.58 ,   1.   ,  15.5  ,   0.611,  72.   , 289.   ,
         1.004,   4.01 ,   2.   ,  14.9  ,   8.   ])

array([ 8.32 ,  0.528,  0.271,  2.539,  0.087, 15.875, 46.468,  0.997,
        3.311,  0.658, 10.423,  5.636])

array([ 1.741,  0.179,  0.195,  1.409,  0.047, 10.457, 32.885,  0.002,
        0.154,  0.169,  1.065,  0.807])

In [49]:

# Comparisons are element-wise and produce a boolean array.
wines[:, 11] > 5   # True where the quality is above 5

# A boolean array used as an index keeps only the rows marked True;
# no wine is rated above 10, so the result has zero rows.
wines[wines[:, 11] > 10].shape

array([False, False, False, ...,  True, False,  True])

(0, 12)

In [77]:
# Subsetting with a boolean mask works like filtering on a condition.
high_quality = wines[:, 11] > 7     # True for wines rated above 7
wines[high_quality, :][:3, :]       # first three full rows of those wines

# Two conditions are combined with & (each comparison in its own parentheses).
high_quality_and_alcohol = (wines[:, 10] > 10) & (wines[:, 11] > 7)
wines[high_quality_and_alcohol, 10:]    # alcohol and quality of the matches

# Subsetting combined with assignment overwrites the selected values.
# The overwrite is demonstrated on a copy: assigning into `wines` itself
# would corrupt the dataset for every later cell and would make this cell
# display different values when it is run a second time.
wines_overwritten = wines.copy()
wines_overwritten[high_quality_and_alcohol, 10:] = 20
wines_overwritten[high_quality_and_alcohol][:3]

array([[ 7.9  ,  0.35 ,  0.46 ,  3.6  ,  0.078, 15.   , 37.   ,  0.997,
         3.35 ,  0.86 , 20.   , 20.   ],
       [10.3  ,  0.32 ,  0.45 ,  6.4  ,  0.073,  5.   , 13.   ,  0.998,
         3.23 ,  0.82 , 20.   , 20.   ],
       [ 5.6  ,  0.85 ,  0.05 ,  1.4  ,  0.045, 12.   , 88.   ,  0.992,
         3.56 ,  0.82 , 20.   , 20.   ]])

array([[20., 20.],
       [20., 20.],
       [20., 20.],
       [20., 20.],
       [20., 20.],
       [20., 20.],
       [20., 20.],
       [20., 20.],
       [20., 20.],
       [20., 20.],
       [20., 20.],
       [20., 20.],
       [20., 20.],
       [20., 20.],
       [20., 20.],
       [20., 20.]])

array([[ 7.9  ,  0.35 ,  0.46 ,  3.6  ,  0.078, 15.   , 37.   ,  0.997,
         3.35 ,  0.86 , 20.   , 20.   ],
       [10.3  ,  0.32 ,  0.45 ,  6.4  ,  0.073,  5.   , 13.   ,  0.998,
         3.23 ,  0.82 , 20.   , 20.   ],
       [ 5.6  ,  0.85 ,  0.05 ,  1.4  ,  0.045, 12.   , 88.   ,  0.992,
         3.56 ,  0.82 , 20.   , 20.   ]])

In [81]:
# Reshaping arrays
# transpose swaps the axes (rows <-> columns)
np.transpose(wines).shape

# ravel flattens the array to one dimension; show the first 25 values
wines.ravel()[:25]

(12, 1599)

array([ 7.4  ,  0.7  ,  0.   ,  1.9  ,  0.076, 11.   , 34.   ,  0.998,
        3.51 ,  0.56 ,  9.4  ,  5.   ,  7.8  ,  0.88 ,  0.   ,  2.6  ,
        0.098, 25.   , 67.   ,  0.997,  3.2  ,  0.68 ,  9.8  ,  5.   ,
        7.8  ])

In [82]:
# Lay out the 12 values of the first row as 2 rows x 6 columns.
wines[0].reshape((2, 6))

array([[ 7.4  ,  0.7  ,  0.   ,  1.9  ,  0.076, 11.   ],
       [34.   ,  0.998,  3.51 ,  0.56 ,  9.4  ,  5.   ]])

In [89]:
# Combining arrays: load the white-wine file first and check its shape.
white_wines = np.genfromtxt(
    'winequality-white.csv',
    delimiter=';',
    skip_header=1,
)
white_wines.shape

# vstack places the rows of the second array below those of the first ...
all_wines = np.vstack([wines, white_wines])
all_wines.shape

# ... which gives the same shape as concatenating along axis 0.
np.concatenate([wines, white_wines], axis=0).shape

(4898, 12)

(6497, 12)

(6497, 12)

In [90]:
# Small exercise to check the NumPy understanding: stack a (3, 4) and a
# (6, 4) block of zeros on top of each other and take the first column.
zeros_1 = np.zeros(shape=(3, 4))
zeros_2 = np.zeros(shape=(6, 4))
zeros_1_2 = np.concatenate([zeros_1, zeros_2], axis=0)
first_column = zeros_1_2[:, 0]
first_column

array([0., 0., 0., 0., 0., 0., 0., 0., 0.])