# 01intro_numpy

In [175]:
"""NumPy is a fundamental package for scientific computing with Python.
It implements N-dimensional array objects with some matrix and linear algebra operation.
Numpy provides the foundation for Python scientific data analisys with scipy."""

try:
  from google.colab import files
  g_colab = True
except:
  g_colab = False

# Importing numpy
import numpy as np        # numpy is written in C
import scipy.stats  # some module for utilities

# for high performace in data mining we need high performance data structures.
# Python has an extension for numerical computing and linear algebra which
# rely on a C implementation
# declaring 1-dimension array 
d1 = np.array([1, 2, 3, 4, 5, 6])

print(d1)
# print the predefinite propeties *nshape* e *ndim*
print(" d1 shape: %s \n d1 dimensions: %d\n" % (d1.shape, d1.ndim))

[1 2 3 4 5 6]
 d1 shape: (6,) 
 d1 dimensions: 1



In [176]:
# declaring 2-dimensions array
d2 = np.array([
    [1, 2, 3],
    [4, 5, 6]
])

print(d2)
print(" d2 shape: %s \n d2 dimensions: %d\n" % (d2.shape, d2.ndim))

[[1 2 3]
 [4 5 6]]
 d2 shape: (2, 3) 
 d2 dimensions: 2



In [177]:
# declaring 3-dimensions array
d3 = np.array([
    [[1, 2], [3, 4], [5, 6]],
    [[7, 8], [9, 10], [11, 12]]
])
print(d3)
print(" d3 shape: %s \n d3 dimensions: %d\n" % (d3.shape, d3.ndim))

[[[ 1  2]
  [ 3  4]
  [ 5  6]]

 [[ 7  8]
  [ 9 10]
  [11 12]]]
 d3 shape: (2, 3, 2) 
 d3 dimensions: 3



In [178]:
# iterating over numpy array
print("d1 elements")
for e in d1:
    print(e)

d1 elements
1
2
3
4
5
6


In [179]:
for row in d2:
    print(row)

[1 2 3]
[4 5 6]


In [180]:
print("\nd3 first axis")

for plane in d3:
    print(plane)


d3 first axis
[[1 2]
 [3 4]
 [5 6]]
[[ 7  8]
 [ 9 10]
 [11 12]]


In [181]:
print("\nd2 elements")
for row in d2:
    for e in row:
        print(e)


d2 elements
1
2
3
4
5
6


In [182]:
# functional programming: apply like

vfunc = np.vectorize(lambda x: x**2)
d1s = vfunc(d1)
print(d1)
d1s


[1 2 3 4 5 6]


array([ 1,  4,  9, 16, 25, 36])

In [183]:
# matrix operations componentwise
d1a = np.array([10, 11, 12, 13, 14, 15])
d2a = np.array([[10, 11], [12, 13], [14, 15]])
d1b = np.array([10, 10, 20, 20, 10, 15])

In [184]:
diff = d1 - d1a  
print("\nd1 - d1a:\n%s" % diff)


d1 - d1a:
[-9 -9 -9 -9 -9 -9]


In [185]:
add = d1 + d1a
print("\nd1 + d1a:\n%s" % add)


d1 + d1a:
[11 13 15 17 19 21]


In [186]:
mult = d1 * d1a
print("\nd1 * d1a:\n%s" % mult)


d1 * d1a:
[10 22 36 52 70 90]


In [187]:
sq = d1**2
print("\nd1^2:\n%s" % sq)


d1^2:
[ 1  4  9 16 25 36]


In [188]:
# some numpy algorithm
print("\nd1 min: %.2f" % np.min(d1))


d1 min: 1.00


In [189]:
print("d1 max: %.2f" % np.max(d1))

d1 max: 6.00


In [190]:
print("d1 mean: %.2f" % np.mean(d1))

d1 mean: 3.50


In [191]:
print("d1 median: %.2f" % np.median(d1))

d1 median: 3.50


In [192]:
print("d1 mode: %s" % str(scipy.stats.mode(d1)))

d1 mode: ModeResult(mode=array([1]), count=array([1]))


In [193]:
d1b = np.array([10, 10, 20, 20, 10, 15])
print("d1b mode: %s" % str(scipy.stats.mode(d1b)))  # bug: it is not multi-modal

d1b mode: ModeResult(mode=array([10]), count=array([3]))


# 02intro_pandas

In [194]:
"""
Pandas is an high level python package to work with CSV (Comma Separated Values) datasets. It is heavily based on Numpy.
We use it to load dataset from CSV files, clean up data, iterate thru them, convert them to numpy array.
"""

# importing pandas, a csv file format reader and lot more
import pandas as pd

file_name = "sample.csv"

if g_colab:
  uploaded = files.upload()
  import io
  dataset = pd.read_csv(io.BytesIO(uploaded[file_name]))
else:
  # loading csv
  dataset = pd.read_csv(file_name)

dataset

Unnamed: 0,name,surname,age,height,weight
0,mario,rossi,1,65.0,12.0
1,giorgio,bianchi,2,69.0,15.0
2,antonio,verdi,1,60.0,11.0
3,maria,bianchi,3,100.0,18.0
4,giulia,gialli,1,55.0,11.0
5,francesco,rossi,2,63.0,14.0
6,piero,"""rossi""",3,,
7,gaia,verdi,3,95.0,19.0
8,chiara,esposito,2,67.0,17.0


In [195]:
# printout column headers
dataset.columns

Index(['name', 'surname', 'age', 'height', 'weight'], dtype='object')

In [196]:
print("dataset has %d rows" % len(dataset))

dataset has 9 rows


In [197]:
# selecting columns

# get anonymized numerical data only
dataset_anon = dataset.drop(columns=["name", "surname"])
# note that the file is not saved, the modification are in RAM

dataset_anon

Unnamed: 0,age,height,weight
0,1,65.0,12.0
1,2,69.0,15.0
2,1,60.0,11.0
3,3,100.0,18.0
4,1,55.0,11.0
5,2,63.0,14.0
6,3,,
7,3,95.0,19.0
8,2,67.0,17.0


In [198]:
# printout dataset summary
print(dataset.describe())

            age      height     weight
count  9.000000    8.000000   8.000000
mean   2.000000   71.750000  14.625000
std    0.866025   16.516226   3.159453
min    1.000000   55.000000  11.000000
25%    1.000000   62.250000  11.750000
50%    2.000000   66.000000  14.500000
75%    3.000000   75.500000  17.250000
max    3.000000  100.000000  19.000000


In [199]:
print("\nWe have %d sample data (row number)" % len(dataset))


We have 9 sample data (row number)


In [200]:
# clean all row with Nan values, it will cause issues later. See pandas doc.
dataset_c1 = dataset.dropna(0)

print("We have %d sample after cleanup" % len(dataset_c1))
dataset_c1

We have 8 sample after cleanup


  dataset_c1 = dataset.dropna(0)


Unnamed: 0,name,surname,age,height,weight
0,mario,rossi,1,65.0,12.0
1,giorgio,bianchi,2,69.0,15.0
2,antonio,verdi,1,60.0,11.0
3,maria,bianchi,3,100.0,18.0
4,giulia,gialli,1,55.0,11.0
5,francesco,rossi,2,63.0,14.0
7,gaia,verdi,3,95.0,19.0
8,chiara,esposito,2,67.0,17.0


In [201]:
# fill up Nan with new values
dataset_c2 = dataset.fillna(1)
dataset_c2

Unnamed: 0,name,surname,age,height,weight
0,mario,rossi,1,65.0,12.0
1,giorgio,bianchi,2,69.0,15.0
2,antonio,verdi,1,60.0,11.0
3,maria,bianchi,3,100.0,18.0
4,giulia,gialli,1,55.0,11.0
5,francesco,rossi,2,63.0,14.0
6,piero,"""rossi""",3,1.0,1.0
7,gaia,verdi,3,95.0,19.0
8,chiara,esposito,2,67.0,17.0


In [202]:
# fill up Nan with new values
dataset_c2 = dataset.fillna({'height': 45, 'weight': 12})
# better estimate for nan?


In [203]:
# iterate over pandas dataset rows
for row in dataset.itertuples():
    print("%d, %d, %d" %(row.age, row.height, row.weight))  # you can access every colum in a row through his column name

1, 65, 12
2, 69, 15
1, 60, 11
3, 100, 18
1, 55, 11
2, 63, 14


ValueError: cannot convert float NaN to integer

In [None]:
# the same of above, but with apply syntax
def print_row(row):
    print("-- %d, %d, %d" %(row.age, row.height, row.weight))

# assign returned value to variable to avoid jupyter print them out
a = dataset.apply(print_row, axis=1)  # with axis = 1 we get columns by name
                                      # try axis = 0

-- 1, 65, 12
-- 2, 69, 15
-- 1, 60, 11
-- 3, 100, 18
-- 1, 55, 11
-- 2, 63, 14
-- 3, 45, 12
-- 3, 95, 19
-- 2, 67, 17


In [None]:
# we got a numpy array for height column
height = np.array(dataset['height'].values)
# now you can use numpy methods on the height data
print(height)

for e in height:
    print(e)


[ 65.  69.  60. 100.  55.  63.  45.  95.  67.]
65.0
69.0
60.0
100.0
55.0
63.0
45.0
95.0
67.0


In [None]:
# filtering out row in a dataset
# selecting people with age > 2

gt2 = dataset[dataset['age'] > 2]
gt2

# the statement "dataset['age'] > 2" returns an Pandas Series object which is a subclass 
# of numpy ndarray with some other features
# let's try them
# a = dataset['age'] < 3
# a

Unnamed: 0,name,surname,age,height,weight
3,maria,bianchi,3,100.0,18.0
6,piero,"""rossi""",3,95.0,
7,gaia,verdi,3,95.0,19.0


In [None]:
# filter conditions can be orred or ended
# remember to wrap condition in parentheses
filtered = dataset[(dataset['age'] < 3) & (dataset['weight'] > 11)]
filtered

Unnamed: 0,name,surname,age,height,weight
0,mario,rossi,1,65.0,12.0
1,giorgio,bianchi,2,69.0,15.0
5,francesco,rossi,2,63.0,14.0
8,chiara,esposito,2,67.0,17.0


In [227]:
# As excercise replace the nan in the dataset with the moda of the attribute
modes = dataset.mode(0)
print(dataset.fillna({'height': modes['height'][0], 'weight': modes['weight'][0]}))


        name   surname  age  height  weight
0      mario     rossi    1    65.0    12.0
1    giorgio   bianchi    2    69.0    15.0
2    antonio     verdi    1    60.0    11.0
3      maria   bianchi    3   100.0    18.0
4     giulia    gialli    1    55.0    11.0
5  francesco     rossi    2    63.0    14.0
6      piero   "rossi"    3    55.0    11.0
7       gaia     verdi    3    95.0    19.0
8     chiara  esposito    2    67.0    17.0
