# Aula 01 - Introdução ao Scikit-Learn
#### Material baseado no livro "Practical Machine Learning with Python", de Sarkar, D. et al. (2018)

---

# Numpy
## Arrays

In [2]:
import numpy as np
arr = np.array([1,3,4,5,6])
arr

array([1, 3, 4, 5, 6])

In [3]:
arr.shape

(5,)

In [4]:
arr.dtype

dtype('int32')

In [5]:
# An ndarray holds a single dtype, so mixing ints and strings yields a string-typed array.
mixed_items = [1, 'st', 'er', 3]
arr = np.array(mixed_items)
arr.dtype

dtype('<U11')

In [6]:
# Deliberately left commented out: `arr` currently has a string dtype ('<U11'),
# so this numeric reduction is presumably expected to fail — TODO confirm.
#np.sum(arr)

### Creating arrays

In [7]:
arr = np.array([[1,2,3],[2,4,6],[8,8,8]])
arr.shape

(3, 3)

In [8]:
arr

array([[1, 2, 3],
       [2, 4, 6],
       [8, 8, 8]])

In [9]:
arr = np.zeros((2,4))
arr

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [10]:
arr = np.ones((2,4))
arr

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.]])

In [11]:
arr = np.identity(3)
arr

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [12]:
arr = np.random.randn(3,4)
arr

array([[-1.17294973, -0.7176258 ,  1.43774354,  1.14874113],
       [ 0.69056559, -0.41634612,  0.43931288,  0.60970965],
       [ 0.82704888, -0.95830348,  0.87110525, -0.29793934]])

### Accessing array elements
#### Simple indexing

In [13]:
arr[1]

array([ 0.69056559, -0.41634612,  0.43931288,  0.60970965])

In [14]:
arr = np.arange(12).reshape(2,2,3)
arr

array([[[ 0,  1,  2],
        [ 3,  4,  5]],

       [[ 6,  7,  8],
        [ 9, 10, 11]]])

In [15]:
arr[0]

array([[0, 1, 2],
       [3, 4, 5]])

In [16]:
arr = np.arange(10)
arr[5:]


array([5, 6, 7, 8, 9])

In [17]:
arr[5:8]

array([5, 6, 7])

In [18]:
arr[:-5]

array([0, 1, 2, 3, 4])

In [19]:
arr = np.arange(12).reshape(2,2,3)
arr

array([[[ 0,  1,  2],
        [ 3,  4,  5]],

       [[ 6,  7,  8],
        [ 9, 10, 11]]])

In [20]:
arr[1:2]

array([[[ 6,  7,  8],
        [ 9, 10, 11]]])

In [21]:
arr = np.arange(27).reshape(3,3,3)
arr

array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8]],

       [[ 9, 10, 11],
        [12, 13, 14],
        [15, 16, 17]],

       [[18, 19, 20],
        [21, 22, 23],
        [24, 25, 26]]])

In [22]:
arr[:,:,2]

array([[ 2,  5,  8],
       [11, 14, 17],
       [20, 23, 26]])

In [23]:
arr[...,2]

array([[ 2,  5,  8],
       [11, 14, 17],
       [20, 23, 26]])

#### Advanced Indexing

In [24]:
arr = np.arange(9).reshape(3,3)
arr

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [25]:
arr[[0,1,2],[1,0,0]]

array([1, 3, 6])

##### Boolean Indexing

In [26]:
# Five city labels plus one row of three standard-normal samples per label.
city_names = ["delhi","banglaore","mumbai","chennai","bhopal"]
cities = np.array(city_names)
city_data = np.random.randn(len(city_names), 3)
city_data

array([[ 0.95374776,  1.05253094,  0.63813633],
       [-0.56475547,  0.40506241,  0.60420777],
       [ 0.33532016, -0.11406904, -0.96402185],
       [-1.37376465,  0.69337669,  0.5765552 ],
       [ 0.44871748, -0.90859469,  0.42701475]])

In [27]:
city_data[cities =="delhi"]

array([[0.95374776, 1.05253094, 0.63813633]])

In [28]:
city_data[city_data >0]

array([0.95374776, 1.05253094, 0.63813633, 0.40506241, 0.60420777,
       0.33532016, 0.69337669, 0.5765552 , 0.44871748, 0.42701475])

In [29]:
city_data[city_data >0] = 0
city_data


array([[ 0.        ,  0.        ,  0.        ],
       [-0.56475547,  0.        ,  0.        ],
       [ 0.        , -0.11406904, -0.96402185],
       [-1.37376465,  0.        ,  0.        ],
       [ 0.        , -0.90859469,  0.        ]])

#### Operations on arrays

In [30]:
arr = np.arange(15).reshape(3,5)
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [31]:
arr + 5

array([[ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19]])

In [32]:
arr * 2

array([[ 0,  2,  4,  6,  8],
       [10, 12, 14, 16, 18],
       [20, 22, 24, 26, 28]])

In [33]:
# Broadcasting: the (5, 1) column is stretched across the three columns of the (5, 3) matrix.
arr1 = np.arange(15).reshape((5, 3))
arr2 = np.arange(5)[:, np.newaxis]
arr1 + arr2


array([[ 0,  1,  2],
       [ 4,  5,  6],
       [ 8,  9, 10],
       [12, 13, 14],
       [16, 17, 18]])

In [34]:
arr1

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [35]:
arr2

array([[0],
       [1],
       [2],
       [3],
       [4]])

In [36]:
arr1 = np.random.randn(5,3)
arr1

array([[-0.26918073,  0.85906466,  1.75918833],
       [ 0.31831192,  1.31468569,  0.91317115],
       [-0.92564747, -0.05541265, -1.25786654],
       [-1.34962361, -0.48109261, -1.02352383],
       [ 0.90038272,  1.06036372, -1.64391909]])

In [37]:
np.modf(arr1)

(array([[-0.26918073,  0.85906466,  0.75918833],
        [ 0.31831192,  0.31468569,  0.91317115],
        [-0.92564747, -0.05541265, -0.25786654],
        [-0.34962361, -0.48109261, -0.02352383],
        [ 0.90038272,  0.06036372, -0.64391909]]),
 array([[-0.,  0.,  1.],
        [ 0.,  1.,  0.],
        [-0., -0., -1.],
        [-1., -0., -1.],
        [ 0.,  1., -1.]]))

#### Linear algebra using numpy

In [38]:
# Matrix product of two 3x3 integer matrices.
A = np.array([[1, 2, 3],
              [4, 5, 6],
              [7, 8, 9]])
B = np.array([[9, 8, 7],
              [6, 5, 4],
              [1, 2, 3]])
np.dot(A, B)

array([[ 24,  24,  24],
       [ 72,  69,  66],
       [120, 114, 108]])

In [39]:
A = np.arange(15).reshape(3,5)
A.T

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

# Pandas
## Data frames

In [40]:
import pandas as pd
# One dict per row; the dict keys become the DataFrame columns.
d = [
    {'city': 'Delhi', "data": 1000},
    {'city': 'Banglaore', "data": 2000},
    {'city': 'Mumbai', "data": 1000},
]
pd.DataFrame(d)

Unnamed: 0,city,data
0,Delhi,1000
1,Banglaore,2000
2,Mumbai,1000


In [41]:
df = pd.DataFrame(d)

### Reading in data

In [42]:
# Load the world-cities CSV via a relative path.
# NOTE: the recorded run raised FileNotFoundError, so the file has to be present
# in the working directory before this cell (and everything below it) can run.
city_data = pd.read_csv('simplemaps-worldcities-basic.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'simplemaps-worldcities-basic.csv'

In [None]:
city_data.head(n=10)

In [None]:
city_data.tail()

In [None]:
series_es = city_data.lat

In [None]:
type(series_es)

In [None]:
series_es[1:10:2]

In [None]:
series_es[:7]

In [None]:
series_es[:-7315]

In [None]:
city_data[:7]

In [None]:
city_data.iloc[:5,:4]

In [None]:
# Rows: cities whose 'pop' exceeds 10 million; columns: those whose name starts with 'l'.
# A single .loc call selects rows and columns in one step instead of chaining
# df[rows][cols], which indexes an intermediate frame.
city_data.loc[
    city_data['pop'] > 10000000,
    city_data.columns.str.startswith('l')
]

In [None]:
# Filter to cities above 10 million and rename 'pop' in the same chain.
# rename() returns a new frame, so no in-place change is made on a filtered
# slice of city_data (which would trigger SettingWithCopyWarning).
city_greater_10mil = (
    city_data[city_data['pop'] > 10000000]
    .rename(columns={'pop': 'population'})
)
# where() keeps the frame's shape: rows failing the condition are filled with NaN.
city_greater_10mil.where(city_greater_10mil.population > 15000000)

In [None]:
# 8x3 frame of standard-normal samples with columns A, B and C.
df = pd.DataFrame(
    data=np.random.randn(8, 3),
    columns=list('ABC'),
)

### Operations on dataframes

In [None]:
nparray = df.values
type(nparray)

In [None]:
from numpy import nan
df.iloc[4,2] = nan

In [None]:
df

In [None]:
df.fillna(0)

In [None]:
columns_numeric = ['lat','lng','pop']

In [None]:
city_data[columns_numeric].mean()

In [None]:
city_data[columns_numeric].sum()

In [None]:
city_data[columns_numeric].count()

In [None]:
city_data[columns_numeric].median()

In [None]:
city_data[columns_numeric].quantile(0.8)

In [None]:
city_data[columns_numeric].sum(axis = 1).head()

In [None]:
city_data[columns_numeric].describe()

In [None]:
city_data1 = city_data.sample(3)

### Concatenating data frames

In [None]:
city_data2 = city_data.sample(3)
city_data_combine = pd.concat([city_data1,city_data2])
city_data_combine

In [None]:
country_data = city_data[['iso3','country']].drop_duplicates()

In [None]:
country_data.shape

In [None]:
country_data.head()

In [None]:
# Remove the 'country' column by rebinding city_data to a new frame instead of
# deleting in place; errors='ignore' keeps the cell safe to re-run once the
# column is already gone.
city_data = city_data.drop(columns='country', errors='ignore')

In [None]:
# Inner join of the city rows with the country lookup (no explicit key, so pandas
# joins on the columns the two frames share); show only the first rows.
city_data.merge(country_data, how='inner').head()

# Scikit-learn

In [None]:
from sklearn import datasets
diabetes = datasets.load_diabetes()
# Keep features and targets aligned: take the same first 10 samples of both
# (previously X had 10 rows while y kept every target).
X = diabetes.data[:10]
y = diabetes.target[:10]

In [None]:
X[:5]

In [None]:
y[:10]

In [None]:
# Ten labels: four named ones followed by s1..s6
# (presumably the diabetes feature columns — TODO confirm).
feature_names = ['age', 'sex', 'bmi', 'bp'] + ['s%d' % i for i in range(1, 7)]

## Scikit example regression

In [None]:
# Fit a Lasso regressor on the diabetes data using a fixed, ordered train/test split.
from sklearn import datasets, linear_model
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

diabetes = datasets.load_diabetes()

# Single split point: the first n_train samples train the model, the rest are held out.
n_train = 310
X_train, X_test = diabetes.data[:n_train], diabetes.data[n_train:]
y_train, y_test = diabetes.target[:n_train], diabetes.target[n_train:]

learner = Lasso(random_state=0)
# Not used in this cell; presumably the alpha grid for a later GridSearchCV run — TODO confirm.
alphas = np.logspace(-4, -0.5, 30)

learner.fit(X_train, y_train)

In [None]:
y_pred = learner.predict(X_test)
y_pred

In [None]:
# Mean absolute error between the predictions and the held-out targets.
np.abs(y_pred - y_test).mean()