# Atividade 2 - Pré-Processamento
- José Maria dos Santos Leal
- Pedro Hércules de Sousa Dantas
- Marcos Paulo Fontes Leal

## Importando as bibliotecas

In [1]:
import os
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

## Carregando os dados

In [2]:
# Directory that holds the dataset. Set the DATASET_PATH environment variable
# to run the notebook on another machine; the fallback is the original
# author's local folder, so existing runs behave exactly as before.
DATASET_PATH = os.environ.get(
    'DATASET_PATH',
    '/Users/JOSE MARIA/Downloads/6° periodo/Sistemas inteligentes',
)

In [3]:
# File name of the CSV inside DATASET_PATH.
DATASET_NAME = 'Covid_piaui.csv'

In [4]:
def load_data(dataset_path, dataset_name):
    """Read a CSV file into a DataFrame.

    Args:
        dataset_path: Directory that contains the file.
        dataset_name: File name of the CSV inside ``dataset_path``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(os.path.join(dataset_path, dataset_name))

In [5]:
# Read DATASET_PATH/DATASET_NAME into the working DataFrame.
data = load_data(DATASET_PATH, DATASET_NAME)

In [6]:
# Remove the columns that are not used as features. A single drop() call
# builds one new frame instead of five intermediate copies, and if any label is
# missing it raises KeyError before `data` has been partially modified.
data = data.drop(
    columns=['is_last', 'state', 'place_type', 'Unnamed: 0', 'date'],
)

In [7]:
# Keep the target column (`deaths`) as its own Series; .copy() detaches it
# from `data` so later edits to the frame cannot affect it.
data_alvo = data['deaths'].copy()

In [8]:
# Remove the target column from the feature frame.
# NOTE(review): `death_rate` stays among the features, and in the displayed
# rows it equals deaths / confirmed (e.g. 1 / 175 ~= 0.0057), so the target can
# be reconstructed from the features. Confirm whether this leakage is intended.
data = data.drop(columns=['deaths'])

In [9]:
# Display the target Series (pandas abbreviates the middle rows).
data_alvo

0       1
1       3
2       9
3       5
4      13
       ..
219     1
220     9
221     0
222     3
223    56
Name: deaths, Length: 224, dtype: int64

In [10]:
# Print dtypes and non-null counts of the remaining feature columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224 entries, 0 to 223
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   city                            224 non-null    object 
 1   confirmed                       224 non-null    int64  
 2   order_for_place                 224 non-null    int64  
 3   estimated_population_2019       224 non-null    float64
 4   estimated_population            224 non-null    float64
 5   city_ibge_code                  224 non-null    float64
 6   confirmed_per_100k_inhabitants  224 non-null    float64
 7   death_rate                      224 non-null    float64
dtypes: float64(5), int64(2), object(1)
memory usage: 14.1+ KB


In [11]:
# City names as a NumPy array, one entry per row of `data`, in row order.
cidades = data['city'].values

In [12]:
# Geocode every city with the Nominatim search API, producing one latitude and
# one longitude per entry of `cidades`, in the same order.
#
# Differences from a plain string-built GET:
#   * query values go through `params=` so spaces, accents and reserved
#     characters in city names are URL-encoded correctly;
#   * an identifying User-Agent and a pause between requests are sent, as the
#     public Nominatim usage policy asks for;
#   * a timeout and raise_for_status() make network/HTTP failures surface
#     immediately instead of hanging or failing later while parsing JSON;
#   * an empty result list raises an error that names the offending city
#     rather than an anonymous IndexError.
NOMINATIM_URL = 'https://nominatim.openstreetmap.org/search'
NOMINATIM_HEADERS = {'User-Agent': 'covid-piaui-preprocessing-notebook'}
NOMINATIM_TIMEOUT_S = 30
NOMINATIM_DELAY_S = 1.0  # seconds to wait between consecutive requests

latitudes = []
longitudes = []
with requests.Session() as session:
    session.headers.update(NOMINATIM_HEADERS)
    for cidade in cidades:
        response = session.get(
            NOMINATIM_URL,
            params={'city': cidade, 'state': 'Piaui', 'format': 'json'},
            timeout=NOMINATIM_TIMEOUT_S,
        )
        response.raise_for_status()

        results = response.json()
        if not results:
            raise ValueError(f'Nominatim returned no match for city {cidade!r}')

        # The first entry is the best match returned by the service.
        response_data = results[0]
        latitudes.append(float(response_data['lat']))
        longitudes.append(float(response_data['lon']))

        time.sleep(NOMINATIM_DELAY_S)

In [13]:
# Attach the geocoded latitudes as a new column; the list must contain exactly
# one value per row of `data`.
data['latitude'] = latitudes

In [14]:
# Attach the geocoded longitudes as a new column; the list must contain exactly
# one value per row of `data`.
data['longitude'] = longitudes

In [15]:
# Display the frame to check the new latitude/longitude columns.
data

Unnamed: 0,city,confirmed,order_for_place,estimated_population_2019,estimated_population,city_ibge_code,confirmed_per_100k_inhabitants,death_rate,latitude,longitude
0,Acauã,175,329,7084.0,7102.0,2200053.0,2464.09462,0.0057,-8.219542,-41.083059
1,Agricolândia,332,321,5139.0,5131.0,2200103.0,6470.47359,0.0090,-5.796784,-42.660513
2,Alagoinha do Piauí,256,335,7651.0,7665.0,2200251.0,3339.85649,0.0352,-7.009095,-40.939675
3,Alegrete do Piauí,460,317,4915.0,4918.0,2200277.0,9353.39569,0.0109,-7.244014,-40.860230
4,Alto Longá,541,351,14304.0,14339.0,2200301.0,3772.92698,0.0240,-5.254905,-42.207239
...,...,...,...,...,...,...,...,...,...,...
219,Vila Nova do Piauí,123,360,2971.0,2952.0,2211605.0,4166.66667,0.0081,-7.142922,-40.937856
220,Várzea Branca,148,337,4947.0,4938.0,2211357.0,2997.16484,0.0608,-9.237188,-42.964378
221,Várzea Grande,334,368,4391.0,4386.0,2211407.0,7615.13908,0.0000,-6.546083,-42.248040
222,Wall Ferraz,329,313,4462.0,4471.0,2211704.0,7358.53277,0.0091,-7.233920,-41.911407


In [16]:
# `city` is the only non-numeric column left; drop it now that the
# latitude/longitude columns have been added.
data = data.drop(columns=['city'])

In [17]:
# Display the frame after removing `city`.
data

Unnamed: 0,confirmed,order_for_place,estimated_population_2019,estimated_population,city_ibge_code,confirmed_per_100k_inhabitants,death_rate,latitude,longitude
0,175,329,7084.0,7102.0,2200053.0,2464.09462,0.0057,-8.219542,-41.083059
1,332,321,5139.0,5131.0,2200103.0,6470.47359,0.0090,-5.796784,-42.660513
2,256,335,7651.0,7665.0,2200251.0,3339.85649,0.0352,-7.009095,-40.939675
3,460,317,4915.0,4918.0,2200277.0,9353.39569,0.0109,-7.244014,-40.860230
4,541,351,14304.0,14339.0,2200301.0,3772.92698,0.0240,-5.254905,-42.207239
...,...,...,...,...,...,...,...,...,...
219,123,360,2971.0,2952.0,2211605.0,4166.66667,0.0081,-7.142922,-40.937856
220,148,337,4947.0,4938.0,2211357.0,2997.16484,0.0608,-9.237188,-42.964378
221,334,368,4391.0,4386.0,2211407.0,7615.13908,0.0000,-6.546083,-42.248040
222,329,313,4462.0,4471.0,2211704.0,7358.53277,0.0091,-7.233920,-41.911407


## Dados de treino e teste

In [18]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data_alvo,
    test_size=0.2,
    random_state=1,
    shuffle=True,
)

## Feature Scaling

In [19]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split, so test rows do not influence the
# fitted statistics.
# X_train / X_test are rebound to the transformed arrays, so re-running this
# cell without re-running the split would scale already-scaled data.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [20]:
# Print the scaled training array (NumPy abbreviates large arrays).
print(X_train)

[[-0.16526711  0.75092493 -0.15927876 ... -0.39194614 -1.37497595
  -0.11785493]
 [-0.1220363  -1.57274837 -0.16359189 ... -0.60193695 -0.48330021
  -1.50238092]
 [-0.12187852  0.89392021 -0.16209235 ... -1.11620016  0.08613273
   0.15298155]
 ...
 [-0.13560509  0.32193909 -0.12324324 ...  0.61943818 -0.14702471
   0.69956309]
 [-0.14238949 -0.10704675 -0.1837353  ... -0.48622773  0.78774665
   0.43730021]
 [-0.10483864  0.03594853 -0.06632227 ... -0.14767111  1.24114717
   0.09664416]]


In [21]:
# Print the scaled test array.
print(X_test)

[[-1.31976226e-01  6.07929654e-01 -8.59091445e-02 -8.53399565e-02
  -5.17380204e-01 -7.70565621e-01 -7.99071181e-01 -1.63732526e-01
   1.16752669e+00]
 [-1.65109329e-01 -1.60849719e+00 -1.70069321e-01 -1.70421861e-01
  -1.18335924e+00 -1.18329321e+00  1.02656322e+00 -8.39732772e-01
  -3.82720671e-01]
 [-1.34500653e-01  4.64934374e-01 -1.53388786e-01 -1.53171366e-01
  -8.10098426e-01 -7.36253915e-02 -9.49064618e-01 -1.28948471e+00
  -1.80769798e+00]
 [-1.43178370e-01 -4.64534946e-01 -1.61149331e-01 -1.61195927e-01
  -1.87997072e-01 -2.72794108e-01 -4.69085620e-01 -2.15867639e-01
  -9.94882152e-01]
 [-1.33869546e-01 -2.85790846e-01 -1.61288465e-01 -1.61011100e-01
  -8.02585125e-01  1.36640022e-01 -9.49064618e-01  6.02724132e-01
  -3.75446920e-01]
 [-1.52960524e-01 -2.68096179e+00 -1.83472493e-01 -1.83405939e-01
  -1.70265680e-01 -2.46518693e-01 -4.90513254e-01 -9.96404529e-01
  -1.39380933e-03]
 [-1.59271592e-01 -1.17951135e+00 -1.57593703e-01 -1.57530197e-01
  -9.24000075e-01 -1.0236614

## Salvando os conjuntos em pickle

In [22]:
# Serialise the four splits into the current working directory, one pickle
# file per split. Each file is opened in a `with` block so the handle is
# flushed and closed deterministically instead of being left open until
# garbage collection.
# NOTE(review): the fitted scaler `sc` is not saved here; confirm whether
# downstream code needs it to transform new data.
for split_name, split in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    with open(f'{split_name}.pickle', 'wb') as pickle_file:
        pickle.dump(split, pickle_file)