# S02T01: Pré-processamento dos dados

<img src="fluxo.png">

## Importando as bibliotecas

In [2]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

## Carregando os dados

In [3]:
# Directory that holds the dataset. Set the HOUSING_DATASET_PATH environment
# variable to point at your own copy; when it is unset, the original author's
# local directory is used, so existing runs behave exactly as before.
DATASET_PATH = os.environ.get(
    'HOUSING_DATASET_PATH',
    '/Users/JOSE MARIA/Downloads/6° periodo/Sistemas inteligentes',
)
# File name of the housing CSV inside DATASET_PATH.
DATASET_NAME = 'housing.csv'

In [4]:
def load_data(dataset_path=DATASET_PATH, dataset_name=DATASET_NAME):
    """Read a CSV dataset into a pandas DataFrame.

    Args:
        dataset_path: Directory that contains the file.
        dataset_name: File name of the CSV inside ``dataset_path``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for the joined path.
    """
    return pd.read_csv(os.path.join(dataset_path, dataset_name))

In [5]:
# Load the housing CSV and show the resulting frame as the cell output.
housing = load_data(dataset_path=DATASET_PATH, dataset_name=DATASET_NAME)
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [6]:
# Keep an independent copy of the target column, then remove it from the features.
# NOTE(review): this cell rebinds `housing`, so re-running it without reloading
# the data raises KeyError because the column has already been dropped.
housing_target = housing.loc[:, "median_house_value"].copy()
housing = housing.drop(columns=["median_house_value"])
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY
...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,INLAND


## Tratando os dados faltantes

In [7]:
# Show per-column non-null counts and dtypes to spot columns with missing values.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   ocean_proximity     20640 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.4+ MB


In [8]:
# First option for missing data: drop the rows whose total_bedrooms is NaN.
# NOTE(review): housing_t is reassigned later from the median-imputed array, so
# this result is only inspected by the .info() call that follows.
housing_t = housing.dropna(subset=["total_bedrooms"]) 

In [9]:
# Inspect the frame after dropping the rows with missing total_bedrooms.
housing_t.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20433 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64
 1   latitude            20433 non-null  float64
 2   housing_median_age  20433 non-null  float64
 3   total_rooms         20433 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20433 non-null  float64
 6   households          20433 non-null  float64
 7   median_income       20433 non-null  float64
 8   ocean_proximity     20433 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.6+ MB


In [10]:
# Features without the ocean_proximity column (the input for median imputation).
housing_float = housing.drop(columns=['ocean_proximity'])
# Median of total_bedrooms, shown as the cell output.
housing.loc[:, 'total_bedrooms'].median()

435.0

In [11]:
# Replace NaN entries with the median of their column.
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split performed further down the notebook.
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
housing_transform = imputer.fit(housing_float).transform(housing_float)
type(housing_transform)

numpy.ndarray

In [12]:
# Wrap the imputed array in a DataFrame that reuses the original row and column labels.
housing_t = pd.DataFrame(
    data=housing_transform,
    index=housing.index,
    columns=housing_float.columns,
)
housing_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


## Tratando os dados categóricos

In [13]:
# One-hot encode ocean_proximity into a dense NumPy array.
housing_category = housing[['ocean_proximity']]
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn releases only accept the original `sparse` keyword.
    cat_encoder = OneHotEncoder(sparse=False)
housing_cat_1hot = cat_encoder.fit_transform(housing_category)
# The order of these categories is the order of the encoded columns.
print(cat_encoder.categories_)
type(housing_cat_1hot)

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
      dtype=object)]


numpy.ndarray

In [14]:
# Build one indicator column per encoded category. Names are derived from the
# array width (ocean_proximity_1 ... ocean_proximity_N) instead of being
# hard-coded, and the frame reuses housing's index so the column-wise concat
# that follows aligns rows by label.
ocean_proximity = pd.DataFrame(
    housing_cat_1hot,
    columns=[
        f'ocean_proximity_{position}'
        for position in range(1, housing_cat_1hot.shape[1] + 1)
    ],
    index=housing.index,
)
ocean_proximity

Unnamed: 0,ocean_proximity_1,ocean_proximity_2,ocean_proximity_3,ocean_proximity_4,ocean_proximity_5
0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...
20635,0.0,1.0,0.0,0.0,0.0
20636,0.0,1.0,0.0,0.0,0.0
20637,0.0,1.0,0.0,0.0,0.0
20638,0.0,1.0,0.0,0.0,0.0


In [15]:
# Place the imputed numeric features and the one-hot columns side by side.
housing_encoded = pd.concat(objs=[housing_t, ocean_proximity], axis='columns')
housing_encoded

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_1,ocean_proximity_2,ocean_proximity_3,ocean_proximity_4,ocean_proximity_5
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,0.0,0.0,0.0,1.0,0.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,0.0,0.0,0.0,1.0,0.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,0.0,0.0,0.0,1.0,0.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,0.0,0.0,0.0,1.0,0.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,0.0,1.0,0.0,0.0,0.0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,0.0,1.0,0.0,0.0,0.0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,0.0,1.0,0.0,0.0,0.0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,0.0,1.0,0.0,0.0,0.0


### Testando outro tipo de encoder

In [16]:
# Alternative encoding: map each ocean_proximity category to an integer code.
# LabelEncoder is imported in the notebook's import cell.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels; OrdinalEncoder is the feature-oriented equivalent — confirm which
# one this exercise is meant to demonstrate.
housing_category = housing['ocean_proximity']
le = LabelEncoder()
housing_category_le = le.fit_transform(housing_category)
type(housing_category_le)

numpy.ndarray

In [17]:
# Put the integer codes in a single-column frame and append it to the numeric features.
ocean_proximity = pd.DataFrame({'ocean_proximity': housing_category_le})
housing_encoded1 = pd.concat(objs=[housing_t, ocean_proximity], axis='columns')
housing_encoded1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,3
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,3
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,3
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,3
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,3
...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,1
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,1
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,1
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,1


In [24]:
# NOTE(review): this cell's execution count (24) is out of sequence with its
# neighbours (17 and 18). It prints the feature frame as plain text.
print(housing)

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0        -122.23     37.88                41.0        880.0           129.0   
1        -122.22     37.86                21.0       7099.0          1106.0   
2        -122.24     37.85                52.0       1467.0           190.0   
3        -122.25     37.85                52.0       1274.0           235.0   
4        -122.25     37.85                52.0       1627.0           280.0   
...          ...       ...                 ...          ...             ...   
20635    -121.09     39.48                25.0       1665.0           374.0   
20636    -121.21     39.49                18.0        697.0           150.0   
20637    -121.22     39.43                17.0       2254.0           485.0   
20638    -121.32     39.43                18.0       1860.0           409.0   
20639    -121.24     39.37                16.0       2785.0           616.0   

       population  households  median_income ocean_

## Dividindo os dados em treino e teste

In [18]:
# Hold out 20% of the rows for testing; the fixed seed keeps the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    housing_encoded1,
    housing_target,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)

In [19]:
# Print the held-out target values as a sanity check of the split.
print(y_test)

4712     355000.0
2151      70700.0
15927    229400.0
82       112500.0
8161     225400.0
           ...   
2319      68200.0
5341     225000.0
16888    350000.0
6823     227300.0
11878    141700.0
Name: median_house_value, Length: 4128, dtype: float64


## Feature Scaling

In [20]:
#z = (x - u) / s
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [21]:
# Print the scaled training features (a NumPy array after the scaler step).
print(X_train)

[[-1.42250942  0.97229046  1.85890297 ... -0.57159385 -0.36232605
   1.28811826]
 [-1.38265919  1.08459626  1.06434823 ... -0.51668155 -0.14102329
   1.28811826]
 [-0.8297373   1.06119922 -1.0014941  ... -1.29329827 -0.66144956
  -0.1168232 ]
 ...
 [ 0.65468363 -0.79652586  1.06434823 ... -0.54283026 -1.45044201
  -0.81929393]
 [ 1.20262424 -0.89011402 -1.47822694 ... -0.06169398 -0.65764311
  -0.1168232 ]
 [-1.30794002  1.00972573  0.50815991 ... -0.28918777 -0.83136525
   1.28811826]]


In [22]:
# Print the scaled test features (a NumPy array after the scaler step).
print(X_test)

[[ 0.60487084 -0.73569355  0.82598181 ...  0.32269207 -0.33102858
  -0.81929393]
 [-0.10247067  0.53710549  0.66707086 ... -0.16367395 -1.0032899
  -0.1168232 ]
 [-1.41752814  0.98164928  1.38217013 ... -0.24734983  0.0724551
   1.28811826]
 ...
 [-1.39262175  0.92081697 -0.20693936 ... -1.30114288  0.61952652
   1.99058899]
 [ 0.73438408 -0.72165533  1.06434823 ... -0.59512769  0.52806599
  -0.81929393]
 [ 1.09303611 -0.76844941  1.85890297 ... -0.49053284 -0.61894421
  -0.1168232 ]]


## Salvando os conjuntos em pickle

In [23]:
# Save each split to its own pickle file in the current working directory.
# The `with` block closes every file handle as soon as its dump finishes, so
# the data is flushed to disk and no descriptor is left open.
for pickle_name, split in (
    ('X_train.pickle', X_train),
    ('X_test.pickle', X_test),
    ('y_train.pickle', y_train),
    ('y_test.pickle', y_test),
):
    with open(pickle_name, 'wb') as pickle_file:
        pickle.dump(split, pickle_file)

### Atividade (1.0 pt): Pré-processamento dos dados de COVID-19 no Piauí (equipe de 3 integrantes)

Apresentar um Jupyter notebook que gere 4 pickles (X_train.pickle, y_train.pickle, X_test.pickle e y_test.pickle) referentes aos dados pré-processados, considerando o dataset dos casos de COVID-19 no estado do Piauí. Considere que o alvo (y) do dataset é o atributo número de mortes (deaths).