# Conceção e otimização de modelos de Machine Learning

### Dados
Neste trabalho, será usado o dataset <em>train.csv</em>

Este dataset contém 19 237 casos e 18 features, incluindo:

- Attributes
- ID
- Price: price of the car (Target Column)
- Levy
- Manufacturer
- Model
- Prod. year
- Category
- Leather interior
- Fuel type
- Engine volume
- Mileage
- Cylinders
- Gear box type
- Drive wheels
- Doors
- Wheel
- Color
- Airbags

## Importar Bibliotecas

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Ler Dados

In [None]:
# Load the training set; the path is relative to the notebook's working directory.
data = pd.read_csv('datasets/train.csv')

In [None]:
# List the column labels of the loaded frame.
data.columns

In [None]:
# Preview the first rows of the raw data.
data.head()

In [None]:
# Print the dtype and non-null count of every column.
data.info()

## 1. Valores em Falta

In [None]:
# Drop the 'ID' column. errors='ignore' keeps the cell re-runnable: executing
# it a second time is a no-op instead of raising KeyError.
data = data.drop(columns=['ID'], errors='ignore')
data.head()

In [None]:
# Drop the 'Color' column. errors='ignore' keeps the cell re-runnable: executing
# it a second time is a no-op instead of raising KeyError.
data = data.drop(columns=['Color'], errors='ignore')
data.head()

In [None]:
# Count rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# Remove exact duplicate rows and renumber the index from 0.
data = data.drop_duplicates().reset_index(drop=True)

# A bare expression in the middle of a cell is never displayed, so print the
# check explicitly; it should report 0 remaining duplicates.
print(data.duplicated().sum())

data

In [None]:
# The '-' placeholder in Levy is turned into a real NaN so that pandas'
# missing-value tools can detect it.
data['Levy'] = data['Levy'].replace({'-': np.nan})

In [None]:
# Missing values per column.
data.isna().sum()

In [None]:
# Visual map of missing cells: rows are records, columns are features.
sns.heatmap(data.isna(), cmap='viridis', cbar=False, yticklabels=False)

Preencher ou eliminar valores em falta

In [None]:
# Preview Levy before the missing values are filled.
data['Levy'].head()

In [None]:
# Rows whose Levy is missing.
data.loc[data['Levy'].isna()]

In [None]:
# Mark missing Levy values with the sentinel -1.
data['Levy'] = data['Levy'].fillna(-1)

In [None]:
# Preview Levy again to confirm the fill.
data['Levy'].head()

In [None]:
# Re-check missing values per column after the fill.
data.isna().sum()

In [None]:
# Number of distinct values in each column.
data.nunique()

## 2. Tratamento de dados categóricos

In [None]:
# Frequency of each distinct Levy value.
print(data['Levy'].value_counts())

In [None]:
# How many distinct Levy values exist (NaN is not counted).
print(data['Levy'].nunique())

In [None]:
# Levy may still hold numeric strings here (the cast to float happens in a
# later cell), so convert explicitly instead of relying on pandas to coerce an
# object column inside median().
print(pd.to_numeric(data['Levy']).median())

In [None]:
# Store Levy as floating point, then report its standard deviation.
data['Levy'] = data['Levy'].astype(float)
data['Levy'].std()

In [None]:
# Mean of Levy; note that the -1 sentinel rows are included in this figure.
data['Levy'].mean()

In [None]:
# Working copy in which the -1 sentinel is turned back into NaN, so that
# NaN-skipping statistics ignore the unknown rows.
data_r1 = data.copy()
data_r1['Levy'] = data_r1['Levy'].mask(data_r1['Levy'] == -1)

In [None]:
# The maximum is read from `data`, the minimum from `data_r1`.
print(
    "max -> ", data['Levy'].max(),
    "| min -> ", data_r1['Levy'].min(),
)

In [None]:
# Preview the frame before Levy is discretised.
data.head()

In [None]:
# Discretise Levy into ordinal classes. This overwrites the 'Levy' column
# itself; no separate column is created.
# Label meaning: -1 unknown, 0 very low, 1 low, 2 medium, 3 high, 4 very high.
labels = ['-1', '0', '1', '2', '3', '4']

# With right=False the intervals are left-closed: [-2, 0) captures the -1
# sentinel, then [0, 100), [100, 550), [550, 1000), [1000, 8000), [8000, inf).
# The last edge is open-ended so that unusually large levies land in class '4'
# instead of silently becoming NaN.
bins = [-2, 0, 100, 550, 1000, 8000, np.inf]

data['Levy'] = pd.cut(data['Levy'], bins=bins, labels=labels, right=False)

# Show how many rows fall into each class
print(data['Levy'].value_counts())

In [None]:
# Bar chart of the number of rows per Levy class.
incidents_count = data['Levy'].value_counts()
sns.set(style="darkgrid")
ax = sns.barplot(x=incidents_count.index, y=incidents_count.values)
ax.set_title('Frequency Distribution of Levy')
ax.set_ylabel('Number of Cases', fontsize=12)
ax.set_xlabel('Levy type', fontsize=12)
plt.show()

In [None]:
# Pie chart of the share of rows in each Levy class.
labels = list(data['Levy'].astype('category').cat.categories)
counts = data['Levy'].value_counts()
# Order the slice sizes to match the category order used for the labels.
sizes = [counts.loc[category] for category in labels]
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, shadow=True, autopct='%1.1f%%')
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

In [None]:
# Remove the literal ' km' unit and store Mileage as a float.
# str.rstrip(' km') is avoided on purpose: rstrip takes a *set of characters*
# (' ', 'k', 'm'), not a suffix. Replacing the exact substring states the
# intent and is vectorised, with no per-row lambda.
data['Mileage'] = data['Mileage'].str.replace(' km', '', regex=False).astype('float')
print(data['Mileage'].median())
print("max -> " ,data['Mileage'].max(), "| min -> ", data['Mileage'].min())

In [None]:
# Discretise Mileage into ordinal classes. This overwrites the 'Mileage'
# column itself; no separate column is created.
# Label meaning: 0 new, 1 few km, 2 medium km, 3 many km, 4 very many km.
labels = ['0', '1', '2', '3', '4']

# With right=False the intervals are left-closed: [0, 65000), [65000, 130000),
# [130000, 200000), [200000, 500000), [500000, inf). An open-ended last edge
# replaces the arbitrary huge constant and guarantees no value overflows the
# top class.
bins = [0, 65000, 130000, 200000, 500000, np.inf]

data['Mileage'] = pd.cut(data['Mileage'], bins=bins, labels=labels, right=False)

# Show how many rows fall into each class
print(data['Mileage'].value_counts())

In [372]:
# Display the frame after Levy and Mileage have been discretised.
data

Unnamed: 0,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Airbags
0,13328,3,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,2,6.0,Automatic,4x4,04-May,Left wheel,12
1,16621,3,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3,2,6.0,Tiptronic,4x4,04-May,Left wheel,8
2,8467,-1,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,3,4.0,Variator,Front,04-May,Right-hand drive,2
3,3607,2,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,2,4.0,Automatic,4x4,04-May,Left wheel,0
4,11726,1,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,1,4.0,Automatic,Front,04-May,Left wheel,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15710,470,2,TOYOTA,Prius,2011,Hatchback,Yes,Hybrid,1.8,3,4.0,Automatic,Front,04-May,Left wheel,12
15711,8467,-1,MERCEDES-BENZ,CLK 200,1999,Coupe,Yes,CNG,2.0 Turbo,3,4.0,Manual,Rear,02-Mar,Left wheel,5
15712,15681,2,HYUNDAI,Sonata,2011,Sedan,Yes,Petrol,2.4,2,4.0,Tiptronic,Front,04-May,Left wheel,8
15713,26108,2,HYUNDAI,Tucson,2010,Jeep,Yes,Diesel,2,1,4.0,Automatic,Front,04-May,Left wheel,4


In [None]:
fig = plt.figure(figsize=(10,10))
# Pearson correlation is only defined for numeric columns. Since pandas 2.0,
# DataFrame.corr no longer drops non-numeric columns by default and raises on
# string columns such as Manufacturer, so restrict the frame explicitly.
# The binned Levy and Mileage columns are categorical and therefore excluded.
corr = data.select_dtypes(include='number').corr(method='pearson')
sns.heatmap(corr, linecolor='black', linewidths=0.5)