## Classificando diferentes tipos de semente de abóbora

As sementes de abóbora são frequentemente consumidas como petisco em todo o mundo devido aos seus teores adequados de proteínas, gorduras, carboidratos e minerais. Este estudo foi realizado com os dois tipos de sementes de abóbora mais importantes e de melhor qualidade, “Ürgüp Sivrisi” e “Çerçevelik”, geralmente cultivados nas regiões de Ürgüp e Karacaören, na Turquia.

### Features

- Area (Área)
- Perimeter (Perímetro)
- Major_Axis_Length (Comprimento do Eixo Principal)
- Minor_Axis_Length (Comprimento do Eixo Menor)
- Convex_Area (Área Convexa)
- Equiv_Diameter (Diâmetro Equivalente)
- Eccentricity (Excentricidade)
- Solidity (Solidez)
- Extent (Extensão)
- Roundness (Redondeza)
- Aspect_Ration (Razão de Aspecto)
- Compactness (Compacidade)

### Target
Class (Çerçevelik, Ürgüp Sivrisi)






In [None]:
# Bibliotecas padrão
import numpy as np 
import pandas as pd 
import warnings
warnings.simplefilter('ignore')

# Visualização
import seaborn as sns
import matplotlib.pyplot as plt

# keras libraries
import tensorflow
from tensorflow import keras
from keras import models
from keras import layers
from keras import metrics
from keras.metrics import Precision
from tqdm.keras import TqdmCallback

# sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import RobustScaler,MinMaxScaler,StandardScaler


In [None]:
# Load the pumpkin seeds dataset from the Excel workbook into a DataFrame.
df = pd.read_excel('Pumpkin_Seeds_Dataset.xlsx')

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Dimensions of the dataset as (rows, columns).
df.shape

## Data Cleaning

In [None]:
# Count missing values per column.
df.isnull().sum()

## Estatística básica

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

Verificando os valores únicos da variável alvo:

In [None]:
# Distinct values found in the target column.
df['Class'].unique()

Verificando a proporção:

In [None]:
# Number of rows per class.
df['Class'].value_counts()

### Feature Transform

Transformando a variável target para binária:

In [None]:
# Fit a LabelEncoder on the class names and replace the target column
# with the resulting integer codes.
le = LabelEncoder().fit(df['Class'])
df['Class'] = le.transform(df['Class'])

In [None]:
# Distinct values of the target column after label encoding.
df['Class'].unique()

In [None]:
# Number of rows per encoded class.
df['Class'].value_counts()

## Análise Exploratória

In [None]:
# Draw one histogram per column to inspect how symmetric each distribution is.
hist_options = {'bins': 100, 'figsize': (12, 12)}
df.hist(**hist_options)
plt.show()

In [None]:
# Pairwise correlations between the columns, rounded to two decimals so the
# cell annotations stay short.
correlation_matrix = df.corr()
correlation_matrix = correlation_matrix.round(2)

# Annotated heatmap drawn on a wide figure.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(correlation_matrix, annot=True, linewidths=0.5, ax=ax)

In [None]:
# Print the column names, non-null counts and dtypes of the dataset.
df.info()

## Separando a base de dados

In [None]:
# Alternative kept for reference: use every column except the target.
# X = df.drop('Class', axis=1)

# Hand-picked predictor columns used as model inputs.
feature_columns = [
    'Aspect_Ration',
    'Eccentricity',
    'Solidity',
    'Equiv_Diameter',
    'Convex_Area',
    'Major_Axis_Length',
    'Perimeter',
    'Area',
]
X = df[feature_columns]

# Encoded class label is the prediction target.
y = df['Class']

In [None]:
# Convert every integer-typed feature column to float.
# np.integer matches all integer widths; np.int_ is platform dependent and can
# miss int64 columns on platforms where it maps to int32.
int_cols = X.select_dtypes(include=np.integer).columns.tolist()

# Rebind X to a converted copy instead of assigning column by column: X was
# sliced out of df, so per-column assignment is chained assignment and would
# raise SettingWithCopyWarning if warnings were not silenced.
X = X.astype({c: 'float' for c in int_cols})

In [None]:
# Hold out 20% of the rows for testing.
# random_state makes the split repeatable from run to run, and stratify=y keeps
# the class ratio the same in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=7,
    stratify=y,
)

In [None]:
# Number of rows in the training split.
len(X_train)

In [None]:
# Number of rows in the test split.
len(X_test)

## Feature Scaling

In [None]:
# Fit the standardisation statistics on the training split only, so nothing
# about the test split influences the scaling.
scaler = StandardScaler()
scaler.fit(X_train)

In [None]:
# Display the training features before scaling is applied.
X_train

In [None]:
# Scale both splits with the already-fitted scaler; transform() returns arrays,
# so X_train and X_test stop being DataFrames from here on.
X_train, X_test = (scaler.transform(split) for split in (X_train, X_test))

In [None]:
# Display the training features after scaling.
X_train

In [None]:
# Number of training samples (features).
len(X_train)

In [None]:
# Number of training samples (targets); should match the feature count above.
len(y_train)

In [None]:
# Shape of the training target before it is reshaped.
y_train.shape

In [None]:
# Shape of the test target before it is reshaped.
y_test.shape

In [None]:
# Turn both targets into float32 column vectors of shape (n_samples, 1).
# reshape() only changes the layout of the array, not its values.
y_train = np.asarray(y_train, dtype='float32').reshape(-1, 1)
y_test = np.asarray(y_test, dtype='float32').reshape(-1, 1)

In [None]:
# Shape of the training target.
y_train.shape

In [None]:
# Shape of the test target.
y_test.shape

## Construindo o modelo de Deep learning

In [None]:
# Number of feature columns in the training matrix, i.e. the model's input width.
input_shape = X_train.shape[1]  
input_shape

In [None]:
# Number of target columns, i.e. the number of units in the model's output layer.
output_shape = y_train.shape[1] 
output_shape

In [None]:
import tensorflow as tf

In [None]:
# Fix TensorFlow's global random seed.
tf.random.set_seed(7)

# Model and training hyper-parameters.
input_shape = X_train.shape[1]    # number of input features
output_shape = y_train.shape[1]   # number of output units
batch_size = 20                   # mini-batch size for training
# Layer widths are kept separate from batch_size: previously the first Dense
# layer reused batch_size as its unit count, so changing the batch size would
# silently change the architecture as well.
first_hidden_units = 20
second_hidden_units = 10
dropout_rate = 0.5

# Build a small fully connected binary classifier.
model = models.Sequential()
# Explicit input layer (replaces the deprecated input_shape= kwarg on Dense).
model.add(layers.Input(shape=(input_shape,)))
# First hidden layer.
model.add(layers.Dense(first_hidden_units, activation='relu'))
# Second hidden layer.
model.add(layers.Dense(second_hidden_units, activation='relu'))
# Dropout for regularisation.
model.add(layers.Dropout(dropout_rate))
# Output layer: sigmoid yields a value in [0, 1] per output unit.
model.add(layers.Dense(output_shape, activation='sigmoid'))

# Binary cross-entropy matches the single sigmoid output.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the layer-by-layer summary.
model.summary()

In [None]:
# Number of full passes over the training data.
epoch = 100

# Train silently (verbose=0) and let the tqdm callback render progress instead.
# NOTE(review): the test split doubles as validation data here, so the
# validation curves are not an independent estimate of generalisation;
# consider carving a separate validation split out of the training data.
hist = model.fit(
    X_train,
    y_train,
    epochs=epoch,
    batch_size=batch_size,
    shuffle=True,
    validation_data=(X_test, y_test),
    verbose=0,
    callbacks=[TqdmCallback(verbose=0)],
)

# Persist the trained model. The explicit ".keras" extension selects the native
# Keras format; Keras 3 rejects a path without a recognised extension.
model.save('pumpkinSeedClassification.keras')

In [None]:
# Final-epoch accuracies. Training accuracy alone overstates how well the model
# generalises, so the validation accuracy is reported next to it.
acc = '{:.2%}'.format(hist.history['accuracy'][-1])
final_val_acc = '{:.2%}'.format(hist.history['val_accuracy'][-1])
print(
    f"The model has achieved a training accuracy of {acc} "
    f"and a validation accuracy of {final_val_acc} with {epoch} epochs"
)

In [None]:
# Per-epoch metrics recorded during training.
acc = hist.history['accuracy']
val_acc = hist.history['val_accuracy']

loss = hist.history['loss']
val_loss = hist.history['val_loss']

# Take the x-axis from the recorded history rather than the requested epoch
# count, so the plot still works if training ended before all epochs ran.
epochs_range = range(len(acc))

plt.figure(figsize=(20, 8))

# Left panel: accuracy on the training and validation data.
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Acurácia de Treinamento')
plt.plot(epochs_range, val_acc, label='Acurácia de Validação')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

# Right panel: loss on the training and validation data.
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Erro de treinamento')
plt.plot(epochs_range, val_loss, label='Erro de Validação')
plt.legend(loc='upper right')
plt.title('Erro de treinamento vs validação')
plt.show()

In [None]:
# Predicted probabilities for the test split, turned into hard 0/1 labels with
# an explicit 0.5 decision threshold.
y_pred = model.predict(X_test)
y_pred_class = (np.asarray(y_pred).ravel() > 0.5).astype(int)
# Flatten the (n, 1) target and cast to int so both label arrays share a dtype.
y_test_class = np.asarray(y_test).ravel().astype(int)

# Confusion matrix: rows are true classes, columns are predicted classes.
cfm = confusion_matrix(y_test_class, y_pred_class)

# Annotate every cell with its role and its count. The labels were previously
# built but never handed to the heatmap, which showed bare counts instead.
plt.figure(figsize=(8, 8))
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
group_counts = ["{0:0.0f}".format(value) for value in cfm.flatten()]
labels = [f"{v1}\n{v2}" for v1, v2 in zip(group_names, group_counts)]
# Binary problem: the four labels map onto the 2x2 matrix in row-major order.
labels = np.asarray(labels).reshape(cfm.shape)
# fmt='' is required because the annotations are strings, not numbers.
sns.heatmap(cfm, annot=labels, cmap='crest', cbar=False, fmt='')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

- 0 = Çerçevelik
- 1 = Ürgüp Sivrisi

In [None]:
# Class names ordered by their encoded label: le.classes_[i] is the original
# name of label i. classification_report expects target_names in sorted label
# order, which y.unique() (order of first appearance) does not guarantee.
class_names = [str(name) for name in le.classes_]

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test_class, y_pred_class, target_names=class_names))