#Análisis del dataset de semillas de calabaza

In [None]:
from scipy.io.arff import loadarff
import pandas as pd


In [None]:
# Load the ARFF file; loadarff returns a (records, metadata) pair.
data = loadarff('Pumpkin_Seeds_Dataset.arff')
records, metadata = data
# Dataset description. There are two seed varieties: “Ürgüp Sivrisi” and “Çerçevelik”.
print(metadata)
# Feature table
df = pd.DataFrame(records)

In [None]:
# Preview the first rows of the feature table (displayed as the notebook cell output).
df.head()

In [None]:
# Print pandas' summary of the DataFrame to check its columns and dtypes.
df.info()

In [None]:
# The target is the seed type (the "Class" column); every other column is a feature.
y = df["Class"].astype(str)
X = df.drop(columns=["Class"])
print(X)

In [None]:
# Inspect the distinct values of the target variable.
target_names = y.unique()
print(target_names)

In [None]:
# Remove the unwanted wrapper around each label (presumably the bytes-literal
# text produced by astype(str)): first the b' prefix, then the trailing quote.
y = y.str.replace("b'", "").str.replace("'", "")
print(y.unique())

In [None]:
# If numeric labels are needed, convert the class names to integer codes.
# An explicit lookup through Series.map is used instead of Series.replace:
# replacing strings with ints relies on pandas silently downcasting the object
# column to integers, which is deprecated since pandas 2.2. Labels missing from
# the lookup are left unchanged, matching what replace did.
label_codes = {"CERCEVELIK": 0, "URGUP_SIVRISI": 1}
y = y.map(lambda label: label_codes.get(label, label))
print(y.unique())

#KNN


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 35% of the samples for testing; the fixed random_state keeps the split reproducible.
splits = train_test_split(X, y, test_size=0.35, random_state=10)
X_train, X_test, y_train, y_test = splits

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Train a k-nearest-neighbours classifier with k = 7 on the training split.
knn_settings = {"n_neighbors": 7}
KNN = KNeighborsClassifier(**knn_settings)
KNN.fit(X=X_train, y=y_train)

In [None]:
# Predictions on the data the model was trained on.
y_train_pred = KNN.predict(X=X_train)

In [None]:
# Predictions on the held-out test data.
y_test_pred = KNN.predict(X=X_test)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Report the accuracy of the fitted classifier on both splits.
acc_train = accuracy_score(y_train, y_train_pred)
acc_test = accuracy_score(y_test, y_test_pred)
print("Accuracy - Datos de train: {} ".format(acc_train))
print("Accuracy - Datos de test: {} ".format(acc_test))

In [None]:
# Candidate k values (1 through 49) and the accuracy lists filled in by the sweep.
k_value = [*range(1, 50)]
train_score, test_score = [], []

In [None]:
# Sweep over the candidate hyperparameter values, recording the train and
# test accuracy obtained for each k.
for k in k_value:
    KNN = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_train_pred, y_test_pred = KNN.predict(X_train), KNN.predict(X_test)
    train_score.append(accuracy_score(y_train, y_train_pred))
    test_score.append(accuracy_score(y_test, y_test_pred))

In [None]:
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
# Request the "retina" (high-resolution) format for inline figures.
# NOTE(review): recent IPython releases deprecate IPython.display.set_matplotlib_formats
# in favour of matplotlib_inline.backend_inline — confirm against the installed version.
set_matplotlib_formats("retina")

In [None]:
# Plot train and test accuracy against k so both curves can be compared.
plt.figure(figsize=(10, 6))
curves = (
    (train_score, "blue", "accuracy train"),
    (test_score, "red", "accuracy test"),
)
for scores, line_color, line_label in curves:
    plt.plot(k_value, scores, color=line_color, label=line_label)
plt.legend()


In [None]:
# Repeat the hyperparameter sweep, this time with scikit-learn's GridSearchCV.

from sklearn.model_selection import GridSearchCV

# Evaluate every k from 1 to 30. A literal list [1, 30] would only try the two
# endpoints, which is not a sweep. A fresh estimator is passed so the search
# does not depend on whatever configuration KNN was last left in.
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid={'n_neighbors': list(range(1, 31))},
    scoring='accuracy',
    return_train_score=True,
    verbose=1,
)
# Fit the search on the training split only.
grid_search = grid.fit(X_train, y_train)
# With cv left at its default, GridSearchCV runs 5-fold cross-validation.

In [None]:
# Report the best hyperparameter found and its cross-validated score as a percentage.
best_params = grid_search.best_params_
print("El mejor parámetro ha sido: " + str(best_params))
accuracy = 100 * grid_search.best_score_
print("Con una accuracy de : {:.2f}%".format(accuracy))

In [None]:
# Evaluate on the test data with the best hyperparameter found by the grid search.
# The value is read from grid_search.best_params_ rather than hard-coded, so this
# cell stays correct when the search selects a different k.
KNN = KNeighborsClassifier(**grid_search.best_params_)
KNN.fit(X_train, y_train)
y_predict = KNN.predict(X_test)
test_accuracy = accuracy_score(y_test, y_predict) * 100

print("La accuracy en test ha sido de: {:.2f}%".format(test_accuracy))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Inspect the distribution of every feature with one box plot per column.
columns = df.columns.drop(['Class'])

# Set the font sizes before the figure exists: rc settings are only guaranteed
# to apply to artists created after they are set.
plt.rc('axes', titlesize=10)     # fontsize of the axes title
plt.rc('axes', labelsize=10)     # fontsize of the x and y labels
plt.rc('xtick', labelsize=10)    # fontsize of the tick labels
plt.rc('ytick', labelsize=10)    # fontsize of the tick labels

fig, axes = plt.subplots(ncols=len(columns), figsize=(15, 5))

for column, axis in zip(columns, axes):
    sns.boxplot(data=df[column], ax=axis)
    axis.set_title(column)

# Adjust the spacing last, once titles and tick labels exist, so that
# tight_layout can take them into account.
fig.tight_layout(pad=2.0)


In [None]:
# Remember: the data must be standardized (!)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
# Now repeat the whole training by re-running the previous cells.

#PCA

In [None]:
#PCA ayuda a reducir la dimensionalidad mientras mantiene la información esencial sobre la variabilidad de los datos.

from sklearn.decomposition import PCA
import seaborn as sns

In [None]:
# Project the features onto their first two principal components.
pca = PCA(n_components=2)
pca.fit(X)
X_r = pca.transform(X)

lw = 2
colors = ["navy", "turquoise", "darkorange"]

# Scatter plot of the two components, coloured by seed class.
componentsDf = pd.DataFrame(X_r, columns=['PC1', 'PC2'])
plt.figure(figsize=(12, 6))
sns.scatterplot(x="PC1", y="PC2", hue=y, data=componentsDf);

In [None]:
# Use PCA to reduce the dimensionality before classification.
# X_train was split off before the standardization cell, and PCA is sensitive to
# feature scale, so the features are standardized here using statistics from the
# training split only (the test split is transformed with the same scaler).
split_scaler = StandardScaler().fit(X_train)
X_train_std = split_scaler.transform(X_train)
X_test_std = split_scaler.transform(X_test)

pca_2 = PCA(n_components=2)
X_r_train = pca_2.fit(X_train_std).transform(X_train_std)

KNN = KNeighborsClassifier(n_neighbors=10)
KNN.fit(X_r_train, y_train)
X_r_test = pca_2.transform(X_test_std)
y_test_pred = KNN.predict(X_r_test)
print("Accuracy - Datos de test")
# Print the score explicitly: a bare expression is only shown as notebook cell
# output and is lost when the code runs as a script.
print(accuracy_score(y_test, y_test_pred))

#t-SNE

In [None]:
from sklearn.manifold import TSNE


In [None]:
# Remember: PCA is deterministic, but t-SNE is not,
# so running the script twice produces two different plots.
tsne = TSNE(n_components=2, learning_rate='auto', init='random')
X_embedded = tsne.fit_transform(X)

# Scatter plot of the two t-SNE dimensions, coloured by seed class.
componentsDf_tsne = pd.DataFrame(X_embedded, columns=['t-SNE1', 't-SNE2'])
plt.figure(figsize=(12, 6))
sns.scatterplot(x="t-SNE1", y="t-SNE2", hue=y, data=componentsDf_tsne);

#UMAP

In [None]:
# Install umap-learn from inside the notebook. The leading "!" is an IPython
# shell escape, so this line is not valid in a plain Python script.
!pip install umap-learn
from umap import UMAP


In [None]:
# Embed the features in two dimensions with UMAP, using a fixed random_state.
umap_model = UMAP(n_components=2, init='random', random_state=0)
X_embedded = umap_model.fit_transform(X)

# Scatter plot of the two UMAP dimensions, coloured by seed class.
componentsDf_UMAP = pd.DataFrame(X_embedded, columns=['UMAP1', 'UMAP2'])
plt.figure(figsize=(12, 6))
sns.scatterplot(x="UMAP1", y="UMAP2", hue=y, data=componentsDf_UMAP);

