# PUMPKING CLASSIFICATION USING DECISION TREES AND RANDOM FOREST

Pumpkin seeds are frequently consumed as confection worldwide because of their adequate amount of protein, fat, carbohydrate, and mineral contents. This study was carried out on the two most important and quality types of pumpkin seeds, “Ürgüp Sivrisi” and “Çerçevelik”. 

In [59]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import f1_score

In [60]:
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    strat = df[stratify] if stratify else None
    train_set, test_set = train_test_split(
        df, test_size=0.4, random_state=rstate, shuffle=shuffle, stratify=strat)
    strat = test_set[stratify] if stratify else None
    val_set, test_set = train_test_split(
        test_set, test_size=0.5, random_state=rstate, shuffle=shuffle, stratify=strat)
    return (train_set, val_set, test_set)

In [61]:
def remove_labels(df, label_name):
    X = df.drop(label_name, axis=1)
    y = df[label_name].copy()
    return (X, y)

In [62]:
def evaluate_result(y_pred, y, y_prep_pred, y_prep, metric):
    print(metric.__name__, "WITHOUT preparation:", metric(y_pred, y, average='weighted'))
    print(metric.__name__, "WITH preparation:", metric(y_prep_pred, y_prep, average='weighted'))

In [63]:
#let's read the csv file that contains the dataset


path = os.getcwd() + '\data\Pumpkin_Seeds_Dataset.xlsx'
df = pd.read_excel(path, header=0, names=None)
df.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Convex_Area,Equiv_Diameter,Eccentricity,Solidity,Extent,Roundness,Aspect_Ration,Compactness,Class
0,56276,888.242,326.1485,220.2388,56831,267.6805,0.7376,0.9902,0.7453,0.8963,1.4809,0.8207,Çerçevelik
1,76631,1068.146,417.1932,234.2289,77280,312.3614,0.8275,0.9916,0.7151,0.844,1.7811,0.7487,Çerçevelik
2,71623,1082.987,435.8328,211.0457,72663,301.9822,0.8749,0.9857,0.74,0.7674,2.0651,0.6929,Çerçevelik
3,66458,992.051,381.5638,222.5322,67118,290.8899,0.8123,0.9902,0.7396,0.8486,1.7146,0.7624,Çerçevelik
4,66107,998.146,383.8883,220.4545,67117,290.1207,0.8187,0.985,0.6752,0.8338,1.7413,0.7557,Çerçevelik


In [64]:
print("Length of the dataset:", len(df))
print("Number of characteristics:", len(df.columns))

Length of the dataset: 2500
Number of characteristics: 13


In [65]:
df["Class"].value_counts() #To know how many classes are there.

Çerçevelik       1300
Ürgüp Sivrisi    1200
Name: Class, dtype: int64

In [66]:
#The values of the output are categorical, so we have to transform them into numerical values. 
X = df.copy()
X["Class"] = X["Class"].factorize()[0]

In [67]:
corr_matrix = X.corr()
corr_matrix["Class"].sort_values(ascending=False)

Class                1.000000
Aspect_Ration        0.721796
Eccentricity         0.699319
Major_Axis_Length    0.561458
Perimeter            0.388345
Area                 0.170280
Convex_Area          0.168029
Equiv_Diameter       0.160303
Solidity             0.122674
Extent              -0.236076
Minor_Axis_Length   -0.401362
Roundness           -0.669514
Compactness         -0.726676
Name: Class, dtype: float64

In [68]:
X.corr()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Convex_Area,Equiv_Diameter,Eccentricity,Solidity,Extent,Roundness,Aspect_Ration,Compactness,Class
Area,1.0,0.928548,0.789133,0.685304,0.999806,0.998464,0.159624,0.158388,-0.014018,-0.149378,0.15996,-0.160438,0.17028
Perimeter,0.928548,1.0,0.946181,0.392913,0.929971,0.928055,0.464601,0.06534,-0.1406,-0.500968,0.48788,-0.48444,0.388345
Major_Axis_Length,0.789133,0.946181,1.0,0.099376,0.789061,0.787078,0.704287,0.119291,-0.21499,-0.684972,0.729156,-0.726958,0.561458
Minor_Axis_Length,0.685304,0.392913,0.099376,1.0,0.685634,0.69002,-0.590877,0.090915,0.233576,0.558566,-0.598475,0.603441,-0.401362
Convex_Area,0.999806,0.929971,0.789061,0.685634,1.0,0.998289,0.159156,0.139178,-0.015449,-0.153615,0.159822,-0.160432,0.168029
Equiv_Diameter,0.998464,0.928055,0.787078,0.69002,0.998289,1.0,0.156246,0.159454,-0.01097,-0.145313,0.155762,-0.156411,0.160303
Eccentricity,0.159624,0.464601,0.704287,-0.590877,0.159156,0.156246,1.0,0.043991,-0.327316,-0.890651,0.950225,-0.981689,0.699319
Solidity,0.158388,0.06534,0.119291,0.090915,0.139178,0.159454,0.043991,1.0,0.067537,0.200836,0.02641,-0.019967,0.122674
Extent,-0.014018,-0.1406,-0.21499,0.233576,-0.015449,-0.01097,-0.327316,0.067537,1.0,0.352338,-0.329933,0.336984,-0.236076
Roundness,-0.149378,-0.500968,-0.684972,0.558566,-0.153615,-0.145313,-0.890651,0.200836,0.352338,1.0,-0.935233,0.933308,-0.669514


In [69]:
corr_matrix[corr_matrix["Class"] > 0.05]

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Convex_Area,Equiv_Diameter,Eccentricity,Solidity,Extent,Roundness,Aspect_Ration,Compactness,Class
Area,1.0,0.928548,0.789133,0.685304,0.999806,0.998464,0.159624,0.158388,-0.014018,-0.149378,0.15996,-0.160438,0.17028
Perimeter,0.928548,1.0,0.946181,0.392913,0.929971,0.928055,0.464601,0.06534,-0.1406,-0.500968,0.48788,-0.48444,0.388345
Major_Axis_Length,0.789133,0.946181,1.0,0.099376,0.789061,0.787078,0.704287,0.119291,-0.21499,-0.684972,0.729156,-0.726958,0.561458
Convex_Area,0.999806,0.929971,0.789061,0.685634,1.0,0.998289,0.159156,0.139178,-0.015449,-0.153615,0.159822,-0.160432,0.168029
Equiv_Diameter,0.998464,0.928055,0.787078,0.69002,0.998289,1.0,0.156246,0.159454,-0.01097,-0.145313,0.155762,-0.156411,0.160303
Eccentricity,0.159624,0.464601,0.704287,-0.590877,0.159156,0.156246,1.0,0.043991,-0.327316,-0.890651,0.950225,-0.981689,0.699319
Solidity,0.158388,0.06534,0.119291,0.090915,0.139178,0.159454,0.043991,1.0,0.067537,0.200836,0.02641,-0.019967,0.122674
Aspect_Ration,0.15996,0.48788,0.729156,-0.598475,0.159822,0.155762,0.950225,0.02641,-0.329933,-0.935233,1.0,-0.990778,0.721796
Class,0.17028,0.388345,0.561458,-0.401362,0.168029,0.160303,0.699319,0.122674,-0.236076,-0.669514,0.721796,-0.726676,1.0


In [70]:
train_set, val_set, test_set = train_val_test_split(df)

In [71]:
#In order to remove the last column where it states the class of the pumpking seeds

X_train, y_train = remove_labels(train_set, 'Class')
X_val, y_val = remove_labels(val_set, 'Class')
X_test, y_test = remove_labels(test_set, 'Class')

In [72]:
#Let's use the decision tree

from sklearn.tree import DecisionTreeClassifier

clf_tree = DecisionTreeClassifier(random_state=42, max_depth=8) #The more depth we put, the more likely to produce overfitting
clf_tree.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=8, random_state=42)

In [73]:
# We predict with the training set
y_train_pred = clf_tree.predict(X_train)

In [74]:
print("F1 Score Train Set:", f1_score(y_train_pred, y_train, average='weighted'))

F1 Score Train Set: 0.9527339171412652


In [75]:
# We predict with the validation set
y_val_pred = clf_tree.predict(X_val)

In [76]:
# Let's compare
print("F1 Score Validation Set:", f1_score(y_val_pred, y_val, average='weighted'))

F1 Score Validation Set: 0.842227996071124


In [77]:
#Let's use random forest
from sklearn.ensemble import RandomForestClassifier


clf_rnd = RandomForestClassifier(n_estimators=30, random_state=42, n_jobs=-1)
clf_rnd.fit(X_train, y_train)

RandomForestClassifier(n_estimators=30, n_jobs=-1, random_state=42)

In [78]:
y_train_pred = clf_rnd.predict(X_train)
print("F1 Score Train Set:", f1_score(y_train_pred, y_train, average='weighted'))

F1 Score Train Set: 0.9973333333333333


In [80]:
y_pred = clf_rnd.predict(X_val)
print("F1 Score Train Set:", f1_score(y_pred, y_val, average='weighted'))

F1 Score Train Set: 0.8782354211925192


### We see that we obtain better results using the random forest!!