In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


In [24]:
# Load the raisin dataset from the project-relative `datasets` folder.
# A forward slash is used as the separator because it resolves on Windows,
# Linux and macOS alike, whereas a backslash only works on Windows.
df = pd.read_excel("datasets/Raisin_Dataset.xlsx")


# Quick sanity checks: dimensions, numeric summary, and the first few rows.
print(df.shape)
print(df.describe())
df.head()

(900, 8)
                Area  MajorAxisLength  MinorAxisLength  Eccentricity  \
count     900.000000       900.000000       900.000000    900.000000   
mean    87804.127778       430.929950       254.488133      0.781542   
std     39002.111390       116.035121        49.988902      0.090318   
min     25387.000000       225.629541       143.710872      0.348730   
25%     59348.000000       345.442898       219.111126      0.741766   
50%     78902.000000       407.803951       247.848409      0.798846   
75%    105028.250000       494.187014       279.888575      0.842571   
max    235047.000000       997.291941       492.275279      0.962124   

          ConvexArea      Extent    Perimeter  
count     900.000000  900.000000   900.000000  
mean    91186.090000    0.699508  1165.906636  
std     40769.290132    0.053468   273.764315  
min     26139.000000    0.379856   619.074000  
25%     61513.250000    0.670869   966.410750  
50%     81651.000000    0.707367  1119.509000  
75%   

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.04,Kecimen
1,75166,406.690687,243.032436,0.801805,78789,0.68413,1121.786,Kecimen
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,Kecimen
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,Kecimen
4,79408,352.19077,290.827533,0.564011,81463,0.792772,1073.251,Kecimen


In [25]:

# Show the per-class row counts followed by the column index, one per line.
print(df["Class"].value_counts(), df.columns, sep="\n")

Class
Kecimen    450
Besni      450
Name: count, dtype: int64
Index(['Area', 'MajorAxisLength', 'MinorAxisLength', 'Eccentricity',
       'ConvexArea', 'Extent', 'Perimeter', 'Class'],
      dtype='object')


In [26]:
# Separate the target label from the predictor columns.
y = df["Class"]
X = df.drop(columns=["Class"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
from sklearn.svm import SVC

# Baseline: SVC with default hyperparameters, trained on the raw (unscaled) features.
model = SVC()
model.fit(X=X_train, y=y_train)

In [28]:
# Predict labels for the held-out test rows with the baseline SVC.
y_pred = model.predict(X=X_test)

In [29]:
# Per-class precision / recall / F1 for the baseline SVC on the test split.
class_report = classification_report(y_test, y_pred)
print(class_report)

# Display the fitted model's n_iter_ so it can be compared across the experiments.
model.n_iter_

              precision    recall  f1-score   support

       Besni       0.88      0.79      0.83        86
     Kecimen       0.83      0.90      0.86        94

    accuracy                           0.85       180
   macro avg       0.85      0.85      0.85       180
weighted avg       0.85      0.85      0.85       180



array([225], dtype=int32)

In [30]:
# Same experiment with a linear kernel, still on the unscaled features.
model_lin = SVC(kernel="linear")
model_lin.fit(X=X_train, y=y_train)

y_pred_lin = model_lin.predict(X=X_test)

class_report_lin = classification_report(y_test, y_pred_lin)
print(class_report_lin)

# Display n_iter_ for comparison with the other fitted models.
model_lin.n_iter_

              precision    recall  f1-score   support

       Besni       0.82      0.90      0.86        86
     Kecimen       0.90      0.82      0.86        94

    accuracy                           0.86       180
   macro avg       0.86      0.86      0.86       180
weighted avg       0.86      0.86      0.86       180



array([292224717], dtype=int32)

## Scale the data

In [31]:
# Peek at the training labels: string class names, still indexed by their original row.
y_train

10     Kecimen
334    Kecimen
244    Kecimen
678      Besni
306    Kecimen
        ...   
106    Kecimen
270    Kecimen
860      Besni
435    Kecimen
102    Kecimen
Name: Class, Length: 720, dtype: object

In [32]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Learn the scaling statistics from the training split only, then apply that
# same fitted transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [33]:
# RBF-kernel SVC trained on the standardized features.
model_scaled = SVC(kernel="rbf")
model_scaled.fit(X=X_train_scaled, y=y_train)

# The test rows must go through the same scaler before prediction.
y_pred_scaled = model_scaled.predict(X=X_test_scaled)

In [34]:
# Test-set metrics for the RBF SVC trained on standardized features.
class_report_scaled = classification_report(y_test, y_pred_scaled)
print(class_report_scaled)

# Display n_iter_ for comparison with the other fitted models.
model_scaled.n_iter_

              precision    recall  f1-score   support

       Besni       0.86      0.81      0.84        86
     Kecimen       0.84      0.88      0.86        94

    accuracy                           0.85       180
   macro avg       0.85      0.85      0.85       180
weighted avg       0.85      0.85      0.85       180



array([371], dtype=int32)

In [35]:
from collections import Counter

# Compare the true class distribution with the one predicted by the scaled RBF SVC.
for caption, labels in (("Ground truth:", y_test), ("Predicted   :", y_pred_scaled)):
    print(caption, Counter(labels))


Ground truth: Counter({'Kecimen': 94, 'Besni': 86})
Predicted   : Counter({'Kecimen': 99, 'Besni': 81})


## Linear Kernel with scaled data

In [36]:
# Linear-kernel SVC trained on the standardized features.
model_lin_scaled = SVC(kernel="linear")
model_lin_scaled.fit(X=X_train_scaled, y=y_train)

y_pred_lin_scaled = model_lin_scaled.predict(X=X_test_scaled)

class_report_lin_scaled = classification_report(y_test, y_pred_lin_scaled)
print(class_report_lin_scaled)

# Display n_iter_ for comparison with the unscaled linear run.
model_lin_scaled.n_iter_

              precision    recall  f1-score   support

       Besni       0.84      0.87      0.86        86
     Kecimen       0.88      0.85      0.86        94

    accuracy                           0.86       180
   macro avg       0.86      0.86      0.86       180
weighted avg       0.86      0.86      0.86       180



array([1431], dtype=int32)

In [37]:
# Compare the true class distribution with the one predicted by the scaled linear SVC.
for caption, labels in (("Ground truth:", y_test), ("Predicted   :", y_pred_lin_scaled)):
    print(caption, Counter(labels))

Ground truth: Counter({'Kecimen': 94, 'Besni': 86})
Predicted   : Counter({'Kecimen': 91, 'Besni': 89})


## Sklearn pipeline

In [38]:
from sklearn.pipeline import Pipeline

# Bundle scaling and the classifier into one estimator so both are fitted together.
pipeline = Pipeline(steps=[("scale", StandardScaler()), ("svc", SVC())])
pipeline

In [39]:
# The pipeline takes the raw features; scaling happens inside it.
y_pred_pipeline = pipeline.fit(X_train, y_train).predict(X_test)

In [40]:
# Test-set metrics for the scale + SVC pipeline.
pipeline_report = classification_report(y_test, y_pred_pipeline)
print(pipeline_report)


              precision    recall  f1-score   support

       Besni       0.86      0.81      0.84        86
     Kecimen       0.84      0.88      0.86        94

    accuracy                           0.85       180
   macro avg       0.85      0.85      0.85       180
weighted avg       0.85      0.85      0.85       180



In [41]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Base learners for the ensemble.
log_model = LogisticRegression()
svc_model = SVC()
# Seed the tree (same seed as the train/test split) so that re-running the
# notebook reproduces the reported metrics instead of drifting between runs.
dt_model = DecisionTreeClassifier(random_state=42)

# No `voting` argument is passed, so the classifier uses its default voting mode.
# NOTE(review): the ensemble is fitted on the unscaled features, unlike the
# scaled SVC experiments above — confirm this is intentional.
vc = VotingClassifier(estimators=[
    ("lr", log_model),
    ("svc", svc_model),
    ("dt", dt_model)
])

vc.fit(X_train, y_train)
y_pred_vc = vc.predict(X_test)
report = classification_report(y_pred=y_pred_vc, y_true=y_test)
print(report)

              precision    recall  f1-score   support

       Besni       0.87      0.87      0.87        86
     Kecimen       0.88      0.88      0.88        94

    accuracy                           0.88       180
   macro avg       0.88      0.88      0.88       180
weighted avg       0.88      0.88      0.88       180



In [43]:

# Base learners for the soft-voting ensemble.
log_model = LogisticRegression()
# probability=True is required so the SVC exposes predict_proba for soft voting.
# Its probability estimates involve random shuffling, so the model is seeded
# (same seed as the train/test split) to keep the result reproducible.
svc_model = SVC(probability=True, random_state=42)
# Seed the tree as well so repeated runs give the same ensemble.
dt_model = DecisionTreeClassifier(random_state=42)

# Soft voting combines the predicted class probabilities of the base learners.
# NOTE(review): this reuses the names from the previous voting cell and
# overwrites its `vc`, `y_pred_vc` and `report` — confirm that is intended.
vc = VotingClassifier(estimators=[
    ("lr", log_model),
    ("svc", svc_model),
    ("dt", dt_model)
], voting="soft")

vc.fit(X_train, y_train)
y_pred_vc = vc.predict(X_test)
report = classification_report(y_pred=y_pred_vc, y_true=y_test)
print(report)

              precision    recall  f1-score   support

       Besni       0.84      0.85      0.84        86
     Kecimen       0.86      0.85      0.86        94

    accuracy                           0.85       180
   macro avg       0.85      0.85      0.85       180
weighted avg       0.85      0.85      0.85       180



In [44]:
from sklearn.tree import DecisionTreeClassifier

# Standalone decision tree as a reference point for the ensembles.
# Seeded (same seed as the train/test split) so the reported metrics are
# reproducible when the notebook is re-run from a fresh kernel.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
class_report_dt = classification_report(y_pred=y_pred_dt, y_true=y_test)
print(class_report_dt)

              precision    recall  f1-score   support

       Besni       0.75      0.81      0.78        86
     Kecimen       0.82      0.76      0.78        94

    accuracy                           0.78       180
   macro avg       0.78      0.78      0.78       180
weighted avg       0.79      0.78      0.78       180



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Depth- and leaf-constrained random forest. random_state is fixed (same seed as
# the train/test split) because the forest is built with randomness, and without
# a seed the report and train score below change from run to run.
rf_model = RandomForestClassifier(
    max_depth=7,
    min_samples_leaf=7,
    min_samples_split=10,
    n_estimators=20,
    class_weight='balanced',
    random_state=42,
)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
class_report_rf = classification_report(y_pred=y_pred_rf, y_true=y_test)
print(class_report_rf)

# Training-set score, printed next to the test report to gauge overfitting.
train_tf = rf_model.score(X_train,y_train)
print("train score: ", train_tf)


              precision    recall  f1-score   support

       Besni       0.85      0.87      0.86        86
     Kecimen       0.88      0.86      0.87        94

    accuracy                           0.87       180
   macro avg       0.87      0.87      0.87       180
weighted avg       0.87      0.87      0.87       180

train score:  0.9166666666666666
