**Importing the necessary libraries**

In [165]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import QuantileTransformer, StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score
import seaborn as sns

**Import and see general statistics for csv file**

In [166]:
data = pd.read_csv("Raisin_Dataset.csv")

In [167]:
data.head

<bound method NDFrame.head of       Area  MajorAxisLength  MinorAxisLength  Eccentricity  ConvexArea  \
0    87524       442.246011       253.291155      0.819738       90546   
1    75166       406.690687       243.032436      0.801805       78789   
2    90856       442.267048       266.328318      0.798354       93717   
3    45928       286.540559       208.760042      0.684989       47336   
4    79408       352.190770       290.827533      0.564011       81463   
..     ...              ...              ...           ...         ...   
895  83248       430.077308       247.838695      0.817263       85839   
896  87350       440.735698       259.293149      0.808629       90899   
897  99657       431.706981       298.837323      0.721684      106264   
898  93523       476.344094       254.176054      0.845739       97653   
899  85609       512.081774       215.271976      0.907345       89197   

       Extent  Perimeter    Class  
0    0.758651   1184.040  Kecimen  
1    0.68

**Checks that there all values are present**

In [193]:
data.isnull().sum()

Area               0
MajorAxisLength    0
MinorAxisLength    0
Eccentricity       0
ConvexArea         0
Extent             0
Perimeter          0
Class              0
dtype: int64

**Assigns columns to X(features) and y(target)**

In [168]:
X = data.drop("Class", axis=1)
# X = data[["Area", "Eccentricity", "Extent"]]
y = data["Class"] == "Kecimen"



**Splits the dataset into training and testing data with 30% being for testing**

In [169]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9, stratify=y)

In [170]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train)

**Uses a label encoder to transform the two target strings into binary**

In [171]:
label_encoder = LabelEncoder()
data['Class'] = label_encoder.fit_transform(data['Class'])


**Fits the min max scaler to the training data**

In [172]:
X_train

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter
97,59282,329.242566,233.774649,0.704164,60580,0.725926,920.573
600,89721,530.156574,223.499933,0.906794,95252,0.547433,1295.377
409,104921,452.863013,297.024187,0.754864,108211,0.726298,1254.861
248,62064,352.368670,227.864144,0.762775,64811,0.650566,1004.245
277,64875,393.214664,212.954824,0.840653,67701,0.770524,1014.789
...,...,...,...,...,...,...,...
776,165940,624.844959,340.695631,0.838275,170781,0.779464,1641.140
494,83555,457.546472,235.099871,0.857894,86694,0.711227,1159.779
30,88745,429.770355,265.690236,0.786009,90715,0.752064,1162.877
537,82853,430.114997,251.175700,0.811773,85292,0.746437,1139.840


In [173]:
X_train_scaled = MinMaxScaler().fit_transform(X_train)
X_train_scaled

array([[0.15554029, 0.16532336, 0.30346344, ..., 0.15190084, 0.71272311,
        0.15006015],
       [0.3017839 , 0.49113187, 0.26884352, ..., 0.31662438, 0.24456485,
        0.38270247],
       [0.37481202, 0.36579018, 0.51657822, ..., 0.37819142, 0.71369931,
        0.35755403],
       ...,
       [0.29709473, 0.3283424 , 0.41100072, ..., 0.29506951, 0.78127841,
        0.3004592 ],
       [0.26878672, 0.32890128, 0.36209503, ..., 0.26930532, 0.76652059,
        0.28616004],
       [0.65597509, 0.59589356, 0.69988017, ..., 0.65973034, 0.657014  ,
        0.58541321]])

Converts all the values into a value between 0 and 1 so that columns with significantly larger values do not dominate the decisions

**Fits a logistic regression machine learning model to the training data**

In [181]:
model_accuracy_results = []

In [188]:
data.Class.value_counts()

Class
1    450
0    450
Name: count, dtype: int64

In [189]:
log_reg = LogisticRegression(random_state=9, max_iter=10000)
log_reg.fit(X_train, y_train)
y_pred_val = log_reg.predict(X_val)
model_accuracy_results.append(accuracy_score(y_val, y_pred_val))


In [190]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred_val = knn.predict(X_val)
model_accuracy_results.append(accuracy_score(y_val, y_pred_val))


In [191]:
for score in model_accuracy_results:
    print(score)

0.8541666666666666
0.8541666666666666
0.8541666666666666
0.8541666666666666


**Tests the prediction of the model on test data**

**Creates a confusion matrix**

In [179]:
cnf_matrix = confusion_matrix(y_test, y_pred)

ValueError: Found input variables with inconsistent numbers of samples: [180, 144]

In [117]:
cnf_matrix


array([[80, 16],
       [17, 67]], dtype=int64)

The top left(True positive) and bottom right(True negative) have high values which shows the model is relatively accurate with predictions

In [118]:
accuracy_score(y_test, y_pred)

0.8166666666666667

In [119]:
f1_test_score = f1_score(y_test, y_pred)
f1_test_score

0.8023952095808383

In [120]:
test_record = {
    "Area": 75000,
    # "MajorAxisLength": 600,
    # "MinorAxisLength": 200,
    "Eccentricity": 0.7,
    # "ConvexArea": 200000,
    "Extent": 0.7,
    # "Perimeter": 1000}
}

test_record_df = pd.DataFrame([test_record])
test_prediction = log_reg.predict(test_record_df)

test_prediction

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- ConvexArea
- MajorAxisLength
- MinorAxisLength
- Perimeter
