<a href="https://colab.research.google.com/github/Kennismoles247/AI-Techcrush/blob/main/Raisin_Variety_Classification_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib



In [7]:
# Load the raisin dataset from the Excel workbook at the absolute path below.
df = pd.read_excel(r"/Raisin_Dataset.xlsx")

print(df.shape)      # (rows, columns), e.g., (900, 8)
print(df.head())     # first 5 rows
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted an extra "None" line.
df.info()            # column names, non-null counts and dtypes
print(df['Class'].value_counts())  # number of rows per value of 'Class'



(900, 8)
    Area  MajorAxisLength  MinorAxisLength  Eccentricity  ConvexArea  \
0  87524       442.246011       253.291155      0.819738       90546   
1  75166       406.690687       243.032436      0.801805       78789   
2  90856       442.267048       266.328318      0.798354       93717   
3  45928       286.540559       208.760042      0.684989       47336   
4  79408       352.190770       290.827533      0.564011       81463   

     Extent  Perimeter    Class  
0  0.758651   1184.040  Kecimen  
1  0.684130   1121.786  Kecimen  
2  0.637613   1208.575  Kecimen  
3  0.699599    844.162  Kecimen  
4  0.792772   1073.251  Kecimen  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area             900 non-null    int64  
 1   MajorAxisLength  900 non-null    float64
 2   MinorAxisLength  900 non-null    float64
 3   Eccentricity   

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# ---------------------------------------------------------------------------
# Load the data and separate features from the label
# ---------------------------------------------------------------------------
df = pd.read_excel("/Raisin_Dataset.xlsx")

y = df['Class']                  # label column (string class names)
X = df.drop(columns=['Class'])   # every remaining column is used as a feature

# Turn the string labels into integer codes. The fitted encoder is kept in
# `le` so its `classes_` can name the rows of the classification report.
le = LabelEncoder()
y_enc = le.fit_transform(y)


# ---------------------------------------------------------------------------
# Hold out 20% of the rows as a test set, preserving the class proportions
# ---------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    stratify=y_enc,
    test_size=0.2,
    random_state=42,
)


# ---------------------------------------------------------------------------
# Model: standardise the features, then logistic regression
# ---------------------------------------------------------------------------
# The scaler is a pipeline step, so the grid search fits it together with the
# classifier on the training portion of each fold.
scaler_step = ('scaler', StandardScaler())
clf_step = (
    'clf',
    LogisticRegression(solver='saga', max_iter=10000, random_state=42),
)
pipe = Pipeline(steps=[scaler_step, clf_step])

# Shuffled, stratified 5-fold cross-validation with a fixed seed.
skf = StratifiedKFold(5, shuffle=True, random_state=42)

# 2 penalties x 6 values of C x 2 class-weight settings = 24 candidates.
param_grid = dict(
    clf__penalty=['l2', 'l1'],
    clf__C=[0.001, 0.01, 0.1, 1, 10, 100],
    clf__class_weight=[None, 'balanced'],
)

gs = GridSearchCV(
    pipe,
    param_grid,
    scoring='accuracy',
    cv=skf,
    n_jobs=-1,
    verbose=2,
)
gs.fit(X_train, y_train)

print("Best parameters:", gs.best_params_)
print("Best CV accuracy:", gs.best_score_)


# ---------------------------------------------------------------------------
# Score the selected model on the held-out test set
# ---------------------------------------------------------------------------
best_model = gs.best_estimator_
y_pred = best_model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)
matrix = confusion_matrix(y_test, y_pred)

print("Test Accuracy:", acc)
print("\nClassification Report:\n", report)
print("Confusion Matrix:\n", matrix)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters: {'clf__C': 0.1, 'clf__class_weight': None, 'clf__penalty': 'l2'}
Best CV accuracy: 0.8680555555555556
Test Accuracy: 0.8777777777777778

Classification Report:
               precision    recall  f1-score   support

       Besni       0.93      0.82      0.87        90
     Kecimen       0.84      0.93      0.88        90

    accuracy                           0.88       180
   macro avg       0.88      0.88      0.88       180
weighted avg       0.88      0.88      0.88       180

Confusion Matrix:
 [[74 16]
 [ 6 84]]
