In [None]:
<h1 style="background-color: #FAF9F6; font-family: 'Times New Roman', serif; font-size: 250%; color: darkblue; border: 12px solid #A9A9A9; border-radius: 20px; padding: 15px; text-align: center;">
    <b>Raisin Class Prediction</b>
</h1>

Data Set Information:

Images of Kecimen and Besni raisin varieties grown in Turkey were obtained with a computer vision system (CVS). A total of 900 raisin grains were used, 450 from each variety. These images were subjected to various stages of pre-processing, and 7 morphological features were extracted. These features have been classified using three different artificial intelligence techniques.


Attribute Information:

1. Area: Gives the number of pixels within the boundaries of the raisin.
2. Perimeter: Measures the circumference of the raisin by calculating the distance between its boundary and the pixels around it.
3. MajorAxisLength: Gives the length of the main axis, which is the longest line that can be drawn on the raisin.
4. MinorAxisLength: Gives the length of the small axis, which is the shortest line that can be drawn on the raisin.
5. Eccentricity: Gives the eccentricity of the ellipse that has the same moments as the raisin.
6. ConvexArea: Gives the number of pixels of the smallest convex shell of the region formed by the raisin.
7. Extent: Gives the ratio of the region formed by the raisin to the total pixels in the bounding box.
8. Class: The raisin variety, either Kecimen or Besni.

<div class="alert alert-warning alert-info" style="background-color: #FAF9F6; border: 5px solid #A9A9A9; border-radius: 10px;">
    <h3 style="font-weight: bold; color: darkblue;">Import Libraries</h3>
</div>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px

# Notebook-wide display configuration.
plt.rcParams["figure.figsize"] = (9, 6)  # default figure size, in inches

import warnings

# Silences every warning for the rest of the session, including scikit-learn
# convergence / failed-fit warnings, so such problems must be checked explicitly.
warnings.filterwarnings("ignore")

# Render floats with three decimals in DataFrame / Series output.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

<div class="alert alert-warning alert-info" style="background-color: #FAF9F6; border: 5px solid #A9A9A9; border-radius: 10px;">
    <h3 style="font-weight: bold; color: darkblue;">EDA and Visualization</h3>
</div>

In [None]:
# Load the raisin measurements from the Excel workbook next to the notebook.
df = pd.read_excel('Raisin_Dataset.xlsx')

# Preview only the first rows instead of echoing the whole frame.
df.head()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics, transposed so that each feature is one row.
df.describe().T

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Per-column missing-value summary: absolute count and share of rows in percent.
null_mask = df.isnull()
missing_count = null_mask.sum()
value_count = null_mask.count()  # counts every cell of the boolean mask, i.e. rows per column
missing_percentage = (missing_count / value_count * 100).round(2)
missing_df = pd.DataFrame({"count": missing_count, "percentage": missing_percentage})
missing_df

In [None]:
# Normalise all column names to lower case (modifies df in place).
df.rename(columns=str.lower, inplace=True)

In [None]:
# Check the column names after the lower-casing step.
df.columns

In [None]:
# Number of samples per value of the target column.
df["class"].value_counts()

In [None]:
# Encode the target as integers: Kecimen -> 0, Besni -> 1.
# The guard makes the cell safe to re-run: mapping an already-encoded column
# a second time would replace every label with NaN.
class_codes = {"Kecimen": 0, "Besni": 1}
if not pd.api.types.is_numeric_dtype(df["class"]):
    df["class"] = df["class"].map(class_codes)
df["class"].value_counts()

In [None]:
# Bar chart of the number of samples in each class.
sns.countplot(data=df, x="class");

In [None]:
# Pairwise feature relationships, coloured by class.
sns.pairplot(data=df, hue="class");

In [None]:
# Annotated correlation matrix of the columns of df.
sns.heatmap(df.corr(), annot=True);

In [None]:
# Correlation of each feature with the target, drawn as horizontal bars in ascending order.
target_corr = df.corr()["class"].drop("class")
target_corr.sort_values().plot.barh();

In [None]:
# One box plot per numeric column, laid out on a grid sized from the data.
numeric_df = df.select_dtypes(include=[np.number])

# Number of panels = number of numeric COLUMNS (len(numeric_df) would be the row count).
num_plots = numeric_df.shape[1]
n_cols = 3
n_rows = max(1, -(-num_plots // n_cols))  # ceiling division; at least one row

# squeeze=False always returns a 2-D array of axes, so flatten() is safe for any grid size.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)
axes = axes.flatten()  # flatten for easy iteration

for ax, col in zip(axes, numeric_df.columns):
    sns.boxplot(x=numeric_df[col], ax=ax)
    ax.set_title(f"{col} için Boxplot")

# Remove the unused panels at the end of the grid.
for ax in axes[num_plots:]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()

In [None]:
# List the column names again for reference.
df.columns

In [None]:
# Distribution of each feature split by class: one figure per feature.
# Replaces seven copy-pasted cells that differed only in the `y` column.
class_boxplot_features = [
    "area",
    "majoraxislength",
    "minoraxislength",
    "eccentricity",
    "convexarea",
    "extent",
    "perimeter",
]

for feature in class_boxplot_features:
    fig, ax = plt.subplots()
    sns.boxplot(data=df, x="class", y=feature, ax=ax)
    ax.set_title(f"{feature} by class")
    plt.show()

<div class="alert alert-warning alert-info" style="background-color: #FAF9F6; border: 5px solid #A9A9A9; border-radius: 10px;">
    <h3 style="font-weight: bold; color: darkblue;">Train | Test Split and Scaling</h3>
</div>

In [None]:
# Separate the target vector from the feature matrix.
y = df["class"]
X = df.drop(columns="class")

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 15% of the rows for testing. stratify=y keeps the class proportions
# identical in both splits instead of leaving them to chance; random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

In [None]:
# Report the shape of every split.
for label, part in [
    ("Train features shape : ", X_train),
    ("Train target shape   : ", y_train),
    ("Test features shape  : ", X_test),
    ("Test target shape    : ", y_test),
]:
    print(label, part.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# Feature standardizer; created unfitted here.
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the test set stays unseen.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

<div class="alert alert-warning alert-info" style="background-color: #FAF9F6; border: 5px solid #A9A9A9; border-radius: 10px;">
    <h3 style="font-weight: bold; color: darkblue;">Logistic Regression</h3>
</div>

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline classifier with scikit-learn's default hyperparameters.
model = LogisticRegression()

In [None]:
# Train on the scaled training features.
model.fit(X_train_scaled, y_train)

In [None]:
# Hard class predictions and per-class probabilities for the scaled test set.
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)

In [None]:
# Unscaled test features next to the true label, the predicted label and the
# predicted probability taken from the second predict_proba column.
test_data = X_test.assign(
    **{
        "class": y_test,
        "pred": y_pred,
        "pred_proba": y_pred_proba[:, 1],
    }
)
test_data.sample(10)

<div class="alert alert-warning alert-info" style="background-color: #FAF9F6; border: 5px solid #A9A9A9; border-radius: 10px;">
    <h3 style="font-weight: bold; color: darkblue;">Model Performance</h3>
</div>

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report

In [None]:
def eval_metric(model, X_train, y_train, X_test, y_test):
    """Print the confusion matrix and classification report for both splits.

    Parameters
    ----------
    model : object exposing ``predict(X)``
        Already-fitted estimator.
    X_train, y_train : training features and true labels.
    X_test, y_test : test features and true labels.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Both predictions are computed before anything is printed.
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    sections = [
        ("Train_Set", y_train, train_pred),
        ("Test_Set", y_test, test_pred),
    ]
    for position, (heading, y_true, y_hat) in enumerate(sections):
        if position:
            # Separator between the train and test sections.
            print("-----------------------------------------------------")
        print(heading)
        print(confusion_matrix(y_true, y_hat))
        print(classification_report(y_true, y_hat))

In [None]:
# Train vs. test report for the current model, using the scaled features.
eval_metric(model, X_train_scaled, y_train, X_test_scaled, y_test)

<div class="alert alert-warning alert-info" style="background-color: #FAF9F6; border: 5px solid #A9A9A9; border-radius: 10px;">
    <h3 style="font-weight: bold; color: darkblue;">Cross-Validation (CV)</h3>
</div>

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
# k-fold cross-validation of a default logistic regression on the training split.
# NOTE(review): X_train_scaled comes from a scaler fitted on the whole training
# split, so every validation fold has already influenced the scaling statistics.
# Wrapping scaler + model in a Pipeline would make the folds fully independent.
n_folds = 10  # single source of truth for the fold count and the row labels below

model = LogisticRegression()

scores = cross_validate(
    model,
    X_train_scaled,
    y_train,
    scoring=["precision", "recall", "f1", "accuracy"],
    cv=n_folds,
)

# One row per fold, labelled 1..n_folds.
df_scores = pd.DataFrame(scores, index=range(1, n_folds + 1))

In [None]:
# Mean of every column across folds, skipping the first two entries by position.
df_scores.mean().iloc[2:]

<div class="alert alert-warning alert-info" style="background-color: #FAF9F6; border: 5px solid #A9A9A9; border-radius: 10px;">
    <h3 style="font-weight: bold; color: darkblue;">GridSearchCV</h3>
</div>

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
model = LogisticRegression()

penalty = ["l1", "l2"]
C = np.logspace(-1, 5, 10)  # log-spaced values are recommended for the regularisation coefficient
class_weight = ["balanced", None]
solver = ["lbfgs", "liblinear", "sag", "saga"]

# "lbfgs" and "sag" support only the L2 penalty. The full cross-product of
# penalty x solver therefore contained combinations that cannot be fitted and
# only produced failed fits. Use one sub-grid per penalty with compatible solvers.
l1_solvers = ["liblinear", "saga"]

param_grid = [
    {"penalty": ["l2"],
     "C": C,
     "class_weight": class_weight,
     "solver": solver},
    {"penalty": ["l1"],
     "C": C,
     "class_weight": class_weight,
     "solver": l1_solvers},
]

# 10-fold search that selects the combination with the best recall, using all CPU cores.
grid_model = GridSearchCV(estimator = model,
                          param_grid = param_grid,
                          cv = 10,
                          scoring = "recall", n_jobs = -1)

In [None]:
# Run the grid search on the scaled training data.
grid_model.fit(X_train_scaled, y_train)

In [None]:
# Hyperparameter combination that scored best on the search's recall metric.
grid_model.best_params_

In [None]:
# Train vs. test report for the tuned model, using the scaled features.
eval_metric(grid_model, X_train_scaled, y_train, X_test_scaled, y_test)

In [None]:
# Confusion matrix on the test split. The model was fitted on scaled features,
# so it must be evaluated on X_test_scaled, not on the raw X_test.
ConfusionMatrixDisplay.from_estimator(grid_model, X_test_scaled, y_test);

In [None]:
# Confusion matrix on the training split. The model was fitted on scaled
# features, so it must be evaluated on X_train_scaled, not on the raw X_train.
ConfusionMatrixDisplay.from_estimator(grid_model, X_train_scaled, y_train);

<div class="alert alert-warning alert-info" style="background-color: #FAF9F6; border: 5px solid #A9A9A9; border-radius: 10px;">
    <h3 style="font-weight: bold; color: darkblue;">ROC (Receiver Operating Characteristic) Curve and AUC (Area Under the Curve)</h3>
</div>

In [None]:
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay, precision_recall_curve, roc_auc_score, auc, roc_curve, average_precision_score

In [None]:
# ROC curve of the tuned model on the scaled test set.
RocCurveDisplay.from_estimator(grid_model, X_test_scaled, y_test);

In [None]:
# Precision-recall curve of the tuned model on the scaled test set.
PrecisionRecallDisplay.from_estimator(grid_model, X_test_scaled, y_test);

<div class="alert alert-warning alert-info" style="background-color: #FAF9F6; border: 5px solid #A9A9A9; border-radius: 10px;">
    <h3 style="font-weight: bold; color: darkblue;">Final Model</h3>
</div>

In [None]:
import pickle

# Final artefacts are trained on ALL available data.
# A dedicated scaler is fitted here so that (a) the scaler written to disk is
# exactly the one whose statistics the final model was trained with, and
# (b) the train-only `scaler` used for evaluation above is not refitted.
final_scaler = StandardScaler().fit(X)
X_scaled = final_scaler.transform(X)

final_model = LogisticRegression().fit(X_scaled, y)

# Persist both objects; `with` guarantees the files are flushed and closed.
with open("scaler_hearing", "wb") as scaler_file:
    pickle.dump(final_scaler, scaler_file)

with open("final_model_hearing", "wb") as model_file:
    pickle.dump(final_model, model_file)