In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import RidgeClassifier

In [None]:
# Load the raw sheet. It carries two header rows: the real CSV header holds the
# 'umbrella' groups (like color and shape) and the first data row holds the
# per-feature column names.
df = pd.read_csv("manual classification network.csv")

# Capture the labels while the original header is still in place, and discard
# every row that has no label.
new_df = df[["Label"]].rename(columns={"Label": "label"}).dropna()

# Promote the first data ROW to be the header, then remove that row.
# The explicit copy avoids modifying a slice of the raw frame further down.
new_colnames = df.iloc[0]
df = df.iloc[1:].copy()
df.columns = new_colnames

# replace all nans with 0s (reassign instead of mutating in place)
df = df.fillna(0)

# Put labels and features together, aligned on the row index. The first column
# of `df` (presumably the old label column, now renamed by the sub-header row)
# is dropped. join="inner" keeps only rows that still have a label; the default
# outer join would bring back the unlabeled rows removed above, with a NaN label.
new_df = pd.concat(
    [new_df, df.drop(columns=[df.columns[0]])],
    axis=1,
    join="inner",
)

# rename the columns for consistency
new_df.columns = [x.lower() for x in new_df.columns]

# standardize the shape numbers using min-max scaling
# NOTE(review): this scaler is fit on every row, i.e. before the train/test
# split performed later in the notebook, so held-out rows influence the ranges.
scaler = MinMaxScaler()
new_df[["length", "width", "height"]] = scaler.fit_transform(
    new_df[["length", "width", "height"]]
)

# drop unnecessary columns
# (presumably "dull"/"rough" are the complements of "shiny"/"smooth" -- TODO confirm)
new_df = new_df.drop(columns=["dull", "rough"])
new_df = new_df.rename(columns={"shiny": "is_shiny", "smooth": "is_smooth"})

# Show a preview rather than dumping the whole frame.
new_df.head()

In [3]:
# separate the data into labels and features
y = new_df["label"]
x = new_df.drop(["label"], axis=1)

# split the data (30% test 70% train); the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=67,
)

# Fit the min-max scaling of the shape measurements on the TRAINING rows only,
# then apply it to both splits, so the held-out rows do not influence the
# preprocessing (data leakage). Min-max scaling is unaffected by an earlier
# positive affine rescaling of the same column, so this gives the same result
# as scaling the raw measurements with training-set ranges, even though the
# columns were already scaled over the full dataset when `new_df` was built.
shape_cols = ["length", "width", "height"]
train_scaler = MinMaxScaler().fit(X_train[shape_cols])
X_train = X_train.copy()  # work on copies, not on views of `new_df`
X_test = X_test.copy()
X_train[shape_cols] = train_scaler.transform(X_train[shape_cols])
X_test[shape_cols] = train_scaler.transform(X_test[shape_cols])  # may fall outside [0, 1]

# hyperparameters for the ridge classifier
alpha = 1.0
max_iter = 1000
solver = 'auto'
tol = 1e-3

# should probably use grid search to fine tune hyperparams
ridge_classifier = RidgeClassifier(
    alpha=alpha, max_iter=max_iter, solver=solver, tol=tol)

ridge_classifier.fit(X_train, y_train)

In [4]:
# Predict labels for the held-out rows with the fitted classifier.
y_pred = ridge_classifier.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

1.0


In [5]:
# Per-class precision / recall / F1 for the held-out predictions.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classifiation Report:
              precision    recall  f1-score   support

       Apple       1.00      1.00      1.00         3
      Orange       1.00      1.00      1.00         2

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5

