# Build Classification Models
1. Use OneR model as a baseline
2. Use LogistricRegression

In [91]:
import pandas as pd
df = pd.read_csv("../data/cleaned_cuisines_jiheng.csv")
y = df[["cuisine"]]
X = df.drop(["cuisine"], axis=1)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3995 entries, 0 to 3994
Columns: 284 entries, almond to zucchini
dtypes: int64(284)
memory usage: 8.7 MB


In [92]:
# label y
from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()
y["cuisine"] = labelEncoder.fit_transform(y)

  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [93]:
labels_to_name = dict(zip([0,1,2,3,4],labelEncoder.inverse_transform([0,1,2,3,4])))
name_to_label = dict(zip(labelEncoder.inverse_transform([0,1,2,3,4]), [0,1,2,3,4]))

In [94]:
from sklearn.model_selection import train_test_split
X_train = pd.DataFrame()
y_train = pd.DataFrame()
X_test = pd.DataFrame()
y_test = pd.DataFrame()
for c in y["cuisine"].unique():
    index = y.cuisine == c
    X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X.loc[index], y.loc[index], test_size=0.2)
    X_train = pd.concat([X_train, X_train_temp])
    X_test = pd.concat([X_test, X_test_temp])
    y_train = pd.concat([y_train, y_train_temp])
    y_test = pd.concat([y_test, y_test_temp])

## Build OneR model
1. Given a column, for each value (0 or 1), find the most frequent cuisine, that's the prediction.
2. Calculate the accurcy for the prediction.
3. Find the column that produces the highest accurracy

In [95]:
def predict_one_r(X_train, y_train, col, X_test):
    one_pred = y_train.loc[X_train[col] == 0].mode()
    zero_pred = y_train.loc[X_train[col] == 1].mode()
    if (one_pred.shape[0] == 0):
        one_pred = 0
    else:
        one_pred = one_pred.iloc[0]["cuisine"]

    if (zero_pred.shape[0] == 0):
        zero_pred = 0
    else:
        zero_pred = zero_pred.iloc[0]["cuisine"]

    return X_test[col].map(lambda val: one_pred if val == 1 else zero_pred)

In [96]:

from sklearn.metrics import accuracy_score, f1_score
selected_column = None
accuracy = 0
y_pred = None
for col in X_train.columns:
    one_pred = y_train.loc[X_train[col] == 0].mode()
    zero_pred = y_train.loc[X_train[col] == 1].mode()
    if (one_pred.shape[0] == 0):
        continue
    else:
        one_pred = one_pred.iloc[0]["cuisine"]

    if (zero_pred.shape[0] == 0):
        continue
    else:
        zero_pred = zero_pred.iloc[0]["cuisine"]

    y_pred_col = X_train[col].map(lambda val: one_pred if val == 1 else zero_pred)
    score = accuracy_score(y_train, y_pred_col)
    if (score > accuracy):
        selected_column = col
        accuracy = score
        y_pred = y_pred_col

print(f"oneR model: column {selected_column}, accuracy: {accuracy}, f1_score: {f1_score(y_train, y_pred, average='weighted')}")
y_test_pred = predict_one_r(X_train, y_train, selected_column, X_test)


oneR model: column anise, accuracy: 0.19968701095461658, f1_score: 0.06657970258283329


In [97]:
from sklearn.metrics import classification_report
y_test_pred = predict_one_r(X_train, y_train, selected_column, X_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       160
           1       0.00      0.00      0.00       160
           2       0.00      0.00      0.00       160
           3       0.00      0.00      0.00       160
           4       0.20      1.00      0.33       160

    accuracy                           0.20       800
   macro avg       0.04      0.20      0.07       800
weighted avg       0.04      0.20      0.07       800



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [98]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(multi_class='ovr')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

           0       0.76      0.78      0.77       160
           1       0.90      0.92      0.91       160
           2       0.76      0.77      0.77       160
           3       0.85      0.79      0.82       160
           4       0.79      0.81      0.80       160

    accuracy                           0.81       800
   macro avg       0.81      0.81      0.81       800
weighted avg       0.81      0.81      0.81       800



In [99]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

  This is separate from the ipykernel package so we can avoid doing imports until


              precision    recall  f1-score   support

           0       0.83      0.83      0.83       160
           1       0.89      0.90      0.89       160
           2       0.87      0.78      0.83       160
           3       0.87      0.81      0.84       160
           4       0.79      0.91      0.85       160

    accuracy                           0.85       800
   macro avg       0.85      0.85      0.85       800
weighted avg       0.85      0.85      0.85       800

