In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Folder holding the CSV inputs. The trailing slash matters: file paths in
# this notebook are built by plain string concatenation (DNAME + "file.csv").
DNAME = "FoodData_Central_csv_2020-04-29/"

In [2]:
# Training table: feature columns plus the "is_meat" target, keyed by fdc_id.
train = pd.read_csv(DNAME + "train_reduced.csv").set_index("fdc_id")
y_train = train["is_meat"]
x_train = train.drop("is_meat", axis=1)

# Held-out features and labels, keyed by fdc_id as well so rows can be
# matched by id.
x_test = pd.read_csv(DNAME + "x_test.csv").set_index("fdc_id")
y_test = pd.read_csv(DNAME + "y_test_meat_target.csv").set_index("fdc_id")

# Food metadata (description, category columns) used by the error-analysis
# cells. It is loaded together with the other inputs so that every cell that
# looks up food_merge works when the notebook is run top to bottom on a fresh
# kernel.
food_merge = pd.read_csv(DNAME + "food_merge.csv").set_index("fdc_id")

In [3]:
# Restrict the test features to the training columns, in training order, so
# the model is fed the same layout it was fitted on.
x_test = x_test.loc[:, x_train.columns]
x_test.shape

(34308, 183)

In [4]:
# Hyper-parameter grid handed to the grid search: tree depth, number of trees
# and the ccp_alpha pruning parameter.
params = dict(
    max_depth=range(3, 23, 5),
    n_estimators=[100, 300, 500],
    ccp_alpha=[0, .001],
)

In [5]:
# Random forest tuned by an exhaustive grid search over `params`.
# Candidates are ranked by recall using 10-fold cross-validation, with the
# fits spread over all available cores (n_jobs=-1).
# A fixed random_state pins the forest's randomness so the search result is
# reproducible from one run to the next.
rfc = RandomForestClassifier(random_state=0)
gs = GridSearchCV(
    estimator=rfc,
    param_grid=params,
    scoring="recall",
    cv=10,
    verbose=1,
    n_jobs=-1,
)



In [None]:
# Run the cross-validated search; the target is handed over as a flat 1-D array.
gs.fit(x_train, y_train.to_numpy().ravel())

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min


In [None]:
# Show the winning model: the estimator carrying the best-scoring parameter combination.
gs.best_estimator_

In [None]:
# Display the fitted search object itself.
gs

In [None]:
# Predict on the held-out set and report recall.
# The ground truth is passed as the "is_meat" column explicitly, so this cell
# keeps working when it is re-run after y_test has gained its "prediction"
# column further down the notebook.
gs_pred = gs.predict(x_test)
gs_recall = metrics.recall_score(y_test["is_meat"], gs_pred)
gs_recall

In [None]:
# Headline test-set metrics for the tuned model, always computed against the
# "is_meat" label column.
gs_f1 = metrics.f1_score(y_test["is_meat"], gs_pred)
gs_acc = metrics.accuracy_score(y_test["is_meat"], gs_pred)
gs_recall = metrics.recall_score(y_test["is_meat"], gs_pred)
gs_prec = metrics.precision_score(y_test["is_meat"], gs_pred)

# Values are printed in the order the label names them (f1, recall, acc, prec);
# previously accuracy and recall were swapped relative to the label.
print('f1,recall,acc,prec score: ', gs_f1, gs_recall, gs_acc, gs_prec)

In [None]:
# Raw predictions returned by gs.predict(x_test).
gs_pred

In [None]:
# Store the predictions next to the true labels, then keep only the rows
# where the two disagree.
y_test["prediction"] = gs_pred
errors = y_test.loc[y_test["is_meat"].ne(y_test["prediction"])]

In [None]:
pd.set_option('display.min_rows', 10)
# Put each misclassified food's description beside its true/predicted labels.
full_error = pd.concat(
    [errors, food_merge.loc[errors.index, "description"]],
    axis=1,
)
# Errors whose true label is 1, i.e. the false negatives.
full_error.loc[full_error["is_meat"].eq(1)]

In [None]:
pd.set_option('display.min_rows', 20)
# A few hand-picked errors whose true label is 0 (false positives).
full_error.loc[full_error["is_meat"].eq(0)].iloc[[0, 2, 4, 7]].reset_index()

In [None]:
# The false positives look like egg-based foods plus a couple of other high-protein items.
# Many of the false negatives, on the other hand, look like true negatives that were
# mislabeled when the labels were built - e.g. "bollywood burger" is a vegetarian burger,
# and there is vegan bacon. Some real false negatives remain, but there are even fewer
# of them than the stats indicate.

In [None]:
# A few hand-picked errors whose true label is 1 (false negatives).
full_error.loc[full_error["is_meat"].eq(1)].iloc[[0, 2, 3, 4]].reset_index()

In [None]:
# Ids of every food whose food_category_id is 1.
egg_index = food_merge.loc[food_merge["food_category_id"].eq(1)].index
egg_index

In [None]:
# Category id 1 is the egg-and-dairy group, so it selects those foods by id.
egg_index = food_merge[food_merge["food_category_id"] == 1].index
# Keep only the ids that are part of the test set. Index.isin is a vectorised
# membership test that preserves egg_index order, replacing the per-element
# Python loop.
egg_test = egg_index[egg_index.isin(x_test.index)]
gs_pred_egg = gs.predict(x_test.loc[egg_test])
# Score against the "is_meat" column only: by this point y_test also carries a
# "prediction" column, and a two-column ground truth makes the metric
# functions raise instead of returning a score.
display(metrics.accuracy_score(y_test.loc[egg_test, "is_meat"], gs_pred_egg))
display(metrics.recall_score(y_test.loc[egg_test, "is_meat"], gs_pred_egg))

In [None]:
# All of the errors on egg/dairy products are false positives - obvious in hindsight.
# At least accuracy on this category is still pretty high.

In [None]:
"""test_egg=y_test.loc[egg_test].astype(int)
display(a.shape)
egg_prediction=pd.DataFrame(gs_pred_egg)
egg_prediction.index=test_egg.index
pd.concat([a, b],axis=1)"""

In [None]:
# Split the misclassified ids by true label:
# label 1 -> false negatives, label 0 -> false positives.
false_negatives_i = full_error.loc[full_error["is_meat"].eq(1)].index
false_positives_i = full_error.loc[full_error["is_meat"].eq(0)].index

In [None]:
# Food metadata table read from food_merge.csv under DNAME, indexed by fdc_id.
food_merge = pd.read_csv(DNAME+"food_merge.csv").set_index("fdc_id")

In [None]:
# Display settings for the truncated frame shown at the end of this cell.
pd.set_option('display.min_rows', 15)
pd.set_option('display.max_rows', 20)
# Full metadata rows for each kind of error, looked up by id.
false_negatives, false_positives = (
    food_merge.loc[ids] for ids in (false_negatives_i, false_positives_i)
)
false_negatives

In [None]:
# Per branded category: number of test-set foods with a non-null data_type.
total_label_cnt = (
    food_merge.loc[x_test.index]
    .groupby("branded_food_category")["data_type"]
    .count()
)
total_label_cnt

In [None]:
# False-negative counts per branded category, joined with the category's
# total count in the test set.
false_neg_grp = false_negatives.groupby("branded_food_category").count()
falsen_percent = false_neg_grp.merge(total_label_cnt, left_index=True, right_index=True, how="left")
# After the merge, data_type_x is the false-negative count and data_type_y is
# the category total coming from total_label_cnt.
falsen_percent["per_fneg"] = falsen_percent["data_type_x"] / falsen_percent["data_type_y"]
# Select data_type_y for the total: picking data_type_x twice made "total_cnt"
# a copy of the false-negative count.
falsen_percent = falsen_percent[["per_fneg", "data_type_x", "data_type_y"]]
falsen_percent.columns = ["per_fneg", "fneg_cnt", "total_cnt"]

In [None]:
# False-positive counts per branded category, joined with the category's
# total count in the test set and turned into a rate.
false_pos_grp = false_positives.groupby("branded_food_category").count()
falsep_percent = (
    false_pos_grp
    .merge(total_label_cnt, left_index=True, right_index=True, how="left")
    .assign(per_fpos=lambda df: df["data_type_x"] / df["data_type_y"])
    .loc[:, ["per_fpos", "data_type_x", "data_type_y"]]
    .rename(columns={"data_type_x": "fpos_cnt", "data_type_y": "total_cnt"})
)

In [None]:
# Per-category false-negative rate, false-negative count and total count.
falsen_percent

In [None]:
# Per-category false-positive rate, false-positive count and total count.
falsep_percent

In [None]:
# All the errors are branded foods, so grouping by branded category is enough.
# One horizontal bar per category: its false-negative rate.
plt.barh(false_neg_grp.index, falsen_percent["per_fneg"])
plt.title("Percent of each category that is false negative")
plt.xlabel("Percent false negative")
# bbox_inches="tight" fits the saved canvas around everything that was drawn,
# so the category labels on the left edge are no longer cut off in the file.
plt.savefig("figs/"+"fneg_percent", bbox_inches="tight")

In [None]:
# All the errors are branded foods, so grouping by branded category is enough.
# One horizontal bar per category: its false-positive rate.
plt.barh(false_pos_grp.index, falsep_percent["per_fpos"])
plt.title("Percent of each category that is false positive")
plt.xlabel("Percent false positive")
# bbox_inches="tight" fits the saved canvas around everything that was drawn,
# so category labels on the left edge are not cut off in the saved file.
plt.savefig("figs/"+"fpos_percent", bbox_inches="tight")