In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
import pickle

pd.options.display.max_columns = 150

In [3]:
root = Path().cwd().parent
data = root / 'data' / 'MO'
train_data = root / 'training' / 'data'

In [3]:
class_dict = pickle.load(open(train_data / 'class_dict_NEW2.pkl', 'rb'))
testdf = pd.read_csv(train_data / 'test_df.csv')
valdf = pd.read_csv(train_data / 'val_df.csv')
traindf = pd.read_csv(train_data / 'train_df.csv')

traindf['dataset'] = 'train'
valdf['dataset'] = 'val'
testdf['dataset'] = 'test'

df = pd.concat([traindf, valdf, testdf], axis=0).reset_index(drop=True)
df['ids'] = df['ids'].replace(r"b'", '', regex=True).replace(r"'", '', regex=True)

cd = {v:k for k, v in class_dict.items()}

In [10]:
df['correct'] = df['Species Pred'] == df['Species']
df['correct'] = df['correct'].astype(int)
group = df.groupby(['Species']).agg(total=('correct', 'count'), correct=('correct', 'sum'))

In [None]:
ddf = df.groupby("Species")['Species Pred'].value_counts()#.reset_index()
dfs = ddf.groupby("Species").std()
dfs.sort_values()

In [189]:
ddf['Agaricus californicus']

Species Pred
Agaricus californicus         146
Agaricus xanthodermus          69
Non-Diagnostic                 17
Agaricus campestris            10
Agaricus moelleri              10
Agaricus bitorquis              8
Agaricus hondensis              6
Coprinopsis atramentaria        5
Leucoagaricus leucothites       4
Agaricus arvensis               2
Agaricus augustus               2
Agaricus bernardi               2
Chlorophyllum molybdites        2
Chlorophyllum brunneum          2
Agaricus sylvicola              2
Psilocybe caerulescens          2
Agaricus placomyces             1
Amanita porphyria               1
Amanita bisporigera             1
Agrocybe praecox                1
Agaricus albolutescens          1
Volvopluteus gloiocephalus      1
Gyromitra esculenta             1
Lactarius argillaceifolius      1
Clitocybe glacialis             1
Panaeolus cinctulus             1
Stropharia coronilla            1
Pluteus petasatus               1
Leucoagaricus barssii           1
N

In [None]:
[("Agaricus bisporus", "Agaricus campestris", "Agaricus bitorquis"), ("Agaricus californicus", "Agaricus xanthoderma"), ("Auricularia americana", "Auricularia angiospermarum"),
 ("Cantherellus cascadensis", "Cantherellus formosus"), ("Cerioporus various", "Cerioporus leptocephalus"), ("Clavulina cinerea", "Clavulina corraloides"),
 ('Clitocybe tarda', 'Lepista sordida'), ('Coprinopsis atramentaria', 'Coprinopsis romagnesiana'), ("Craterellus cornucopioides", "Craterellus fallax", 'Craterellus calicornucopioides'),
 ("Exidia crenata", "Exidia recisa"), ('Ganoderma australe', 'Ganoderma applanatum'), ('Ganoderma megaloma', 'Ganoderma applanatum'), ("Ganoderma brownii", "Ganoderma applanatum"), 
 ("Ganoderma lobatum", "Ganoderma applanatum"), ("Gymnopilus aeruginosus", "Gymnopilus luteofolius"), ('Hygrocybe chlorophana', 'Hygrocybe flavescens'),
 ("Hydnellum scabrosum", "Sarcodon imbricatus"), ('Inonotus dryadeus', 'Pseudoinonotus dryadeus'),  ("Inosperma bongardii", "Entoloma sericeum", "Hypholoma lateritium", "Inocybe assimilata"),
 ("Laccaria bicolor", "Laccaria laccata"), ("Lactarius rufulus", "Lactarius rubidus"), ("Lactifluus glaucescens", "Lactifluus piperatus"), ("Lepista sordida", "Clitocybe nuda"), 
 ("Leratiomyces riparius", "Leratiomyces percevalii"), ("Lycoperdon molle", "Lycoperdon pyridorme", "Lycoperdon perlatum"), ("Melanoleuca cognata", "Melanoleuca alboflavida"),
 ("Mycena amicta", "Mycena subcaerulea"), ("Pleurotus populinus", "Pleurotus pulmonarius"), ('Pluteus brunneidiscus', 'Pluteus petasatus'), ('Pluteus primus', 'Pluteus cervinus'), 
 ("Psilocybe subaeruginosa", "Psilocybe cyanescens"), ("Russula cyanoxantha", "Russula variata"), ("Russula dissimulans", "Russula densifolia"), 
 ("Russula fragrantissima", "Russula grata"), ("Suillus ponderosus", "Suillus caerulescens"), ("Trametes aesculi", "Trametes gibbosa"), ('Trametes ochracea', 'Trametes versicolor'),
 ("Tricholoma ustale", "Tricholoma fulvum"),  ]

In [14]:
ddf = pd.read_csv(train_data / 'train_with_MO2.csv')
ddf['name'] = ddf['name'].replace({
    'Lepista nuda': 'Clitocybe nuda', 
    'Tremella foliacea': "Phaeotremella foliacea", 
    'Panaeolina foenisecii': 'Panaeolus foenisecii',
    'Panellus serotinus': 'Sarcomyxa serotina',
    'Fomitopsis cajanderi': 'Rhodofomes cajanderi',
    # 'Amanita amerirubescens': 'Amanita rubescens',
    'Trametes sanguinea': 'Pycnoporus sanguineus',
    'Polyporus alveolaris': 'Neofavolus alveolaris',
    'Trametes conchifer': 'Poronidulus conchifer',
    'Usnea longissima': 'Dolichousnea longissima',
    'Pisolithus arrhizus': 'Pisolithus arhizus',
    'Ricasolia quercizans': 'Lobaria quercizans',
    })

In [21]:
ddf['class_id'] = ddf.groupby('name')['name'].ngroup()
ddf.to_csv(train_data / 'train_with_MO3.csv', index=False)
class_d = ddf.groupby(['class_id'])['name'].first().to_dict()
pickle.dump(class_d, open(train_data / 'class_dict_NEW3.pkl', 'wb'))

In [None]:
group['percent'] = group['correct'] / group['total']
group['percent'] = group['percent'].round(2)
group.sort_values('percent').iloc[50:100]

In [None]:
dfs.sort_values().head(50)

In [19]:
cr = classification_report(preddf['Actual Label'], preddf['Prediction'], output_dict=True)
cr = pd.DataFrame(cr).transpose()
df = cr.iloc[:-3, :-1]

test = df.loc[df['dataset'] == 'test']
val = df.loc[df['dataset'] == 'val']
train = df.loc[df['dataset'] == 'train']
conf_train = pd.crosstab(train['Actual Label'], train['Prediction'], rownames=['Actual Label'], colnames=['Predicted'], margins=True)
conf_val = pd.crosstab(val['Actual Label'], val['Prediction'], rownames=['Actual Label'], colnames=['Predicted'], margins=True)
conf_test = pd.crosstab(test['Actual Label'], test['Prediction'], rownames=['Actual Label'], colnames=['Predicted'], margins=True)
conf_train.iloc[:-1, :-1].div(conf_train.iloc[:-1, :-1].sum(axis=1), axis="index").head(50)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
ddf = pd.read_csv(train_data / 'train_with_MO3.csv')

In [5]:
from sklearn.utils.class_weight import compute_class_weight

In [6]:
compute_class_weight('balanced', classes=np.unique(ddf['class_id']), y=ddf['class_id']).shape

(1277,)