In [75]:
import pandas as pd
import os
import numpy as np

# Allow up to 200 rows to be shown when a DataFrame is displayed.
pd.set_option('display.max_rows', 200)

# Input locations. These are absolute paths on the author's machine; adjust
# them before running elsewhere.
# class_csv_path -> loaded as `class_df`, media_csv_path -> loaded as `media_df`.
class_csv_path = '/home/tien/Works/DH/final/data/PlantCLEF2017FinalPackage/map_list.csv'
media_csv_path = '/home/tien/Works/DH/final/data/PlantCLEF2017FinalPackage/final_master.csv'
# Species list loaded as `vnc_df`.
# NOTE(review): presumably produced by the vncreature crawler module - confirm.
vnc_csv_path = '/home/tien/Works/DH/final/project/modules/vncreature_crawler/vncreature_list.csv'
# Root directory that count_train_media() joins with a class id to find the
# per-class directory of training media.
data_path = '/home/tien/Works/DH/final/data/PlantCLEF2017Train1EOL/data/'

# Lower-case placeholder values of the `species` column that do not name a
# real species; rows carrying them are filtered out by remove_unknown_species().
unknown_species = ['chua', 'chua co ten' , 'no latin 1']
# Renames the vncreature columns with a `vnc_` prefix before the frame is
# merged with the PlantCLEF class table.
column_mapping = {
    'class': 'vnc_class',
    'name_vi': 'vnc_name',
    'species': 'vnc_species',
    'order': 'vnc_order',
    'url': 'vnc_url',
    'family': 'vnc_family',
}

def remove_unknown_species(species):
    """Tell whether a species value should be kept.

    Despite its name, this is a *keep* predicate meant for boolean-mask
    filtering: it returns True for usable names and False for values that
    should be removed.

    Parameters
    ----------
    species : object
        Value from the `species` column. Expected to be a string; missing
        values (NaN/None) are tolerated.

    Returns
    -------
    bool
        False when `species` is not a string (e.g. NaN from an empty CSV
        cell) or when, ignoring case and surrounding whitespace, it is one
        of the placeholders in the module-level `unknown_species`;
        True otherwise.
    """
    # Missing cells arrive as float NaN, which has no .lower(); treat them as
    # unknown instead of crashing the whole .apply().
    if not isinstance(species, str):
        return False
    # Strip so that e.g. 'chua ' is still recognised as a placeholder.
    return species.strip().lower() not in unknown_species

def find_planclef_species(vnc_species, plc_species_list):
    """Find the first PlantCLEF species name that contains a vncreature name.

    Matching is a case-insensitive substring test, so a bare binomial such as
    'abies delavayi' matches 'Abies delavayi Franch.'.

    Parameters
    ----------
    vnc_species : object
        Species name to look for. Expected to be a string; non-strings
        (e.g. NaN) and blank strings never match.
    plc_species_list : iterable of str
        Candidate species names, searched in order.

    Returns
    -------
    str or float
        The first candidate containing `vnc_species`, or `np.nan` when there
        is no match.
    """
    if not isinstance(vnc_species, str):
        return np.nan

    # Normalise once instead of on every iteration.
    needle = vnc_species.strip().lower()
    # An empty needle is a substring of every string and would otherwise
    # "match" the first candidate.
    if not needle:
        return np.nan

    for species in plc_species_list:
        if needle in species.lower():
            return species
    return np.nan

def count_train_media(class_id, root=None):
    """Count the training media entries stored for one class.

    Looks at the sub-directory of `root` named after `class_id` and counts
    its entries, skipping names that start with '.' (hidden files).

    Parameters
    ----------
    class_id : int or str
        Class identifier; converted with `str()` to form the directory name.
    root : str, optional
        Directory that holds the per-class sub-directories. Defaults to the
        module-level `data_path`.

    Returns
    -------
    int
        Number of non-hidden entries in the class directory, or the sentinel
        -2 when no such directory exists. Callers that aggregate the counts
        must exclude the sentinel.
    """
    if root is None:
        root = data_path
    dir_path = os.path.join(root, str(class_id))
    # isdir rather than exists: a regular file with the class name would pass
    # an exists() check and then make os.listdir() raise.
    if not os.path.isdir(dir_path):
        return -2

    return sum(1 for fn in os.listdir(dir_path) if not fn.startswith('.'))

In [6]:
# Load the three input tables. DataFrame.info() prints its summary itself and
# returns None, so it is called directly instead of being wrapped in print().
class_df = pd.read_csv(class_csv_path)
class_df.info()

media_df = pd.read_csv(media_csv_path)
media_df.info()

# The frame is named `vnc_df`; the previous `vnc_csv_df` was never defined
# and raised NameError on a fresh kernel.
vnc_df = pd.read_csv(vnc_csv_path)
vnc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
Unnamed: 0    10000 non-null int64
Species       10000 non-null object
Genus         10000 non-null object
Family        10000 non-null object
Content       0 non-null float64
MediaId       10000 non-null int64
ClassId       10000 non-null int64
dtypes: float64(1), int64(3), object(3)
memory usage: 547.0+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1467831 entries, 0 to 1467830
Data columns (total 7 columns):
Unnamed: 0    1467831 non-null int64
Species       1467831 non-null object
Genus         1467831 non-null object
Family        1467831 non-null object
Content       24837 non-null object
MediaId       1467831 non-null int64
ClassId       1467831 non-null int64
dtypes: int64(3), object(4)
memory usage: 78.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1603 entries, 0 to 1602
Data columns (total 7 columns):
Unnamed: 0    1603 non-null int64
class    

In [43]:
# Candidate PlantCLEF species names that the vncreature names are matched against.
plc_species_list = class_df['Species'].values.tolist()

# Build the matched frame in a single chain: drop placeholder species, keep the
# first row per species, then add the matching PlantCLEF species (NaN when no
# match). Using .assign() on the chain result avoids setting a column on a
# filtered frame, which can trigger pandas' SettingWithCopyWarning.
vnc_df = (
    vnc_df[vnc_df['species'].apply(remove_unknown_species)]
    .drop_duplicates(subset='species')
    .assign(
        plc_species=lambda df: df['species'].apply(
            lambda x: find_planclef_species(x, plc_species_list)
        )
    )
)


In [44]:
# Keep only the vncreature rows that found a PlantCLEF match and give their
# columns the `vnc_` prefix defined in `column_mapping`.
found_vnc_df = (
    vnc_df
    .iloc[:, 1:]                       # drop the first column by position
    .dropna(subset=['plc_species'])    # unmatched rows carry NaN here
    .rename(columns=column_mapping)
)
found_vnc_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109 entries, 21 to 1602
Data columns (total 7 columns):
vnc_class      109 non-null object
vnc_family     108 non-null object
vnc_name       109 non-null object
vnc_order      109 non-null object
vnc_species    109 non-null object
vnc_url        109 non-null object
plc_species    109 non-null object
dtypes: object(7)
memory usage: 6.8+ KB


In [45]:
# Attach the PlantCLEF class information to every matched vncreature row.
required_class_df_columns = ['ClassId', 'Species', 'Genus', 'Family']
plc_vnc_df = found_vnc_df.merge(
    class_df[required_class_df_columns],
    how='left',                # keep every matched vncreature row
    left_on='plc_species',
    right_on='Species',
)
plc_vnc_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109 entries, 0 to 108
Data columns (total 11 columns):
vnc_class      109 non-null object
vnc_family     108 non-null object
vnc_name       109 non-null object
vnc_order      109 non-null object
vnc_species    109 non-null object
vnc_url        109 non-null object
plc_species    109 non-null object
ClassId        109 non-null int64
Species        109 non-null object
Genus          109 non-null object
Family         109 non-null object
dtypes: int64(1), object(10)
memory usage: 10.2+ KB


In [51]:
# Restrict the media table to the species that were matched with vncreature.
plc_vnc_species_list = plc_vnc_df['Species'].tolist()
media_vnc_df = media_df.loc[media_df['Species'].isin(plc_vnc_species_list)]
media_vnc_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24597 entries, 603 to 1467827
Data columns (total 7 columns):
Unnamed: 0    24597 non-null int64
Species       24597 non-null object
Genus         24597 non-null object
Family        24597 non-null object
Content       1058 non-null object
MediaId       24597 non-null int64
ClassId       24597 non-null int64
dtypes: int64(3), object(4)
memory usage: 1.5+ MB


In [55]:
# Per-species tallies of the matched media rows. count() counts non-null cells
# column by column, so a sparsely filled column such as 'Content' can show a
# smaller number than the others.
countBySpc_media_vnc_df = (
    media_vnc_df
    .groupby(['Species'])
    .count()
)
countBySpc_media_vnc_df

Unnamed: 0_level_0,Unnamed: 0,Genus,Family,Content,MediaId,ClassId
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abies delavayi Franch.,156,156,156,0,156,156
Abutilon indicum (L.) Sweet,175,175,175,0,175,175
Achillea millefolium L.,1786,1786,1786,255,1786,1786
Achyranthes aspera L.,234,234,234,5,234,234
Aglaia odorata Lour.,161,161,161,0,161,161
Albizia chinensis (Osbeck) Merr.,163,163,163,0,163,163
Amorphophallus paeoniifolius (Dennst.) Nicolson,237,237,237,1,237,237
Annona glabra L.,233,233,233,0,233,233
Annona squamosa L.,180,180,180,0,180,180
Anthogonium gracile Wall. ex Lindl.,134,134,134,0,134,134


In [76]:
# Count the training media on disk for every matched class and store the
# result next to its ClassId.
num_train_per_class = plc_vnc_df['ClassId'].apply(count_train_media)
plc_vnc_df['num_train'] = num_train_per_class
plc_vnc_df[['num_train', 'ClassId']]

Unnamed: 0,num_train,ClassId
0,38,235732
1,12,295430
2,22,127344
3,26,331071
4,20,331060
5,96,159448
6,38,285487
7,10,150598
8,122,42649
9,12,189727


In [77]:
# Total number of training media over all matched classes.
# count_train_media() reports a missing class directory with the sentinel -2;
# clip negatives to 0 so those sentinels do not silently reduce the total.
plc_vnc_df['num_train'].clip(lower=0).sum()

11034

In [80]:
# Persist the joined PlantCLEF / vncreature table (row index included, as
# to_csv writes it by default).
report_csv_path = '/home/tien/Works/DH/final/project/reports/plantclef_vncreature_join_list.csv'
plc_vnc_df.to_csv(report_csv_path)