# 相关设置

In [2]:
from IPython.core.interactiveshell import InteractiveShell

import json
import pandas as pd

InteractiveShell.ast_node_interactivity = "all"  # 一个cell显示多输出

pd.set_option('display.max_rows', 100)  # 设置最大显示100行
pd.set_option('display.max_columns', 100)  # 设置最大显示100列

## 读取词典文件

In [19]:
with open('data/dictionary/negative_comments_dictionary.txt') as file:
    comments_keywords_str = file.read()

单引号转化为双引号

In [20]:
comments_keywords_str = comments_keywords_str.replace("\'", '\"')

In [21]:
comments_keywords_dict = json.loads(comments_keywords_str)

In [49]:
# print(comments_keywords_dict)

## 构造 negative 样本种类字典

In [35]:
negative_species = {'wasp': ['wasp', 'wasps'],  # 胡蜂
                    'hornet': ['hornet', 'hornets'],  # 黄蜂
                    'bee': ['bee', 'bees'],  # 蜜蜂
                    'sawfly': ['sawfly', 'sawflies'],  # 叶蜂
                    'cicada': ['cicada', 'cicadas'],  # 蝉
                    'beetle': ['beetle', 'beetles'],  # 甲壳虫
                    'yellowjacket': ['yellowjacket', 'yellowjackets', 'yellow jacket', 'yellow jackets'],  # 黄胡蜂
                    'horntail': ['horntail', 'horntails'],  # 树蜂科中一种
                    'cicada killer': ['cicada killer', 'cicada killers'],  # 杀蝉泥蜂
                    'urocerus': ['uroceru', 'urocerus'],  # 叶蜂 
                    'moth': ['moth', 'moths'],  # 蛾
                    'poliste': ['poliste', 'polistes', 'Poliste', 'Polistes'],  # 马蜂属
                    'tremex': ['tremex', 'tremexs'],  # 树蜂科中一种
                    'siricid': ['siricid', 'siricids'],  # 树蜂科
                    'longhorn': ['longhorn', 'longhorns']  # 天牛
                   }

In [39]:
negative_species

{'wasp': ['wasp', 'wasps'],
 'hornet': ['hornet', 'hornets'],
 'bee': ['bee', 'bees'],
 'sawfly': ['sawfly', 'sawflies'],
 'cicada': ['cicada', 'cicadas'],
 'beetle': ['beetle', 'beetles'],
 'yellowjacket': ['yellowjacket',
  'yellowjackets',
  'yellow jacket',
  'yellow jackets'],
 'horntail': ['horntail', 'horntails'],
 'cicada killer': ['cicada killer', 'cicada killers'],
 'urocerus': ['uroceru', 'urocerus'],
 'moth': ['moth', 'moths'],
 'poliste': ['poliste', 'polistes', 'Poliste', 'Polistes'],
 'tremex': ['tremex', 'tremexs'],
 'siricid': ['siricid', 'siricids'],
 'longhorn': ['longhorn', 'longhorns']}

## 提取 negative 样本种类

In [50]:
# with open('data/negative_species.txt', 'w') as file:
#     for key, value in negative_species.items():
#         file.write(str(key) + '\n')

## 为 negative 样本做进一步的分类

In [60]:
processed_data = pd.read_csv("data/processed_data.csv")
negative_data = processed_data[processed_data['Lab Status'] == 'Negative ID']
negative_data.head()

Unnamed: 0,GlobalID,Detection Date,Notes,Lab Status,Lab Comments,Submission Date,Latitude,Longitude,FileName
6,{C4F44511-EA53-4FCF-9422-E1C57703720D},2020-02-29,"I’m not sure what this is, but it was the bigg...",Negative ID,This is a large fly that mimics bees! Thanks f...,2020-02-29,48.729596,-122.480035,ATT9.png
8,{89C867F1-D5ED-48C8-9586-B705F5DA9838},2019-09-07,"slow moving, i have this in the freezer still ...",Negative ID,Thank you for this submission. This is a harml...,2020-03-18,47.395721,-123.105188,ATT11.png
9,{81670D96-4143-47B1-A9C8-83977892D53F},2020-03-19,It landed on my window while working. It walke...,Negative ID,Hello! This one is a Yellowjacket:,2020-03-19,47.840041,-122.323562,ATT15.png
12,{D30895B7-3994-45A3-BD51-E5BA881833FD},2019-08-30,Probably not an Asain Hornet but still a prett...,Negative ID,"Hello! This is a native wood-boring wasp, in t...",2020-03-19,47.635058,-122.057332,ATT16.png
14,{1304CF55-4FF9-490F-AF5A-5719CD110A9D},2020-03-23,Was on the garden hose. It stung (maybe just a...,Negative ID,"This sounds more like a yellowjacket, which ar...",2020-03-24,48.74361,-122.43905,


In [61]:
def categorize(comment: str) -> str:
    for key, value in negative_species.items():
        for i in value:
            if i in comment:
                return key
    return 'negative'

In [62]:
negative_data['species'] = negative_data['Lab Comments'].map(categorize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [65]:
negative_data.head(10)

Unnamed: 0,GlobalID,Detection Date,Notes,Lab Status,Lab Comments,Submission Date,Latitude,Longitude,FileName,species
6,{C4F44511-EA53-4FCF-9422-E1C57703720D},2020-02-29,"I’m not sure what this is, but it was the bigg...",Negative ID,This is a large fly that mimics bees! Thanks f...,2020-02-29,48.729596,-122.480035,ATT9.png,bee
8,{89C867F1-D5ED-48C8-9586-B705F5DA9838},2019-09-07,"slow moving, i have this in the freezer still ...",Negative ID,Thank you for this submission. This is a harml...,2020-03-18,47.395721,-123.105188,ATT11.png,wasp
9,{81670D96-4143-47B1-A9C8-83977892D53F},2020-03-19,It landed on my window while working. It walke...,Negative ID,Hello! This one is a Yellowjacket:,2020-03-19,47.840041,-122.323562,ATT15.png,negative
12,{D30895B7-3994-45A3-BD51-E5BA881833FD},2019-08-30,Probably not an Asain Hornet but still a prett...,Negative ID,"Hello! This is a native wood-boring wasp, in t...",2020-03-19,47.635058,-122.057332,ATT16.png,wasp
14,{1304CF55-4FF9-490F-AF5A-5719CD110A9D},2020-03-23,Was on the garden hose. It stung (maybe just a...,Negative ID,"This sounds more like a yellowjacket, which ar...",2020-03-24,48.74361,-122.43905,,hornet
17,{5AD8CAF4-AB96-4BFA-8A08-6010C96937F1},2020-03-31,"Not sure if this is the target pest, but I fig...",Negative ID,Thanks for your submission! This is a paper wa...,2020-03-31,46.609918,-120.563107,ATT17.png,wasp
18,{8DFDA283-3AFD-4E9F-9878-7A0F22E1B917},2019-07-23,not sure if this is one. but in case it is. Sa...,Negative ID,"This one is one of our native species, Sphex i...",2020-04-01,48.006763,-122.462053,ATT19.png,negative
24,{8BE98561-0104-44B6-98D2-1CB625378F9B},2020-04-06,We live on the coast and are familiar with the...,Negative ID,This is a harmless native wood-boring wasp.,2020-04-06,47.156028,-123.013366,,wasp
26,{BE0AD7A4-F4FD-40F7-979C-AE5F4470B1E4},2020-04-07,"I live in Newcastle, WA and found a dead one o...",Negative ID,This is a friendly neighborhood bumble bee!,2020-04-07,47.62986,-120.68945,ATT21.png,bee
27,{EF1D664F-F908-4086-95C2-6C3B1A93FCB2},2019-06-16,"Saw this big bee on my back deck, tried to loo...",Negative ID,"This is a native sawfly, Cimbex americana or C...",2020-04-07,47.766765,-122.552735,ATT25.png,sawfly


In [67]:
print(len(negative_data))

2055


In [68]:
negative_image_data = negative_data.dropna(subset=['FileName'])

In [70]:
negative_image_data.head(10)

Unnamed: 0,GlobalID,Detection Date,Notes,Lab Status,Lab Comments,Submission Date,Latitude,Longitude,FileName,species
6,{C4F44511-EA53-4FCF-9422-E1C57703720D},2020-02-29,"I’m not sure what this is, but it was the bigg...",Negative ID,This is a large fly that mimics bees! Thanks f...,2020-02-29,48.729596,-122.480035,ATT9.png,bee
8,{89C867F1-D5ED-48C8-9586-B705F5DA9838},2019-09-07,"slow moving, i have this in the freezer still ...",Negative ID,Thank you for this submission. This is a harml...,2020-03-18,47.395721,-123.105188,ATT11.png,wasp
9,{81670D96-4143-47B1-A9C8-83977892D53F},2020-03-19,It landed on my window while working. It walke...,Negative ID,Hello! This one is a Yellowjacket:,2020-03-19,47.840041,-122.323562,ATT15.png,negative
12,{D30895B7-3994-45A3-BD51-E5BA881833FD},2019-08-30,Probably not an Asain Hornet but still a prett...,Negative ID,"Hello! This is a native wood-boring wasp, in t...",2020-03-19,47.635058,-122.057332,ATT16.png,wasp
17,{5AD8CAF4-AB96-4BFA-8A08-6010C96937F1},2020-03-31,"Not sure if this is the target pest, but I fig...",Negative ID,Thanks for your submission! This is a paper wa...,2020-03-31,46.609918,-120.563107,ATT17.png,wasp
18,{8DFDA283-3AFD-4E9F-9878-7A0F22E1B917},2019-07-23,not sure if this is one. but in case it is. Sa...,Negative ID,"This one is one of our native species, Sphex i...",2020-04-01,48.006763,-122.462053,ATT19.png,negative
26,{BE0AD7A4-F4FD-40F7-979C-AE5F4470B1E4},2020-04-07,"I live in Newcastle, WA and found a dead one o...",Negative ID,This is a friendly neighborhood bumble bee!,2020-04-07,47.62986,-120.68945,ATT21.png,bee
27,{EF1D664F-F908-4086-95C2-6C3B1A93FCB2},2019-06-16,"Saw this big bee on my back deck, tried to loo...",Negative ID,"This is a native sawfly, Cimbex americana or C...",2020-04-07,47.766765,-122.552735,ATT25.png,sawfly
36,{BEE96943-A647-46B0-85FA-6B02B961D195},2020-04-07,"was in my house in Spokane, WA",Negative ID,"A paper wasp, Polistes dominula",2020-04-07,47.708791,-117.472206,ATT27.png,wasp
42,{3A1CBF2E-E6F0-4728-A6B6-F57F29BF5B40},2019-07-13,"We weren’t sure what these were, but they are ...",Negative ID,These are native cicada killer wasps.,2020-04-08,47.221882,-119.99443,ATT31.png,wasp


In [69]:
print(len(negative_image_data))

2030


In [75]:
images_species_dict = dict(negative_image_data[['FileName', 'species']].values)

In [77]:
with open('./data/negative_samples_categorize.txt', 'w') as file:
    file.write(str(images_species_dict))

48836