## Clean GPT-4 concepts

In [103]:
import pandas as pd

def process_bird_data(feature_file, class_file):
    """
    Parse a bird feature file into a long-format table and validate class names.

    The feature file is read line by line. A non-empty line without ':' is
    treated as a class header (an enclosing ``**...**`` is stripped). A line
    containing ':' is treated as ``attribute: concept[; concept ...]`` and is
    attached to the most recent class header.

    Class headers are compared, in order of appearance, with the names listed
    in ``class_file``. When they differ, a message is printed and the expected
    name is used in place of the one found in the feature file.

    Args:
        feature_file (str): Path to the file containing bird features
        class_file (str): Path to the file containing bird class names,
            one per line (blank lines are ignored)

    Returns:
        pandas.DataFrame: Processed data with columns
            ['class', 'attribute', 'concepts'], one row per concept. Concepts
            that are empty or equal to 'n/a' (case-insensitive) are skipped.
            The three columns are present even when no rows were parsed.
    """
    # Read the expected class names. Blank lines are dropped so that a stray
    # empty line cannot shift the positional comparison below.
    with open(class_file, 'r') as f:
        expected_classes = [name for name in (line.strip() for line in f) if name]

    data_rows = []
    current_class = None  # stays None for feature lines seen before any header
    current_class_idx = 0

    with open(feature_file, 'r') as f:
        for line in f:
            line = line.strip()

            # Skip empty lines
            if not line:
                continue

            # A line without ':' is a class header
            if ':' not in line:
                # Remove markdown bold markers if present
                if line.startswith('**') and line.endswith('**'):
                    line = line[2:-2].strip()

                current_class = line

                # Validate the header against the expected name at this position
                if current_class_idx < len(expected_classes):
                    expected_class = expected_classes[current_class_idx]
                    if current_class != expected_class:
                        # Mismatch: the generated text altered the class name
                        # slightly, so fall back to the expected spelling.
                        print(f"mismatch at index {current_class_idx}, Expected: {expected_class}, Got: {current_class}")
                        current_class = expected_class
                    current_class_idx += 1
                continue

            # Feature line: "attribute: concept; concept; ..."
            attribute, concepts = line.split(':', 1)
            attribute = attribute.strip()

            for concept in concepts.split(';'):
                concept = concept.strip()
                # Skip N/A values and the empty strings left by a trailing
                # or doubled ';'
                if not concept or concept.lower() == 'n/a':
                    continue
                data_rows.append({
                    'class': current_class,
                    'attribute': attribute,
                    'concepts': concept
                })

    # Passing the columns explicitly keeps the schema stable for empty input
    return pd.DataFrame(data_rows, columns=['class', 'attribute', 'concepts'])
    
# Parse the raw CUB concept file and take a quick look at the result
df = process_bird_data(
    feature_file='concept_files/cub_concepts.txt',
    class_file='concept_files/cub_classes.txt',
)
print(df.head())
print(f"Total rows: {len(df)}")

mismatch at index 3, Expected: Groove billed Ani, Got: Groove-billed Ani
mismatch at index 9, Expected: Red winged Blackbird, Got: Red-winged Blackbird
mismatch at index 11, Expected: Yellow headed Blackbird, Got: Yellow-headed Blackbird
mismatch at index 19, Expected: Yellow breasted Chat, Got: Yellow-breasted Chat
mismatch at index 23, Expected: Red faced Cormorant, Got: Red-faced Cormorant
mismatch at index 30, Expected: Black billed Cuckoo, Got: Black-billed Cuckoo
mismatch at index 32, Expected: Yellow billed Cuckoo, Got: Yellow-billed Cuckoo
mismatch at index 39, Expected: Olive sided Flycatcher, Got: Olive-sided Flycatcher
mismatch at index 40, Expected: Scissor tailed Flycatcher, Got: Scissor-tailed Flycatcher
mismatch at index 42, Expected: Yellow bellied Flycatcher, Got: Yellow-bellied Flycatcher
mismatch at index 48, Expected: Boat tailed Grackle, Got: Boat-tailed Grackle
mismatch at index 56, Expected: Rose breasted Grosbeak, Got: Rose-breasted Grosbeak
mismatch at index 59

In [162]:
# Rows whose concept text contains the fragment "redd"
contains_redd = df["concepts"].str.contains("redd")
df.loc[contains_redd]

Unnamed: 0,class,attribute,concepts
2239,Fox Sparrow,head color,redd-brown head
2241,Fox Sparrow,wing color,redd-brown wings
2244,Fox Sparrow,body color,redd-brown body
2245,Fox Sparrow,back color,redd-brown back
2248,Fox Sparrow,breast feature,heavy redd streaks on breast
2250,Fox Sparrow,belly feature,redd streaks on belly
2253,Fox Sparrow,tail color,redd-brown tail
2756,Brown Thrasher,head color,redd-brown head
2757,Brown Thrasher,wing color,redd-brown wings
2760,Brown Thrasher,body color,redd-brown body


In [105]:
# First 50 rows of the concept table
df.head(50)

Unnamed: 0,class,attribute,concepts
0,Black footed Albatross,bill color,black bill
1,Black footed Albatross,bill shape,hooked seabird bill
2,Black footed Albatross,bill shape,long thick bill
3,Black footed Albatross,eye color,dark brown eyes
4,Black footed Albatross,head color,dark brown head
5,Black footed Albatross,head feature,white face
6,Black footed Albatross,wing color,dark brown wings
7,Black footed Albatross,wing shape,long narrow wings
8,Black footed Albatross,body color,dark brown body
9,Black footed Albatross,back color,dark brown back


In [193]:
# Restrict to the "body color" attribute and count how often each concept occurs
is_body_color = df["attribute"].eq("body color")
df_part = df.loc[is_body_color]
df_part["concepts"].value_counts()

concepts
brown body                     32
black body                     20
white body                     18
gray body                      13
yellow body                    13
                               ..
chestnut brown body             1
pale streaked body              1
blue and black body             1
yellow and blue-gray body       1
yellow and olive-green body     1
Name: count, Length: 78, dtype: int64

In [194]:
# Body-color concepts that occur exactly once
body_color_counts = df_part["concepts"].value_counts()
to_remove = list(body_color_counts[body_color_counts < 2].index)

# Singletons that mention "body" are kept; every row in df carrying one of the
# remaining singleton concepts is dropped.
singletons_without_body = [concept for concept in to_remove if "body" not in concept]
rows_to_drop = df[df["concepts"].isin(singletons_without_body)].index
df.drop(index=rows_to_drop, inplace=True)

In [163]:
# Normalise the wording of the concepts.
concepts_clean = (
    df["concepts"]
    # "(male)" / "(female)" contain regex metacharacters, so match them
    # literally; relying on the default `regex` value is version-dependent.
    .str.replace("(male)", "", regex=False)
    .str.replace("(female)", "", regex=False)
    # Strip "ish" only as a word-final suffix with at least three letters in
    # front of it ("greenish" -> "green", "reddish-brown" -> "redd-brown"),
    # so short words such as "fish" are left intact.
    .str.replace(r"(?<=\w{3})ish\b", "", regex=True)
    # "reddish" leaves the stem "redd" behind; fix it as a whole word only.
    .str.replace(r"\bredd\b", "red", regex=True)
    .str.strip()
)
df["concepts"] = concepts_clean

In [113]:
# NOTE(review): 1511 is a hard-coded index label; which row it refers to depends
# on the cells run before this one — confirm it is still the intended row.
# errors="ignore" makes the cell safe to re-run once the row is already gone.
df.drop(index=1511, inplace=True, errors="ignore")


In [199]:
# Concepts ranked 100-149 by frequency
concept_counts = df["concepts"].value_counts()
concept_counts.iloc[100:150]

concepts
pale eyebrow stripe           7
lobed feet                    7
pale white belly              7
olive green back              7
dark brown head               7
pale gray wings               7
pink bill                     6
pale pink legs                6
pale ivory bill               6
long slightly rounded tail    6
pale brown breast             6
short slightly forked tail    6
pale buff breast              6
short slightly hooked bill    6
light gray belly              6
light gray breast             6
pale yellow breast            6
two white wing bars           6
long slender bill             6
short pointed wings           6
pale pink feet                6
olive green wings             6
bright red eyes               6
dark gray breast              6
warm brown body               6
green body                    5
dark gray belly               5
short pointed tail            5
yellow feet                   5
yellow legs                   5
white wing patch              5

In [200]:
# Generate per class json file
import json
def save_as_json(df, output_file):
    """
    Write a ``{class: [concepts, ...]}`` mapping built from the DataFrame to a
    JSON file.

    Classes appear in order of first occurrence; within a class, repeated
    concepts are collapsed while keeping the order in which they first appear.

    Args:
        df (pandas.DataFrame): DataFrame with columns ['class', 'attribute', 'concepts']
        output_file (str): Path to save the JSON file
    """
    labels = df['class']

    # One entry per class; dict.fromkeys de-duplicates and preserves order
    concepts_by_class = {
        label: list(dict.fromkeys(df.loc[labels == label, 'concepts'].tolist()))
        for label in labels.unique()
    }

    with open(output_file, 'w') as handle:
        json.dump(concepts_by_class, handle, indent=2)
save_as_json(df, output_file="concept_files/cub_per_class_v3.json")

In [201]:
# Generate the concept txt file for training
def extract_unique_concepts(df, output_file='cub_filtered.txt'):
    """
    Write the distinct values of the 'concepts' column to a text file, one per
    line in sorted order, and report how many were written.

    Parameters:
    df (pandas.DataFrame): DataFrame with a 'concepts' column
    output_file (str): Name of the output text file

    Returns:
    list: Sorted list of the unique concepts
    """
    # Distinct concepts, sorted in place
    ordered = list(df['concepts'].unique())
    ordered.sort()

    # One concept per line
    with open(output_file, 'w') as sink:
        sink.writelines(name + '\n' for name in ordered)

    print(f"Successfully wrote {len(ordered)} unique concepts to {output_file}")
    return ordered

In [202]:
extract_unique_concepts(df, output_file='cub_filtered.txt')

Successfully wrote 567 unique concepts to cub_filtered.txt


['black V-shaped band on breast',
 'black and chestnut body',
 'black and rufous body',
 'black and white back',
 'black and white barred tail',
 'black and white body',
 'black and white crown stripes',
 'black and white head',
 'black and white stripes',
 'black and white tail',
 'black back',
 'black barred wings',
 'black barring on tail',
 'black belly',
 'black bib',
 'black bill',
 'black body',
 'black breast',
 'black breast band',
 'black cap',
 'black cap extends to eyes',
 'black cap head',
 'black cap on males',
 'black cap recedes in winter',
 'black crown and throat',
 'black crown stripe',
 'black crown stripes',
 'black extends to throat',
 'black eye mask',
 'black eye stripe',
 'black eye-stripe',
 'black eyeline',
 'black feet',
 'black forehead',
 'black head',
 'black hood',
 'black horn-like tufts',
 'black legs',
 'black line from bill to eye',
 'black mustache stripe',
 'black necklace',
 'black spot below eyes',
 'black spotted belly',
 'black spotted breast',

## Clean per-class concept file

In [6]:
import pandas as pd
import json
import data.utils as data_utils


# Load the {species: concepts} mapping
with open("concept_files/cub_per_class.json", 'r') as file:
    data = json.load(file)

# One row per (species, concept) pair; a value that is not a list is treated
# as a single concept. Each concept is passed through data_utils.format_concept.
rows = [
    {'species': species, 'concept': data_utils.format_concept(concept)}
    for species, concepts in data.items()
    for concept in (concepts if isinstance(concepts, list) else [concepts])
]

# Build the DataFrame in one go from the collected rows
df = pd.DataFrame(rows)

df


Unnamed: 0,species,concept
0,Black footed Albatross,black feet
1,Black footed Albatross,dark wingtips
2,Black footed Albatross,large size
3,Black footed Albatross,large wingspan
4,Black footed Albatross,long wings
...,...,...
1681,Common Yellowthroat,greenish yellow back and wings
1682,Common Yellowthroat,olive green back
1683,Common Yellowthroat,white belly
1684,Common Yellowthroat,yellow throat and breast


In [7]:
# How many species share each concept
concept_frequency = df['concept'].value_counts()
concept_frequency

concept
small body                    64
white belly                   55
long tail                     36
yellow eyes                   28
black bill                    24
                              ..
blue sheen on the wings        1
short dark legs                1
brown back and white belly     1
black mask on the face         1
yellow head and breast         1
Name: count, Length: 685, dtype: int64

In [9]:
def show_bird(df, target_species):
    """
    Return the rows of ``df`` that belong to one bird species.

    Args:
        df (pandas.DataFrame): DataFrame with a 'species' column
        target_species (str): Name of the species to search for (exact match)

    Returns:
        pandas.DataFrame: The rows whose 'species' equals ``target_species``.
            When there are none, a message is printed and a new empty
            DataFrame is returned.
    """
    selected = df.loc[df['species'].eq(target_species)]

    # Nothing matched: tell the user and hand back an empty frame
    if not len(selected):
        print(f"No data found for species: {target_species}")
        return pd.DataFrame()

    return selected

def show_concepts(df, keywords, neg=False):
    """
    Select rows by case-insensitive substring search in the 'concept' column.

    Args:
        df (pandas.DataFrame): DataFrame with a 'concept' column
        keywords (str or list): Single keyword or list of keywords to search for
        neg (bool): If False (default) keep rows whose concept contains at
            least one keyword; if True keep the rows that contain none

    Returns:
        pandas.DataFrame: The selected rows. When the selection is empty, a
            message is printed and a new empty DataFrame is returned.
    """
    # Accept a bare string as a one-element list, then lowercase every keyword
    needles = [keywords] if isinstance(keywords, str) else keywords
    needles = [needle.lower() for needle in needles]

    def mentions_any(text):
        """True when at least one keyword occurs inside ``text``."""
        for needle in needles:
            if needle in text:
                return True
        return False

    hits = df['concept'].str.lower().apply(mentions_any)
    selected = df[~hits] if neg else df[hits]

    if selected.empty:
        print(f"No concepts found containing any of these keywords: {needles}")
        return pd.DataFrame()

    return selected

In [15]:
# Dump the species/concept table to CSV without the index column
# (change 'output.csv' to the filename you want)
df.to_csv(path_or_buf='output.csv', index=False)

In [11]:
# Concepts that join two parts with " and "
show_concepts(df, keywords=" and ")

Unnamed: 0,species,concept
36,Crested Auklet,black back and wings
40,Crested Auklet,white breast and belly
41,Crested Auklet,black legs and feet
45,Least Auklet,black cap and back
46,Least Auklet,black head and back
...,...,...
1659,Rock Wren,brown and gray plumage
1663,Rock Wren,shy and secretive behavior
1680,Common Yellowthroat,yellow throat and breast
1681,Common Yellowthroat,greenish yellow back and wings


In [12]:
import pandas as pd

def split_concepts(df):
    """
    Split compound concepts joined by ' and ' into one row per sub-concept.

    Only rows whose concept contains ' and ' contribute to the result; rows
    without it are not copied over. The first part is kept as is. Every later
    part that consists of a single word inherits the leading words (all but
    the last) of the first part, e.g. 'black back and wings' becomes
    'black back' and 'black wings'.

    Args:
        df (pandas.DataFrame): DataFrame with columns ['species', 'concept']

    Returns:
        pandas.DataFrame: Columns ['species', 'concept'], one row per
            sub-concept. The columns are present even when nothing was split.
    """
    new_rows = []

    for _, row in df.iterrows():
        species = row['species']
        concept = row['concept']

        if ' and ' not in concept:
            continue

        # Split on every ' and ' and discard empty fragments
        parts = [part.strip() for part in concept.split(' and ')]
        parts = [part for part in parts if part]
        if not parts:
            continue

        # The first part is complete as is
        head = parts[0]
        new_rows.append({'species': species, 'concept': head})

        # Adjectives are all words of the first part except the last one;
        # this is empty when the first part is a single word.
        adjectives = ' '.join(head.split()[:-1])

        # Handle every remaining part, not just the second one
        for part in parts[1:]:
            # Prefix only when there is something to prefix, so that a
            # single-word first part does not produce a leading space
            if adjectives and len(part.split()) == 1:
                part = f"{adjectives} {part}"
            new_rows.append({'species': species, 'concept': part})

    # Explicit columns keep the schema stable when no row was split
    return pd.DataFrame(new_rows, columns=['species', 'concept'])

# Expand the compound ("... and ...") concepts of df into separate rows
result = split_concepts(df=df)

In [14]:
# Page through the split result 50 rows at a time; n is the zero-based page
n = 0
result.iloc[n * 50:(n + 1) * 50]

Unnamed: 0,species,concept
0,Crested Auklet,black back
1,Crested Auklet,black wings
2,Crested Auklet,white breast
3,Crested Auklet,white belly
4,Crested Auklet,black legs
5,Crested Auklet,black feet
6,Least Auklet,black cap
7,Least Auklet,black back
8,Least Auklet,black head
9,Least Auklet,black back
