In [None]:
## TAKE EXCEL FILES PRODUCED BY MODEL RUN, COMBINE INTO ONE FILE

In [None]:
import os
import pandas as pd

# Function to load all Excel files and compile the data into a single DataFrame
def load_and_compile_data(directory_path):
    """Read every Excel workbook in ``directory_path`` into one DataFrame.

    Files are selected by their ``.xlsx`` suffix and read in sorted name
    order, so the row order of the result is reproducible (``os.listdir``
    returns entries in arbitrary order). Names starting with ``~$`` are
    skipped: those are the lock files Excel creates while a workbook is open,
    and they are not readable workbooks.

    Args:
        directory_path: Directory to scan (sub-directories are not searched).

    Returns:
        A DataFrame with the rows of all workbooks, re-indexed from 0.

    Raises:
        ValueError: If the directory contains no ``.xlsx`` workbook to read.
    """
    excel_names = sorted(
        name for name in os.listdir(directory_path)
        if name.endswith('.xlsx') and not name.startswith('~$')
    )
    if not excel_names:
        # pd.concat([]) would otherwise fail with an opaque
        # "No objects to concatenate" message.
        raise ValueError(f"No .xlsx files found in {directory_path!r}")

    frames = [
        pd.read_excel(os.path.join(directory_path, name))
        for name in excel_names
    ]
    # Concatenate all data into one DataFrame
    return pd.concat(frames, ignore_index=True)

# Function to analyze the misclassifications
def analyze_misclassifications(compiled_data):
    """Count how often each (image, true label, predicted label) triple occurs.

    Args:
        compiled_data: DataFrame with 'Image Filename', 'True Label' and
            'Predicted Label' columns.

    Returns:
        A DataFrame with those three columns plus 'Count', ordered from the
        most frequent triple to the least frequent.
    """
    key_columns = ['Image Filename', 'True Label', 'Predicted Label']
    # Number of rows per distinct key combination
    occurrences = compiled_data.groupby(key_columns).size()
    summary = occurrences.reset_index(name='Count')
    # Most frequent misclassifications first
    return summary.sort_values(by='Count', ascending=False)

# Directory containing the Excel files to combine (edit to match your setup)
directory_path = '../incorrect_curation/moth_testingtrain/incorr/'

# Combine every workbook, then count each (image, true label, predicted label) triple
compiled_data = load_and_compile_data(directory_path)
misclassification_summary = analyze_misclassifications(compiled_data)

# Write the summary as CSV into the same directory as the input workbooks
summary_csv_path = os.path.join(directory_path, 'misclassification_summary.csv')
misclassification_summary.to_csv(summary_csv_path, index=False)

# Report completion
print("Misclassification summary has been saved to 'misclassification_summary.csv'.")

In [None]:
### LOOP THROUGH TEST FOLDERS AND PRODUCE OCCURRENCE TALLY WITHIN SET FOLDERS

In [None]:
import os
import pandas as pd
from collections import Counter

# Base directory containing the 'moth_sorted_27_oct_save_offs_<i>' folders
base_dir = '/media/cryptobiovis/17d4ec92-0c5a-4c67-8495-32dbeae14c24/moth_runs/mothruns_part2/moths_46/'

# Initialize a Counter to tally image occurrences (keyed by bare file name)
image_occurrences = Counter()

# Image file extensions to count; compared against the lower-cased file name so
# that '.JPG', '.PNG', '.JPEG', ... are all treated alike
file_extensions = ('.png', '.jpg', '.jpeg')


def process_directory(directory):
    """Tally the image files found in the class sub-folders of ``directory``.

    Every file whose (case-insensitive) extension is in ``file_extensions``
    and that sits directly inside a sub-directory of ``directory`` adds one
    to ``image_occurrences[<file name>]``. A ``directory`` that does not exist
    or is not a directory is silently skipped.

    Args:
        directory: Path of a folder holding one sub-folder per class.
    """
    # isdir (rather than exists) also rejects a regular file at this path,
    # which os.listdir would fail on.
    if not os.path.isdir(directory):
        return

    for class_subdir in os.listdir(directory):
        class_dir_path = os.path.join(directory, class_subdir)
        if not os.path.isdir(class_dir_path):
            continue
        for img_file in os.listdir(class_dir_path):
            if img_file.lower().endswith(file_extensions):
                image_occurrences[img_file] += 1


# Loop through each sub-directory index (1..499) and process its 'test' folder
for i in range(1, 500):
    test_dir = os.path.join(base_dir, f'moth_sorted_27_oct_save_offs_{i}', 'pt2', 'test')
    process_directory(test_dir)

# Convert the Counter to a DataFrame
df = pd.DataFrame(list(image_occurrences.items()), columns=['Image Name', 'Number of Occurrences'])

# Save the DataFrame to an Excel file (relative to the current working directory)
output_excel_path = '6_moth_test_image_occurrences.xlsx'
df.to_excel(output_excel_path, index=False)

print(f"Output Excel file saved at {output_excel_path}")


In [None]:
### TALLY TEST OCCURRENCES

In [1]:
import os
import pandas as pd

# Directory containing the Excel files
directory_path = '../incorrect_curation/moth_testingtrain/train/'

# The tally is written into the same directory it is built from, so it must be
# excluded from the inputs: otherwise re-running this cell would read the
# previous totals back in and add them on top of the raw counts.
final_tally_name = 'final_tally.xlsx'
final_tally_path = os.path.join(directory_path, final_tally_name)

# List all input Excel files in the directory (sorted for a reproducible order;
# '~$' names are Excel lock files, not workbooks)
excel_files = [
    os.path.join(directory_path, f)
    for f in sorted(os.listdir(directory_path))
    if f.endswith('.xlsx') and f != final_tally_name and not f.startswith('~$')
]

# Load and combine the datasets
all_data = pd.concat([pd.read_excel(fp) for fp in excel_files])

# Group by 'Image Name' and sum the 'Number of Occurrences'. The column is
# selected explicitly so that any other column in the workbooks is not summed
# (or, for text columns, concatenated) into the result.
final_tally = all_data.groupby('Image Name', as_index=False)['Number of Occurrences'].sum()

# Save the final tally to a new Excel file
final_tally.to_excel(final_tally_path, index=False)

final_tally_path


'../incorrect_curation/moth_testingtrain/train/final_tally.xlsx'

In [2]:
### TALLY ADDITIONAL MISCLASSIFICATIONS PER IMAGE AND REMOVE DUPLICATE ROWS

In [4]:
import pandas as pd

# Function to process each chunk of the DataFrame
def process_chunk(chunk_df):
    """Add an 'Additional Incorrectons' column counting repeated filenames.

    For every row, the new column holds the number of *other* rows in
    ``chunk_df`` that share the same 'Image Filename' (0 when the filename is
    unique or missing). Only rows of the frame passed in are considered.

    NOTE(review): this counts repeated rows, not the sum of their 'Count'
    values — confirm that is the intended meaning of the column.

    Args:
        chunk_df: DataFrame with an 'Image Filename' column. It is modified
            in place.

    Returns:
        The same ``chunk_df`` object, with the extra integer column.
    """
    filenames = chunk_df['Image Filename']
    # Single pass: look up rows-per-filename for each row instead of
    # re-scanning the whole frame once per unique filename.
    rows_per_filename = filenames.map(filenames.value_counts())
    # Missing filenames are absent from value_counts and map to NaN; treating
    # them as "seen once" yields 0 additional rows.
    chunk_df['Additional Incorrectons'] = (
        rows_per_filename.fillna(1).sub(1).astype('int64')
    )
    return chunk_df

# Load the misclassification summary CSV
file_path = '../incorrect_curation/moth_testingtrain/incorr/misclassification_summary.csv'

# Process the file as ONE frame. process_chunk only counts repeats inside the
# frame it is given, and the summary is ordered by 'Count' rather than by
# filename, so counting per fixed-size chunk would miss repeats that land in
# different chunks (and the de-duplication below would then keep the
# under-counted first row).
combined_df = process_chunk(pd.read_csv(file_path))

# Removing duplicate rows, keeping only the first occurrence of each filename
df_unique = combined_df.drop_duplicates(subset='Image Filename')

# Specify the path for saving the modified DataFrame
modified_file_path = '../incorrect_curation/moth_testingtrain/incorr/modified_misclassification_summary.csv'
df_unique.to_csv(modified_file_path, index=False)

print(f"Modified DataFrame saved to {modified_file_path}")


Modified DataFrame saved to ../incorrect_curation/moth_testingtrain/incorr/modified_misclassification_summary.csv


In [5]:
### MERGE MOD_CLASS_SUMM AND TALLY FILES

In [7]:
import pandas as pd

# Load the datasets
tally_df = pd.read_excel('../incorrect_curation/moth_testingtrain/train/final_tally.xlsx')
butterfly_df = pd.read_csv('../incorrect_curation/moth_testingtrain/incorr/modified_misclassification_summary.csv')

# Left-merge so every misclassification row is kept, matched on the file name
merged_df = butterfly_df.merge(
    tally_df[['Image Name', 'Number of Occurrences']],
    left_on='Image Filename',
    right_on='Image Name',
    how='left',
)

# Remove the extra 'Image Name' column after merging
merged_df = merged_df.drop(columns='Image Name')

# Set tally results to 0 where there are no matches. Assign the result back:
# calling fillna(..., inplace=True) on a column selection is chained
# assignment, which warns in pandas 2.x and does not update the frame under
# Copy-on-Write.
merged_df['Number of Occurrences'] = merged_df['Number of Occurrences'].fillna(0)

# Save the updated dataset.
# NOTE(review): this writes to the current working directory, while the later
# cell reads '../incorrect_curation/merged_moth_misclassification_summary.csv'
# — confirm the file is moved there or align the two paths.
merged_df.to_csv('merged_moth_misclassification_summary.csv', index=False)

print("Merge completed and file saved.")


Merge completed and file saved.


In [None]:
### ADD COLUMNS TO WORK OUT TIMES IN TRAINING BATCH

In [1]:
import pandas as pd

# Location of the merged summary CSV
file_path = '../incorrect_curation/merged_moth_misclassification_summary.csv'

# Load it
df = pd.read_csv(file_path)

# Select the 4th, 5th and 6th columns by position (0-based 3..5).
# NOTE(review): the original note names these 'Count', 'Additional Incorrectons'
# and 'Times correct' — verify against the CSV header, since selection is
# positional and silently follows any change in column order.
summed_columns = df.iloc[:, 3:6]

# 'Times in training batch' = 100 minus the row-wise total of those columns
row_totals = summed_columns.sum(axis=1)
df['Times in training batch'] = 100 - row_totals

# Write the extended table to a new CSV file
modified_file_path = 'final_moth_classification_summary.csv'
df.to_csv(modified_file_path, index=False)

print(f"Modified file saved at {modified_file_path}")


Modified file saved at final_moth_classification_summary.csv
