In [1]:
import os
import pandas as pd
import re
from datetime import datetime
from fuzzywuzzy import fuzz, process
from pymongo import MongoClient, ASCENDING
from pymongo.errors import ConnectionFailure, DuplicateKeyError, OperationFailure

class FuzzyFolderMatcher:
    """Locate files whose names approximately match an expected name.

    Similarity is scored with fuzzywuzzy's ``token_sort_ratio``; a candidate
    is accepted only when its score is at least ``threshold``.
    """

    def __init__(self, threshold=80):
        """Store the minimum score a candidate needs to count as a match."""
        self.threshold = threshold

    def find_best_match(self, target, choices):
        """Return the entry of *choices* most similar to *target*.

        Returns ``None`` when *choices* is empty, when the matcher yields no
        result, or when the best score is below ``self.threshold``.
        """
        if not choices:
            return None
        result = process.extractOne(target, choices, scorer=fuzz.token_sort_ratio)
        if result is None:
            return None
        # Index instead of unpacking so a result carrying extra elements
        # (beyond match and score) does not raise.
        best_match, score = result[0], result[1]
        if score >= self.threshold:
            return best_match
        return None

    def find_files_fuzzy(self, folder_path, target_filename):
        """Return the path of the entry in *folder_path* matching *target_filename*.

        Lookup order:
          1. an entry named exactly *target_filename*;
          2. the best fuzzy match on the full entry name;
          3. the best fuzzy match on the entry name without its extension.

        Returns ``None`` when *folder_path* is not a directory or nothing
        reaches the threshold. Sub-directories are never returned.
        """
        # isdir (rather than exists) so a path to a regular file is rejected
        # here instead of raising inside os.listdir.
        if not os.path.isdir(folder_path):
            return None
        files = [
            entry for entry in os.listdir(folder_path)
            if not os.path.isdir(os.path.join(folder_path, entry))
        ]
        if target_filename in files:
            return os.path.join(folder_path, target_filename)
        best_match = self.find_best_match(target_filename, files)
        if best_match:
            print(f"  Fuzzy match: '{target_filename}' -> '{best_match}'")
            return os.path.join(folder_path, best_match)
        # Last resort: compare against the extension-less name and keep the
        # highest-scoring entry (the first one wins on ties).
        best_file = None
        best_score = -1
        for file in files:
            name_without_ext = os.path.splitext(file)[0]
            score = fuzz.token_sort_ratio(target_filename, name_without_ext)
            if score >= self.threshold and score > best_score:
                best_file, best_score = file, score
        if best_file is None:
            return None
        return os.path.join(folder_path, best_file)

def extract_data_from_excel(file_path, sheet_type):
    """Read peel-test values from one workbook into a flat dict.

    The sheet named ``Sheet1`` is read without a header. Data rows start on
    the row after the first row whose first cell equals ``'No.'``. For each
    data row whose first cell contains an underscore, the integer after the
    last underscore is used as the bus-pad position, and the non-empty cells
    in columns 1-6 are stored under ``f"{sheet_type}_{position}_{column}"``.

    Args:
        file_path: Path of the workbook to read.
        sheet_type: Prefix used in the generated keys (e.g. ``'Front'``).

    Returns:
        dict mapping generated keys to floats. Empty when the file cannot be
        read or no ``'No.'`` header row is found. A malformed row or cell is
        reported and skipped rather than discarding the whole file.
    """
    try:
        df = pd.read_excel(file_path, sheet_name='Sheet1', header=None)
        data_start_row = None
        for idx, row in df.iterrows():
            if row[0] == 'No.':
                data_start_row = idx + 1
                break
        if data_start_row is None:
            return {}
        data_dict = {}
        for idx in range(data_start_row, len(df)):
            row = df.iloc[idx]
            raw_id = row[0]
            # Test for missing values before str(); afterwards NaN is 'nan'.
            if pd.isna(raw_id):
                continue
            sample_id = str(raw_id).strip()
            if not sample_id or 'Gragh' in sample_id or '_' not in sample_id:
                continue
            try:
                bus_pad_position = int(sample_id.split('_')[-1])
            except ValueError:
                print(f"  Skipping row with unrecognized sample id '{sample_id}' in {file_path}")
                continue
            # Guard against sheets that have fewer than seven columns.
            for ribbon_idx in range(1, min(7, len(row))):
                value = row[ribbon_idx]
                if pd.isna(value):
                    continue
                try:
                    data_dict[f"{sheet_type}_{bus_pad_position}_{ribbon_idx}"] = float(value)
                except (TypeError, ValueError):
                    print(f"  Skipping non-numeric value {value!r} for '{sample_id}' in {file_path}")
        return data_dict
    except Exception as e:
        # Best-effort boundary: an unreadable workbook must not stop the scan.
        print(f"Error reading {file_path}: {e}")
        return {}

def parse_folder_structure(root_path):
    """Walk *root_path* and build one record per stringer/unit folder.

    A folder is processed when its path relative to *root_path* has at least
    four components that look like::

        <MON-YYYY>/<DD.MM.YYYY>/<SHIFT-X>/<STRINGER-n UNIT-Y>

    where the date separator may be ``.``, ``-`` or ``/``, X is A/B/C and Y is
    A/B (case-insensitive, optional whitespace around the hyphens). In such a
    folder the FRONT and BACK workbooks are located by fuzzy name matching and
    their values are merged into a single dict together with the metadata
    parsed from the folder names.

    Args:
        root_path: Directory to scan.

    Returns:
        list of dicts, each with the keys ``Date`` (``YYYY-MM-DD``), ``Shift``,
        ``Stringer``, ``Unit``, ``PO``, ``Cell_Vendor`` plus the keys produced
        by ``extract_data_from_excel`` for both workbooks.
    """
    all_data = []
    matcher = FuzzyFolderMatcher(threshold=75)

    month_year_pattern = re.compile(r'^[A-Z]{3}-\d{4}$')
    # Two-digit day and month with the same separator on both sides.
    date_pattern = re.compile(r'^\d{2}([./-])\d{2}\1\d{4}$')
    date_formats = ('%d.%m.%Y', '%d-%m-%Y', '%d/%m/%Y')
    # The letter is captured so surrounding whitespace never leaks into the
    # stored shift value (e.g. 'SHIFT- A' must give 'A', not ' A').
    shift_pattern = re.compile(r'^SHIFT\s*-\s*([ABC])$', re.IGNORECASE)
    stringer_unit_pattern = re.compile(
        r'^STRINGER\s*-\s*(\d+)\s+UNIT\s*-\s*([AB])$', re.IGNORECASE
    )

    def parse_date(folder_name):
        """Return *folder_name* as 'YYYY-MM-DD', or None if it is not a real date."""
        for date_format in date_formats:
            try:
                return datetime.strptime(folder_name, date_format).strftime('%Y-%m-%d')
            except ValueError:
                continue
        return None

    for root, _dirs, _files in os.walk(root_path):
        current_path = os.path.relpath(root, root_path)
        if current_path == '.':
            continue
        path_parts = current_path.split(os.sep)
        # NOTE(review): folders nested below the stringer/unit level also pass
        # this check and reuse the same metadata - confirm that is intended.
        if len(path_parts) < 4:
            continue
        month_year_folder, date_folder, shift_folder, stringer_unit_folder = path_parts[:4]

        shift_match = shift_pattern.match(shift_folder)
        valid_structure = (
            month_year_pattern.match(month_year_folder)
            and date_pattern.match(date_folder)
            and shift_match
        )
        if not valid_structure:
            continue
        stringer_match = stringer_unit_pattern.match(stringer_unit_folder)
        if stringer_match is None:
            continue

        stringer_num = int(stringer_match.group(1))
        unit = stringer_match.group(2).upper()
        shift = shift_match.group(1).upper()

        date_str = parse_date(date_folder)
        if date_str is None:
            print(f"Could not parse date: {date_folder}")
            continue

        front_file = matcher.find_files_fuzzy(root, 'FRONT')
        back_file = matcher.find_files_fuzzy(root, 'BACK')
        if not front_file:
            front_file = matcher.find_files_fuzzy(root, 'FRONT.xlsx')
        if not back_file:
            back_file = matcher.find_files_fuzzy(root, 'BACK.xlsx')
        if not (front_file and back_file):
            print(f"  Could not find both Excel files in: {root}")
            continue

        front_data = extract_data_from_excel(front_file, 'Front')
        back_data = extract_data_from_excel(back_file, 'Back')
        if not (front_data and back_data):
            # Report the skip so a missing record can be traced to its folder.
            print(f"  No usable data extracted from Excel files in: {root}")
            continue

        record = {
            'Date': date_str,
            'Shift': shift,
            'Stringer': stringer_num,
            'Unit': unit,
            'PO': '?PO?',
            'Cell_Vendor': '?Cell_Vendor?',
        }
        record.update(front_data)
        record.update(back_data)
        all_data.append(record)
    return all_data

def create_structured_dataframe(root_path):
    """Extract all records under *root_path* into a sorted DataFrame.

    Columns are ordered as the six metadata columns followed by the
    measurement columns (Front before Back, then by bus-pad number, then by
    ribbon number). Rows are sorted by Date, Shift, Stringer and Unit, and
    ``Date`` is returned as a ``YYYY-MM-DD`` string.

    Returns an empty DataFrame when no records are found.
    """
    print("Starting data extraction...")
    records = parse_folder_structure(root_path)
    if not records:
        print("No data found!")
        return pd.DataFrame()

    meta_columns = ['Date', 'Shift', 'Stringer', 'Unit', 'PO', 'Cell_Vendor']

    def measurement_order(name):
        # Anything that is not a Front_/Back_ column sorts ahead of them.
        if not name.startswith(('Front_', 'Back_')):
            return (-1, 0, 0)
        pieces = name.split('_')
        side_rank = 0 if pieces[0] == 'Front' else 1
        return (side_rank, int(pieces[1]), int(pieces[2]))

    frame = pd.DataFrame(records)
    measurement_columns = sorted(
        (name for name in frame.columns if name not in meta_columns),
        key=measurement_order,
    )
    frame = frame[meta_columns + measurement_columns]

    # Use real dates and ordered categories for sorting only; Date is turned
    # back into text afterwards.
    frame['Date'] = pd.to_datetime(frame['Date'])
    for column, levels in (('Shift', ['A', 'B', 'C']), ('Unit', ['A', 'B'])):
        frame[column] = pd.Categorical(frame[column], categories=levels, ordered=True)
    frame = frame.sort_values(['Date', 'Shift', 'Stringer', 'Unit']).reset_index(drop=True)
    frame['Date'] = frame['Date'].dt.strftime('%Y-%m-%d')

    print(f"Successfully extracted {len(frame)} records")
    return frame

def connect_to_mongodb(connection_string='mongodb://localhost:27017/'):
    """Open a MongoDB client and verify the server answers a ping.

    Args:
        connection_string: MongoDB URI to connect to.

    Returns:
        The connected ``MongoClient``, or ``None`` when the client cannot be
        created or the ping fails (the reason is printed).
    """
    client = None
    try:
        client = MongoClient(connection_string, serverSelectionTimeoutMS=5000)
        client.admin.command('ping')
    except ConnectionFailure as e:
        print(f"Failed to connect to MongoDB: {e}")
    except Exception as e:
        print(f"Error connecting to MongoDB: {e}")
    else:
        print("Successfully connected to MongoDB!")
        return client
    # Reached only on failure: release the half-open client instead of
    # leaking it when the ping did not succeed.
    if client is not None:
        client.close()
    return None

def get_collection_name(date_str):
    """Map a ``YYYY-MM-DD`` date string to a ``<mon>_<year>`` collection name.

    The month part is the lower-cased ``%b`` abbreviation, e.g.
    ``'2025-10-03'`` -> ``'oct_2025'``.
    """
    parsed = datetime.strptime(date_str, '%Y-%m-%d')
    return "_".join((parsed.strftime('%b').lower(), parsed.strftime('%Y')))

def ensure_collection_index(collection):
    """Create the unique (Date, Shift, Stringer, Unit) index if it is missing.

    The index is looked up by its name, ``unique_date_shift_stringer_unit``.
    Failures are reported as warnings and never raised.
    """
    try:
        index_name = 'unique_date_shift_stringer_unit'
        if index_name in collection.index_information():
            print(f"  Index already exists on collection '{collection.name}'")
            return
        key_fields = ('Date', 'Shift', 'Stringer', 'Unit')
        collection.create_index(
            [(field, ASCENDING) for field in key_fields],
            unique=True,
            name=index_name,
        )
        print(f"  Created unique index on collection '{collection.name}'")
    except OperationFailure as e:
        print(f"  Warning: Could not create index on '{collection.name}': {e}")
    except Exception as e:
        print(f"  Warning: Unexpected error while creating index: {e}")

def store_in_mongodb(df, mongo_client, db_name='peel_test'):
    """Upsert every row of *df* into a per-month collection of *db_name*.

    Rows are grouped by the collection name derived from their ``Date``
    (see ``get_collection_name``). Each row is written with one
    ``update_one(..., upsert=True)`` keyed on Date/Shift/Stringer/Unit, so
    re-running the import inserts new rows, updates changed rows and leaves
    identical rows untouched. Per-collection and overall counts are printed.

    Args:
        df: DataFrame holding at least Date, Shift, Stringer and Unit columns.
        mongo_client: Connected client; indexed by database name.
        db_name: Target database name.

    Returns:
        None. Per-record failures are printed and counted, not raised.
    """
    if df.empty:
        print("No data to store in MongoDB")
        return
    db = mongo_client[db_name]
    print(f"\nWorking with database: '{db_name}'")
    df_copy = df.copy()
    df_copy['collection_name'] = df_copy['Date'].apply(get_collection_name)
    key_fields = ('Date', 'Shift', 'Stringer', 'Unit')
    total_inserted = 0
    total_updated = 0
    total_errors = 0
    for collection_name, group_df in df_copy.groupby('collection_name'):
        print(f"\nProcessing collection: '{collection_name}'")
        collection = db[collection_name]
        ensure_collection_index(collection)
        records = group_df.drop('collection_name', axis=1).to_dict('records')
        inserted = 0
        updated = 0
        errors = 0
        for record in records:
            try:
                filter_query = {field: record[field] for field in key_fields}
                update = {'$set': record}
                try:
                    result = collection.update_one(filter_query, update, upsert=True)
                except DuplicateKeyError:
                    # Another writer inserted the same key between the match
                    # and the insert; the retry takes the update path.
                    result = collection.update_one(filter_query, update, upsert=True)
            except Exception as e:
                print(f"  Error processing record: {e}")
                errors += 1
                continue
            if result.upserted_id is not None:
                inserted += 1
            elif result.modified_count:
                # modified_count stays 0 when the stored document already
                # holds the same values, so unchanged rows are not counted.
                updated += 1
        total_inserted += inserted
        total_updated += updated
        total_errors += errors
        print(f"  ✓ {inserted} new records inserted")
        print(f"  ✓ {updated} existing records updated")
        if errors > 0:
            print(f"  ✗ {errors} errors occurred")
    print(f"\n{'='*60}")
    print(f"SUMMARY:")
    print(f"  Total new records inserted: {total_inserted}")
    print(f"  Total records updated: {total_updated}")
    print(f"  Total errors: {total_errors}")
    print(f"{'='*60}")

def list_mongodb_collections(mongo_client, db_name='peel_test'):
    """Print each collection of *db_name* with its document count.

    Returns the list of collection names, or an empty list when listing
    fails (the error is printed).
    """
    try:
        database = mongo_client[db_name]
        names = database.list_collection_names()
        if not names:
            print(f"\nNo collections found in database '{db_name}'")
            return names
        print(f"\nCollections in database '{db_name}':")
        for name in names:
            document_count = database[name].count_documents({})
            print(f"  - {name}: {document_count} documents")
        return names
    except Exception as e:
        print(f"Error listing collections: {e}")
        return []

def query_mongodb_example(mongo_client, db_name='peel_test', collection_name='jan_2024'):
    """Fetch every document of one collection and print a short summary.

    Returns the documents as a list. Returns an empty list when the
    collection does not exist or the query fails (the reason is printed).
    """
    try:
        database = mongo_client[db_name]
        if collection_name in database.list_collection_names():
            documents = list(database[collection_name].find())
            print(f"\nTotal records in '{collection_name}': {len(documents)}")
            if documents:
                print(f"\nExample record:")
                print(documents[0])
            return documents
        print(f"Collection '{collection_name}' does not exist in database '{db_name}'")
        return []
    except Exception as e:
        print(f"Error querying MongoDB: {e}")
        return []

# Script entry point: extract the peel-test data under root_path, preview it,
# and store it in the local MongoDB instance.
if __name__ == "__main__":
    root_path = "D:\\WorkingFolder\\OneDrive - vikramsolar.com\\Desktop\\VSL Projects\\QC\\QC_Data\\Auto Peel Test Result"
    if os.path.exists(root_path):
        print(f"Found path: {root_path}")
        df = create_structured_dataframe(root_path)
    else:
        # Nothing to scan: skip extraction instead of walking a missing path.
        print(f"Path {root_path} does not exist!")
        df = pd.DataFrame()

    if df.empty:
        print("No data was extracted. Please check the folder structure and file paths.")
    else:
        print(f"\nDataFrame shape: {df.shape}")
        print(f"\nFirst few records:")
        try:
            display(df.head())
        except NameError:
            # display() is only defined under IPython/Jupyter.
            print(df.head())
        mongo_client = connect_to_mongodb('mongodb://localhost:27017/')
        if mongo_client is not None:
            try:
                store_in_mongodb(df, mongo_client, db_name='peel_test')
                list_mongodb_collections(mongo_client, db_name='peel_test')
            finally:
                # Close the connection even if storing or listing raised.
                mongo_client.close()
                print("\n✓ MongoDB connection closed successfully")
        else:
            # This script writes no files, so say only what actually happened.
            print("\n✗ Could not connect to MongoDB. No data was stored.")

Found path: D:\WorkingFolder\OneDrive - vikramsolar.com\Desktop\VSL Projects\QC\QC_Data\Auto Peel Test Result
Starting data extraction...
  Could not find both Excel files in: D:\WorkingFolder\OneDrive - vikramsolar.com\Desktop\VSL Projects\QC\QC_Data\Auto Peel Test Result\OCT-2025\04.10.2025\SHIFT-A\STRINGER-5 UNIT-A
  Could not find both Excel files in: D:\WorkingFolder\OneDrive - vikramsolar.com\Desktop\VSL Projects\QC\QC_Data\Auto Peel Test Result\OCT-2025\04.10.2025\SHIFT-A\STRINGER-5 UNIT-B
  Could not find both Excel files in: D:\WorkingFolder\OneDrive - vikramsolar.com\Desktop\VSL Projects\QC\QC_Data\Auto Peel Test Result\OCT-2025\07.10.2025\SHIFT-C\STRINGER-3 UNIT-A
  Could not find both Excel files in: D:\WorkingFolder\OneDrive - vikramsolar.com\Desktop\VSL Projects\QC\QC_Data\Auto Peel Test Result\OCT-2025\07.10.2025\SHIFT-C\STRINGER-3 UNIT-B
  Could not find both Excel files in: D:\WorkingFolder\OneDrive - vikramsolar.com\Desktop\VSL Projects\QC\QC_Data\Auto Peel Test Resul

Unnamed: 0,Date,Shift,Stringer,Unit,PO,Cell_Vendor,Front_1_1,Front_1_2,Front_1_3,Front_1_4,...,Back_15_3,Back_15_4,Back_15_5,Back_15_6,Back_16_1,Back_16_2,Back_16_3,Back_16_4,Back_16_5,Back_16_6
0,2025-10-03,A,1,A,?PO?,?Cell_Vendor?,2.005,1.871,1.701,3.958,...,1.376,2.797,2.279,3.991,2.068,2.693,3.874,3.344,2.467,3.164
1,2025-10-03,A,1,B,?PO?,?Cell_Vendor?,0.473,1.195,0.923,1.117,...,2.885,2.716,3.606,2.078,1.617,2.949,2.894,3.278,3.03,3.221
2,2025-10-03,A,2,A,?PO?,?Cell_Vendor?,1.141,1.231,1.397,1.899,...,4.095,3.449,2.474,4.77,1.202,2.16,4.051,3.055,1.474,2.466
3,2025-10-03,A,2,B,?PO?,?Cell_Vendor?,2.905,3.338,3.79,4.103,...,2.248,1.399,1.868,2.264,2.718,1.223,2.235,2.906,2.054,1.915
4,2025-10-03,A,3,A,?PO?,?Cell_Vendor?,1.686,2.378,2.778,2.698,...,3.651,4.074,3.269,3.201,2.788,3.734,4.031,3.377,3.054,3.163


Successfully connected to MongoDB!

Working with database: 'peel_test'

Processing collection: 'oct_2025'
  Created unique index on collection 'oct_2025'
  ✓ 302 new records inserted
  ✓ 0 existing records updated

SUMMARY:
  Total new records inserted: 302
  Total records updated: 0
  Total errors: 0

Collections in database 'peel_test':
  - oct_2025: 302 documents

✓ MongoDB connection closed successfully
