In [1]:
# Step 1: Copy every file of the base snapshot into the merged output directory
# Note: You should run this part in your local Jupyter environment where the directories exist

import os
import shutil

# Source directory (base snapshot) and destination directory of the merge
base_dir = '/mnt/d/Doc_To_Date/20241024/20241013'
output_dir = '/mnt/d/Doc_To_Date/20241024/merged_output'

# Step 1.1: Create output directory if it doesn't exist and copy all base files into it
os.makedirs(output_dir, exist_ok=True)

# Copy all regular files from base_dir to output_dir.
# os.listdir() also returns sub-directories, which shutil.copy() cannot copy
# (it raises and would abort the loop half-way), so every entry that is not a
# regular file is skipped and reported instead.
skipped_entries = []
for file_name in sorted(os.listdir(base_dir)):
    base_file_path = os.path.join(base_dir, file_name)
    if not os.path.isfile(base_file_path):
        skipped_entries.append(file_name)
        continue
    output_file_path = os.path.join(output_dir, file_name)
    shutil.copy(base_file_path, output_file_path)

print("All files from base directory copied to merged_output directory.")
if skipped_entries:
    print(f"Skipped {len(skipped_entries)} non-file entries in base directory: {', '.join(skipped_entries)}")

# Step 2: List files in the merged_output directory
# os.listdir() returns names in arbitrary order; sorting makes the preview reproducible.
merged_files = sorted(os.listdir(output_dir))

print("\nFiles in Merged Output Directory (First 20 files):")
print("\n".join(merged_files[:20]))  # Print first 20 files for readability

# We will stop here for now as per the current step requested.
# Next steps can be implemented after verification of the copied files.


All files from base directory copied to merged_output directory.

Files in Merged Output Directory (First 20 files):
Org_FHAB_FHAB_1000_20241013T1345.txt
Org_FHAB_FHAB_1000_20241013T1345_RST.txt
Org_FHAB_FHAB_1003_20241013T1345.txt
Org_FHAB_FHAB_1003_20241013T1345_RST.txt
Org_FHAB_FHAB_1004_20241013T1345.txt
Org_FHAB_FHAB_1004_20241013T1345_RST.txt
Org_FHAB_FHAB_1005_20241013T1345.txt
Org_FHAB_FHAB_1005_20241013T1345_RST.txt
Org_FHAB_FHAB_1006_20241013T1345.txt
Org_FHAB_FHAB_1006_20241013T1345_RST.txt
Org_FHAB_FHAB_1007_20241013T1345.txt
Org_FHAB_FHAB_1007_20241013T1345_RST.txt
Org_FHAB_FHAB_1008_20241013T1345.txt
Org_FHAB_FHAB_1008_20241013T1345_RST.txt
Org_FHAB_FHAB_1009_20241013T1345.txt
Org_FHAB_FHAB_1009_20241013T1345_RST.txt
Org_FHAB_FHAB_1011_20241013T1345.txt
Org_FHAB_FHAB_1011_20241013T1345_RST.txt
Org_FHAB_FHAB_1012_20241013T1345.txt
Org_FHAB_FHAB_1012_20241013T1345_RST.txt


In [4]:
# Step 3: Split each merged file name into its underscore-separated parts
import re

# Four lazy groups separated by underscores, anchored to a ".txt" extension.
# The first group spans two underscore-separated tokens; the last group takes
# whatever remains before the extension.
# NOTE: for names such as Org_FHAB_FHAB_1000_20241013T1345_RST.txt the group
# labelled 'timestamp' receives "1000" and 'suffix' receives "20241013T1345_RST".
file_pattern_regex = re.compile(r'^(.*?_.*?)_(.*?)_(.*?)_(.*?)\.txt$')

file_patterns = []
for file_name in merged_files:
    match = file_pattern_regex.match(file_name)
    if match is None:
        # Names that do not fit the pattern are left out of the analysis
        continue
    prefix, location, timestamp, suffix = match.groups()
    file_patterns.append(dict(
        file_name=file_name,
        prefix=prefix,
        location=location,
        timestamp=timestamp,
        suffix=suffix,
    ))

# Preview of the parsed parts for the first ten recognised names
print("\nAnalyzed File Name Patterns (First 10 files):")
for entry in file_patterns[:10]:
    print(f"File: {entry['file_name']} - Prefix: {entry['prefix']}, Location: {entry['location']}, Timestamp: {entry['timestamp']}, Suffix: {entry['suffix']}")

# We will stop here for now as per the current step requested.
# Next steps can be implemented after verification of the file name patterns.



Analyzed File Name Patterns (First 10 files):
File: Org_FHAB_FHAB_1000_20241013T1345.txt - Prefix: Org_FHAB, Location: FHAB, Timestamp: 1000, Suffix: 20241013T1345
File: Org_FHAB_FHAB_1000_20241013T1345_RST.txt - Prefix: Org_FHAB, Location: FHAB, Timestamp: 1000, Suffix: 20241013T1345_RST
File: Org_FHAB_FHAB_1003_20241013T1345.txt - Prefix: Org_FHAB, Location: FHAB, Timestamp: 1003, Suffix: 20241013T1345
File: Org_FHAB_FHAB_1003_20241013T1345_RST.txt - Prefix: Org_FHAB, Location: FHAB, Timestamp: 1003, Suffix: 20241013T1345_RST
File: Org_FHAB_FHAB_1004_20241013T1345.txt - Prefix: Org_FHAB, Location: FHAB, Timestamp: 1004, Suffix: 20241013T1345
File: Org_FHAB_FHAB_1004_20241013T1345_RST.txt - Prefix: Org_FHAB, Location: FHAB, Timestamp: 1004, Suffix: 20241013T1345_RST
File: Org_FHAB_FHAB_1005_20241013T1345.txt - Prefix: Org_FHAB, Location: FHAB, Timestamp: 1005, Suffix: 20241013T1345
File: Org_FHAB_FHAB_1005_20241013T1345_RST.txt - Prefix: Org_FHAB, Location: FHAB, Timestamp: 1005, Suf

In [8]:
# Step 1: Define the directories and rebuild merged_output from the base snapshot
# Note: You should run this part in your local Jupyter environment where the directories exist

import os
import shutil
import re

# Base snapshot, newer snapshot to merge in, and the destination of the merge
base_dir = '/mnt/d/Doc_To_Date/20241024/20241013'
update_dir = '/mnt/d/Doc_To_Date/20241024/20241018'
output_dir = '/mnt/d/Doc_To_Date/20241024/merged_output'

# Step 1.1: Start from an empty output directory so that results of an earlier
# run cannot leak into this one. WARNING: this deletes everything in output_dir.
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)  # Remove all contents of the output directory
os.makedirs(output_dir, exist_ok=True)

# Copy all regular files from base_dir to output_dir.
# os.listdir() also returns sub-directories, which shutil.copy() cannot copy
# (it raises and would abort the loop half-way), so every entry that is not a
# regular file is skipped and reported instead.
skipped_entries = []
for file_name in sorted(os.listdir(base_dir)):
    base_file_path = os.path.join(base_dir, file_name)
    if not os.path.isfile(base_file_path):
        skipped_entries.append(file_name)
        continue
    output_file_path = os.path.join(output_dir, file_name)
    shutil.copy(base_file_path, output_file_path)

print("All files from base directory copied to merged_output directory.")
if skipped_entries:
    print(f"Skipped {len(skipped_entries)} non-file entries in base directory: {', '.join(skipped_entries)}")

# Step 2: List files in the merged_output directory
# os.listdir() returns names in arbitrary order; sorting makes the preview reproducible.
merged_files = sorted(os.listdir(output_dir))

print("\nFiles in Merged Output Directory (First 20 files):")
print("\n".join(merged_files[:20]))  # Print first 20 files for readability

# Step 3: Split each file name found in merged_output into its parts

# Four lazy groups separated by underscores, anchored to a ".txt" extension.
# The first group spans two underscore-separated tokens; the last group takes
# whatever remains before the extension.
# NOTE: for names such as Org_FHAB_FHAB_1000_20241013T1345_RST.txt the group
# labelled 'timestamp' receives "1000" and 'suffix' receives "20241013T1345_RST".
file_pattern_regex = re.compile(r'^(.*?_.*?)_(.*?)_(.*?)_(.*?)\.txt$')

file_patterns = []
for file_name in merged_files:
    match = file_pattern_regex.match(file_name)
    if match is None:
        # Names that do not fit the pattern are left out of the analysis
        continue
    prefix, location, timestamp, suffix = match.groups()
    file_patterns.append(dict(
        file_name=file_name,
        prefix=prefix,
        location=location,
        timestamp=timestamp,
        suffix=suffix,
    ))

# Preview of the parsed parts for the first ten recognised names
print("\nAnalyzed File Name Patterns (First 10 files):")
for entry in file_patterns[:10]:
    print(f"File: {entry['file_name']} - Prefix: {entry['prefix']}, Location: {entry['location']}, Timestamp: {entry['timestamp']}, Suffix: {entry['suffix']}")

# Step 4: Classify the files in update_dir as matched / unmatched
#
# An update file is "matched" when merged_output already holds a file whose name
# is identical once the date-time stamp (e.g. 20241013T1345) is removed. Comparing
# the whole stamp-free name also compares the trailing variant tag, so an update
# "..._RST.txt" file is only matched by a base "..._RST.txt" file; without that it
# would be treated as matched, fail to open during the merge, and never reach
# merged_output.
name_stamp_regex = re.compile(r'\d{8}T\d{4,6}')

# Stamp-free names of the base files; a set makes each lookup O(1) instead of
# scanning file_patterns once per update file.
known_base_keys = {name_stamp_regex.sub('', f['file_name']) for f in file_patterns}

update_files = sorted(os.listdir(update_dir))  # sorted: os.listdir order is arbitrary
matched_files = []
unmatched_files = []
unparsed_files = []  # names that do not fit file_pattern_regex at all

for file_name in update_files:
    if not file_pattern_regex.match(file_name):
        # Previously these were dropped without a trace; keep track of them so
        # the user can decide what to do with them.
        unparsed_files.append(file_name)
        continue
    if name_stamp_regex.sub('', file_name) in known_base_keys:
        matched_files.append(file_name)
    else:
        unmatched_files.append(file_name)

print("\nMatched Files in Update Directory (First 10 files):")
print("\n".join(matched_files[:10]))

print("\nUnmatched Files in Update Directory (First 10 files):")
print("\n".join(unmatched_files[:10]))

# Print total number of matched and unmatched files
print(f"\nTotal number of matched files: {len(matched_files)}")
print(f"Total number of unmatched files: {len(unmatched_files)}")

if unparsed_files:
    print(f"Warning: {len(unparsed_files)} entries in update directory do not fit the file name pattern and were ignored: {', '.join(unparsed_files[:10])}")

# Step 5: Bring the update-only files into merged_output unchanged
for unmatched_name in unmatched_files:
    shutil.copy(
        os.path.join(update_dir, unmatched_name),   # source in the update snapshot
        os.path.join(output_dir, unmatched_name),   # same name in merged_output
    )

print("\nUnmatched files copied to merged_output directory.")

# Step 6: Merge matched files while preserving original formatting

# Date-time stamp embedded in the file names (e.g. 20241013T1345). It differs
# between snapshots, so it is removed before two names are compared. This
# replaces a hard-coded '_20241018T0905' -> '_20241013T1345' substitution that
# only worked for exactly that pair of snapshots.
stamp_regex = re.compile(r'\d{8}T\d{4,6}')


def merge_unique_lines(base_lines, update_lines):
    """Return base_lines followed by the update lines that base does not contain.

    Lines are compared without their line terminator, so "abc\\r\\n", "abc\\n"
    and an unterminated final "abc" count as the same line. Every line keeps
    the terminator it was read with. If the last line collected so far has no
    terminator, one is added before appending, so two lines are never glued
    together. Duplicates inside update_lines themselves are kept, as before.
    """
    def terminator(line):
        return line[len(line.rstrip('\r\n')):]

    # Terminator style to use when one has to be added: first one found, else LF
    eol = next((t for t in map(terminator, base_lines + update_lines) if t), '\n')

    seen = {line.rstrip('\r\n') for line in base_lines}
    merged = list(base_lines)
    for line in update_lines:
        if line.rstrip('\r\n') in seen:
            continue
        if merged and not terminator(merged[-1]):
            merged[-1] += eol
        merged.append(line)
    return merged


# Base file for every stamp-free name. merged_files was listed before any update
# file was copied in, so it contains base files only. If several base files
# share a stamp-free name, the lexicographically last one is used.
base_name_by_key = {stamp_regex.sub('', name): name for name in sorted(merged_files)}

for file_name in matched_files:
    update_file_path = os.path.join(update_dir, file_name)
    base_name = base_name_by_key.get(stamp_regex.sub('', file_name))

    if base_name is None:
        # No base file to merge into: keep the update file as it is instead of
        # losing it.
        print(f"No base file found for {file_name}; copied to merged_output unchanged.")
        shutil.copy(update_file_path, os.path.join(output_dir, file_name))
        continue

    base_file_path = os.path.join(output_dir, base_name)

    try:
        # newline='' switches off newline translation, so CRLF/LF line endings
        # are read and written back exactly as they are in the files.
        with open(base_file_path, 'r', encoding='cp1251', newline='') as base_file:
            base_lines = base_file.readlines()

        with open(update_file_path, 'r', encoding='cp1251', newline='') as update_file:
            update_lines = update_file.readlines()

        merged_lines = merge_unique_lines(base_lines, update_lines)

        # Rewrite the base file only when the update contributed something new
        if len(merged_lines) != len(base_lines):
            with open(base_file_path, 'w', encoding='cp1251', newline='') as merged_file:
                merged_file.writelines(merged_lines)

    except (FileNotFoundError, UnicodeError) as e:
        # UnicodeError: a byte that is not valid CP1251; report the file and go on
        # with the remaining files instead of aborting the whole merge.
        print(f"Error in files: {base_file_path} or {update_file_path}, error: {e}")
        continue

print("\nMatched files merged and updated in merged_output directory.")


All files from base directory copied to merged_output directory.

Files in Merged Output Directory (First 20 files):
Org_FHAB_FHAB_1000_20241013T1345.txt
Org_FHAB_FHAB_1000_20241013T1345_RST.txt
Org_FHAB_FHAB_1003_20241013T1345.txt
Org_FHAB_FHAB_1003_20241013T1345_RST.txt
Org_FHAB_FHAB_1004_20241013T1345.txt
Org_FHAB_FHAB_1004_20241013T1345_RST.txt
Org_FHAB_FHAB_1005_20241013T1345.txt
Org_FHAB_FHAB_1005_20241013T1345_RST.txt
Org_FHAB_FHAB_1006_20241013T1345.txt
Org_FHAB_FHAB_1006_20241013T1345_RST.txt
Org_FHAB_FHAB_1007_20241013T1345.txt
Org_FHAB_FHAB_1007_20241013T1345_RST.txt
Org_FHAB_FHAB_1008_20241013T1345.txt
Org_FHAB_FHAB_1008_20241013T1345_RST.txt
Org_FHAB_FHAB_1009_20241013T1345.txt
Org_FHAB_FHAB_1009_20241013T1345_RST.txt
Org_FHAB_FHAB_1011_20241013T1345.txt
Org_FHAB_FHAB_1011_20241013T1345_RST.txt
Org_FHAB_FHAB_1012_20241013T1345.txt
Org_FHAB_FHAB_1012_20241013T1345_RST.txt

Analyzed File Name Patterns (First 10 files):
File: Org_FHAB_FHAB_1000_20241013T1345.txt - Prefix: Org

  df_base = pd.read_csv(base_file_path, delimiter='|', header=None, encoding='latin1')
  df_base = pd.read_csv(base_file_path, delimiter='|', header=None, encoding='latin1')
  df_base = pd.read_csv(base_file_path, delimiter='|', header=None, encoding='latin1')


  
  Совпавшие файлы объединены и обновлены в каталоге merged_output.
