In [28]:
#The following script splits a file into "num_files" number of smaller files.
#Very useful for analyzing large files that are difficult to manipulate and/or process.
#Example is for json format but can be used for any text file.
#Note that the resulting pieces are not valid json on their own: each one must be manually edited to add the missing opening [ and/or closing ] (and to remove a trailing comma, if any).

import json

def split_json_file(input_file, num_files):
    """Split a file into ``num_files`` consecutive pieces, line by line.

    The pieces are written next to the input as ``<input_file>.1.json``,
    ``<input_file>.2.json``, ... Their line counts differ by at most one;
    when the total does not divide evenly, the first pieces get the extra line.
    If the input has fewer lines than ``num_files``, the trailing pieces are
    created empty.

    The file is read and written in binary mode, so every byte (including the
    original line endings and any non-ASCII characters) is copied verbatim and
    concatenating the pieces in order reproduces the input exactly.

    Args:
        input_file: Path of the file to split.
        num_files: Number of output files to create; must be at least 1.

    Raises:
        ValueError: If ``num_files`` is smaller than 1.
        OSError: If the input cannot be read or an output cannot be written.
    """
    from itertools import islice

    if num_files < 1:
        raise ValueError(f"num_files must be at least 1, got {num_files!r}")

    # First pass: count the lines so the chunk sizes can be balanced.
    with open(input_file, "rb") as f:
        size = sum(1 for _ in f)
    chunk_size, remainder = divmod(size, num_files)

    # Second pass: stream consecutive runs of lines into the output files.
    with open(input_file, "rb") as f:
        for i in range(num_files):
            # The first `remainder` pieces absorb one leftover line each.
            chunk_length = chunk_size + (1 if i < remainder else 0)
            output_file = f"{input_file}.{i+1}.json"
            with open(output_file, "wb") as out_f:
                out_f.writelines(islice(f, chunk_length))
                    
if __name__ == "__main__":
    # Run settings: the file to split and how many pieces to produce (edit as needed).
    input_file, num_files = "large-file.json", 5
    split_json_file(input_file, num_files)


In [13]:
#The following script will search fileB for all words in fileA.
#It will write all found words into fileC.
#Useful for reconciliation/analysis purposes in large files.

#Read the whitespace-separated search terms from fileA
with open('fileA.txt', 'r') as fileA:
    words = fileA.read().split()

#Read the whole content of fileB as a single string
with open('fileB.txt', 'r') as fileB:
    content = fileB.read()

#Keep every term from fileA that occurs somewhere in fileB.
#Note: this is a plain substring test, so a term also matches inside a longer word.
found_items = [word for word in words if word in content]

#Write the results once, after the search has finished. The file is always
#(re)created, so it is left empty when nothing was found.
with open('found_items.txt', 'w') as found_items_file:
    for found_word in found_items:
        found_items_file.write(found_word + '\n')

In [None]:
#The following script is used to flatten json files, which can be exported to csv/excel format for easier analysis.
#The current version takes as parameter only one key from one node. It needs to be extended to retrieve all key pair values in all nodes.

import pandas as pd
def flatten_json(nested_json, exclude=('',)):
    """Flatten a nested JSON-like structure into a single-level dict.

    Nested keys are joined with ``_``; list elements contribute their index.
    For example ``{"a": {"b": [10, 20]}}`` becomes
    ``{"a_b_0": 10, "a_b_1": 20}``.

    Args:
        nested_json: The structure to flatten (dicts, lists and leaf values).
            Dict and list subclasses (e.g. ``OrderedDict``) are traversed too.
        exclude: Collection of dict keys to skip, together with everything
            below them, at any nesting level.

    Returns:
        A new dict mapping each flattened key path to its leaf value. Empty
        dicts and empty lists contribute no entries. A non-container input is
        returned under the empty-string key.
    """
    out = {}

    def flatten(x, name=''):
        if isinstance(x, dict):
            for key, value in x.items():
                if key not in exclude:
                    # f-string formatting also copes with non-string keys.
                    flatten(value, f"{name}{key}_")
        elif isinstance(x, list):
            for index, item in enumerate(x):
                flatten(item, f"{name}{index}_")
        else:
            # Drop the trailing '_' that was appended for the last path element.
            out[name[:-1]] = x

    flatten(nested_json)
    return out

# Load the source file into a DataFrame.
df = pd.read_json("small_file.json")
# Flatten each entry of the 'actor' column and collect the rows into a new table.
result = pd.DataFrame(list(map(flatten_json, df['actor'])))
result

In [None]:
#The following script removes all numerical characters from file names in order to make them more generic.
#The resulting file names need to be unique.
#This step can be used as a prerequisite in regression analysis for files with names that contain variable content, such as dates.

import os
import re

def remove_digits_from_file_names(folder):
    """Strip every digit from the base name of each file below ``folder``.

    The folder is walked recursively. The extension is left untouched; only
    the part before it is changed. A file is skipped (and reported) when
    another entry with the new name already exists in the same directory, so
    no file is ever overwritten by a rename.

    Args:
        folder: Root directory to process. A missing folder yields no work.

    Returns:
        A list of ``(old_path, new_path)`` tuples, one per renamed file.
    """
    renamed = []
    for root, dirs, files in os.walk(folder):
        for file_name in files:
            #Create full path to file
            file_path = os.path.join(root, file_name)
            try:
                #Split off the extension and remove the digits from the rest
                file_base, file_ext = os.path.splitext(file_name)
                new_file_name = re.sub(r'\d', '', file_base) + file_ext
                new_file_path = os.path.join(root, new_file_name)

                #Nothing to do when the name contains no digits
                if file_path == new_file_path:
                    continue

                #os.rename may silently replace an existing file, so refuse to
                #rename onto a name that is already taken.
                if os.path.exists(new_file_path):
                    print(f'Skipped: {file_name} ({new_file_name} already exists)')
                    continue

                os.rename(file_path, new_file_path)
                renamed.append((file_path, new_file_path))
                print(f'Renamed: {file_name} to {new_file_name}')

            except Exception as e:
                #Best effort: report the problem and carry on with the next file
                print(f'Error processing {file_name}: {str(e)}')
                continue
    return renamed


renamed_files = remove_digits_from_file_names('./test_folder')

In [None]:
#Create a folder structure: useful for saving output corresponding to a certain user or date.
#This script needs to be saved in a .py file and executed on a Linux machine at the desired location.
#Does not work in Jupyter Lab.

import os
import shutil

DATE = "Test_date"
ENV = "Test_env"
Subfolder1 = "Test1"
Subfolder2 = "Test2"
Subfolder3 = "Test3"
#Add more as needed


def create_folder_path(folder_structure):
    """Create the nested directories named by ``folder_structure``.

    Args:
        folder_structure: Sequence of folder names, outermost first. The
            resulting path is relative to the current working directory
            unless the first element is absolute.

    Returns:
        The joined path of the innermost folder ("" for an empty sequence,
        in which case nothing is created).

    Raises:
        OSError: If a directory cannot be created, e.g. because a regular
            file with the same name is in the way.
    """
    full_path = os.path.join("", *folder_structure)
    if full_path:
        #Creates all missing levels and is a no-op for levels that already exist
        os.makedirs(full_path, exist_ok=True)
    return full_path


#One DATE/ENV/<subfolder> branch per entry; add all required subfolders here
for subfolder in (Subfolder1, Subfolder2, Subfolder3):
    folder_structure = [DATE, ENV, subfolder]
    full_path = create_folder_path(folder_structure)

print("Folder structure created successfully")