In [2]:
#Libraries
import csv
import os

In [26]:
import os
import csv

# File paths for master and dependent data
FILE_PATH_MASTER = 'data.csv'
FILE_PATH_DEPENDENT = 'data2.csv'

def open_csv_file(file_path):
    master_data = []
    
    if os.path.exists(file_path):  # Check if the file exists
        with open(file_path, mode='r', newline='') as file:
            reader = csv.reader(file)
            for row in reader:
                master_data.append(row)
    else:
        # Create a new file if it does not exist
        print(f"The file '{file_path}' does not exist. Creating a new file.")
        with open(file_path, mode='w', newline=''):
            pass  # Just create an empty file

    return master_data

# Load master and dependent data
master_data = open_csv_file(FILE_PATH_MASTER)
dependent_data = open_csv_file(FILE_PATH_DEPENDENT)

# Optionally print out the loaded data for verification
print("Master Data:", master_data)
print("Dependent Data:", dependent_data)


In [43]:
# Uses polymorphism and factories to convert the column types into objects
# This will allow us to convert data easier and handle edge cases
# For example the name object's convert() only returns the first name

# Base class for all data fields
class DataField:
    def display(self):
        raise NotImplementedError("Subclasses must implement this method.")

# Subclass for each specific category
class Name(DataField):
    def __init__(self, value):
        self.value = value

    def convert(self) :
        self.value = [name.split()[0].capitalize() for name in self.value]


    def display(self):
        return f"Name: {self.value}"

class Date(DataField):
    def __init__(self, value):
        self.value = value

    def display(self):
        return f"Date: {self.value}"

class Time(DataField):
    def __init__(self, value):
        self.value = value

    def display(self):
        return f"Time: {self.value}"

class Temperature(DataField):
    def __init__(self, value):
        self.value = value

    def display(self):
        return f"Temperature: {self.value}°C"

class Status(DataField):
    def __init__(self, value):
        self.value = value

    def display(self):
        return f"Status: {self.value}"

class Address(DataField):
    def __init__(self, value):
        self.value = value

    def display(self):
        return f"Address: {self.value}"

class ID(DataField):
    def __init__(self, value):
        self.value = value

    def display(self):
        return f"ID: {self.value}"

class PhoneNumber(DataField):
    def __init__(self, value):
        self.value = value

    def display(self):
        return f"Phone Number: {self.value}"

class Email(DataField):
    def __init__(self, value):
        self.value = value

    def display(self):
        return f"Email: {self.value}"

class Price(DataField):
    def __init__(self, value):
        self.value = value

    def display(self):
        return f"Price: ${self.value}"

class Quantity(DataField):
    def __init__(self, value):
        self.value = value

    def display(self):
        return f"Quantity: {self.value}"

class Age(DataField):
    def __init__(self, value):
        self.value = value

    def display(self):
        return f"Age: {self.value} years"

class Gender(DataField):
    def __init__(self, value):
        self.value = value

    def display(self):
        return f"Gender: {self.value}"

class Weight(DataField):
    def __init__(self, value):
        self.value = value

    def display(self):
        return f"Weight: {self.value} kg"

class Height(DataField):
    def __init__(self, value):
        self.value = value

    def display(self):
        return f"Height: {self.value} m"


# Factory for creating DataField instances
class DataFieldFactory:
    @staticmethod
    def create_data_field(field_type, value):
        field_classes = {
            "name": Name,
            "date": Date,
            "time": Time,
            "temperature": Temperature,
            "status": Status,
            "address": Address,
            "id": ID,
            "phone_number": PhoneNumber,
            "email": Email,
            "price": Price,
            "quantity": Quantity,
            "age": Age,
            "gender": Gender,
            "weight": Weight,
            "height": Height,
        }
        
        field_class = field_classes.get(field_type.lower())
        if not field_class:
            raise ValueError(f"Unknown data field type: {field_type}")
        
        return field_class(value)


# Example usage
def display_data(fields):
    for field in fields:
        print(field.display())


Name: Charlie Day
Date: 2024-10-12
Time: 15:30:00
Temperature: 39.8°C
Status: Active
Address: 123 Main St
ID: A001
Phone Number: 555-1234
Email: charlie@example.com
Price: $19.99
Quantity: 5
Age: 23 years
Gender: Male
Weight: 70.5 kg
Height: 1.75 m


In [49]:
# Threshold for categorizing column names
CATEGORY_THRESHOLD = 10

def get_column_values(file_data, index):
    """
    Retrieves all values from a specified column index in the given data.
    
    Args:
        file_data (list): 2D list containing data.
        index (int): The index of the column to retrieve.

    Returns:
        list: A list of values from the specified column.
    """
    column_values = []
    for row in file_data[1:]:  # Skip the header row
        column_values.append(row[index])
    return column_values

def levenshtein_distance(str1, str2):
    """
    Calculates the Levenshtein distance between two strings.
    
    Args:
        str1 (str): The first string.
        str2 (str): The second string.

    Returns:
        int: The Levenshtein distance.
    """
    m, n = len(str1), len(str2)
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    # Initialize base cases
    for i in range(m + 1):
        dp[i][0] = i 
    for j in range(n + 1):
        dp[0][j] = j 

    # Compute the distances
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if str1[i - 1] == str2[j - 1] else 1 
            dp[i][j] = min(
                dp[i - 1][j] + 1,
                dp[i][j - 1] + 1,
                dp[i - 1][j - 1] + cost
            )
    
    return dp[m][n]  # Return the distance

def categorize_column(column_name, categories):
    """
    Categorizes a column based on the minimum Levenshtein distance 
    compared to known categories.
    
    Args:
        column_name (str): The name of the column to categorize.
        categories (list): A list of category names and variations.

    Returns:
        str: The category assigned to the column.
    """
    current_category = "etc"
    min_distance = CATEGORY_THRESHOLD

    for category_row in categories[1:]:  # Skip the header
        category_name = category_row[0]
        for other_name in category_row[1:]:
            distance = levenshtein_distance(other_name.lower(), column_name.lower())
            if distance < min_distance:
                min_distance = distance
                current_category = category_name
                
    return current_category

def categorize_all_columns(data_table, categories):
    """
    Categorizes all columns in the data table based on the defined categories.
    
    Args:
        data_table (list): The 2D list representing the data table.
        categories (list): The categories to compare against.

    Returns:
        list: A list of DataField instances for each categorized column.
    """
    table_categories = []
    
    # Iterate over column names
    for j, column_name in enumerate(data_table[0]):
        current_category = categorize_column(column_name, categories)
        column_values = get_column_values(data_table, j)
        
        # Create a DataField instance for the categorized column
        table_categories.append(DataFieldFactory.create_data_field(current_category, column_values))

    print("Column Names:", data_table[0])
    print("Assigned Categories:", table_categories)
    return table_categories

# Example usage
categories = open_csv_file('categories_data.csv')
master_categories = categorize_all_columns(master_data, categories)
dependent_table = categorize_all_columns(dependent_data, categories)

# Convert the first master category for demonstration
master_categories[0].convert()
print('Processing completed.')

Column Names: ['Name', 'Temperature', 'Time', 'Date', 'Age (years)']
Assigned Categories: [<__main__.Name object at 0x0000012BF71F6810>, <__main__.Temperature object at 0x0000012BF5204620>, <__main__.Time object at 0x0000012BF4EB9700>, <__main__.Date object at 0x0000012BF52044A0>, <__main__.Age object at 0x0000012BF5204740>]
Column Names: ['Full Name', 'Temp', 'Timestamp', 'Day Recorded', 'Status']
Assigned Categories: [<__main__.Name object at 0x0000012BF60AF5F0>, <__main__.Temperature object at 0x0000012BF60AE360>, <__main__.Time object at 0x0000012BF5206090>, <__main__.Time object at 0x0000012BF5204800>, <__main__.Status object at 0x0000012BF60AED80>]
Processing completed.


In [22]:
def get_column_values(file_data):
    
    for i in range(len(file_data[0]) - 1):
        column_values = []
        for row in file_data[1:]:
            column_values.append(row[i])
        print(column_values)
    return column_values


column_data = get_column_values(master_data)


['Charlie', 'Charlie', 'Bob', 'Eva', 'Eva', 'Eva', 'Alice', 'David', 'Eva', 'Bob']
['39.8', '36.2', '36.5', '36.3', '38.2', '38.5', '39', '39.6', '36.1', '35.2']
['21:47:43', '3:10:43', '18:35:43', '19:31:43', '15:51:43', '7:03:43', '16:21:43', '4:16:43', '3:43:43', '16:59:43']
['9/4/2024', '9/14/2024', '9/28/2024', '7/28/2024', '9/5/2024', '4/12/2024', '4/16/2024', '3/27/2024', '6/15/2024', '6/27/2024']
