# UFCFVQ-15-M Programming for Data Science (Autumn 2023)


## Student Id:

# Programming Task 1

### Requirement FR1 - Develop a function to read a single column from a CSV file

In [None]:
import csv

# Functional Requirement #1
# This function takes two arguments: 'file_path' for the CSV file and 'column_index'
# for the index of the column that needs to be extracted. It uses the csv module to 
# work with CSV files. The function returns two values: 'column_name' and 'column_data'.

def extract_column_data(file_path, column_index):
    """Read a single column from a CSV file that starts with a header row.

    Args:
        file_path: Path of the CSV file to read.
        column_index: Zero-based position of the column to extract.

    Returns:
        A ``(column_name, column_data)`` tuple: the header text of the column
        and a list holding that column's value (as a string) from every
        non-blank data row, in file order.

    Raises:
        ValueError: If the file is empty and therefore has no header row.
        IndexError: If the header row, or any non-blank data row, has no
            field at ``column_index``.
    """
    # utf-8-sig strips a leading BOM (Byte Order Mark) so it does not end up
    # in the first column name. newline='' lets the csv module do its own
    # line-ending handling, which matters for newlines inside quoted fields.
    with open(file_path, 'r', encoding='utf-8-sig', newline='') as csv_file:
        csv_reader = csv.reader(csv_file)

        # Passing a default to next() avoids leaking StopIteration to the
        # caller when the file is empty.
        header_row = next(csv_reader, None)
        if header_row is None:
            raise ValueError(f"CSV file '{file_path}' is empty; no header row found.")

        column_name = header_row[column_index]

        # csv.reader yields an empty list for a blank line (for example a
        # trailing newline at the end of the file); skip those rows instead
        # of failing on row[column_index].
        column_data = [row[column_index] for row in csv_reader if row]

    return column_name, column_data


### Requirement FR2 - Develop a function to read CSV data from a file into memory

In [None]:
import csv

# Functional Requirement #2
# This function reads every column of the CSV file into a dictionary. It relies on
# the previous function extract_column_data() to get each column's name and data,
# and stores them in the following format: {'column_name': ['list_of_data']}

def read_csv_into_dict(file_path):
    """Read a whole CSV file into a dictionary of columns.

    The header row determines how many columns there are; each column is then
    loaded with extract_column_data().

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A dictionary mapping each column name to the list of values returned
        by extract_column_data() for that column. If two columns share a
        name, the later column replaces the earlier one.

    Raises:
        ValueError: If the file is empty and therefore has no header row.
    """
    # Read only the header here, using the same utf-8-sig encoding that
    # extract_column_data() uses so both functions see the same header row
    # (no stray BOM, no locale-dependent decoding). The file is closed again
    # before the columns are read.
    with open(file_path, 'r', encoding='utf-8-sig', newline='') as csv_file:
        header_row = next(csv.reader(csv_file), None)

    if header_row is None:
        raise ValueError(f"CSV file '{file_path}' is empty; no header row found.")

    data_dict = {}

    # NOTE: extract_column_data() re-reads the file for every column, so the
    # file is parsed once per column.
    for column_index in range(len(header_row)):
        column_name, column_data = extract_column_data(file_path, column_index)
        data_dict[column_name] = column_data

    return data_dict


### Requirement FR3 - Develop a function to calculate the Kendall Tau Correlation Coefficient for two lists of data

In [None]:
# Functional Requirement #3
# Kendall's Tau measures the strength of the correlation between two variables.
# It is a rank-based (non-parametric) correlation coefficient, calculated from
# the number of concordant and discordant pairs of observations.

def kendall_tau_coefficient(list_a, list_b):
    """Calculate the Kendall Tau (tau-a) correlation coefficient of two lists.

    Every pair of observations is classified as concordant (both variables
    move in the same direction), discordant (they move in opposite
    directions) or tied (at least one variable is equal; counted as neither).
    The coefficient is ``(concordant - discordant) / (n * (n - 1) / 2)``,
    which lies between -1 (perfect disagreement) and 1 (perfect agreement).

    Args:
        list_a: Observations of the first variable.
        list_b: Observations of the second variable, in the same order.

    Returns:
        The tau-a coefficient as a float.

    Raises:
        ValueError: If the lists differ in length, or contain fewer than two
            observations (no pair exists, so the coefficient is undefined).
    """
    from itertools import combinations

    if len(list_a) != len(list_b):
        raise ValueError("Lists must be of equal length.")

    observation_count = len(list_a)
    if observation_count < 2:
        raise ValueError("At least two observations are required.")

    concordant_pairs = discordant_pairs = 0

    # combinations() visits every unordered pair of observations exactly once.
    for (a_first, b_first), (a_second, b_second) in combinations(zip(list_a, list_b), 2):
        a_rises = a_first < a_second
        a_falls = a_first > a_second
        b_rises = b_first < b_second
        b_falls = b_first > b_second

        if (a_rises and b_rises) or (a_falls and b_falls):
            concordant_pairs += 1
        elif (a_rises and b_falls) or (a_falls and b_rises):
            discordant_pairs += 1
        # Otherwise the pair is tied in at least one variable and counts
        # towards neither total.

    # Divide by the total number of pairs, n(n-1)/2, so the result stays
    # within [-1, 1] and is well defined even when every pair is tied.
    total_pairs = observation_count * (observation_count - 1) / 2

    return (concordant_pairs - discordant_pairs) / total_pairs


### Requirement FR4 - Develop a function to generate a set of Kendall Tau Correlation Coefficients for a data structure like the one generated in FR2

In [None]:
# Functional Requirement #4
# This function takes the dictionary produced by read_csv_into_dict() as its
# argument and calculates the Kendall Tau coefficient for every pair of columns. For example,
# [('Cancer', 'Cardiovascular', 0.75), ('Cancer', 'Stroke', 0.60)] and so on.
# Returns the results as a list of tuples, each containing a column pair and its coefficient.

def generate_kendall_coefficients(data_dict):
    """Calculate the Kendall Tau coefficient for every pair of columns.

    Args:
        data_dict: Dictionary mapping column names to lists of values, such
            as the one returned by read_csv_into_dict().

    Returns:
        A list of ``(first_column, second_column, coefficient)`` tuples, one
        per unordered pair of columns, in the dictionary's column order.
    """
    from itertools import combinations

    def _as_numeric(values):
        # Values read from a CSV file are strings, and strings compare
        # lexicographically ("100" < "20"), which would rank numeric data
        # incorrectly. Convert the column to floats when every string in it
        # parses as a number; otherwise leave the column untouched.
        try:
            return [float(value) if isinstance(value, str) else value for value in values]
        except ValueError:
            return values

    # Convert each column once up front rather than once per pair.
    comparable_columns = {name: _as_numeric(values) for name, values in data_dict.items()}

    # combinations() yields each unordered pair of column names once, in the
    # same order as a nested "for i / for j in range(i + 1, ...)" loop.
    return [
        (
            first_name,
            second_name,
            kendall_tau_coefficient(
                comparable_columns[first_name], comparable_columns[second_name]
            ),
        )
        for first_name, second_name in combinations(comparable_columns, 2)
    ]


### Requirement FR5 - Develop a function to print a custom table for selected data from a data structure like the one generated in FR4

In [None]:
# Functional Requirement #5
# This function prints the correlations in a grid format with a default border character '*',
# which can be replaced via the 'border_char' argument. Similarly, the 'columns' argument can
# be None to display all columns, or a list to display only the specified columns.

def display_custom_table(list_of_pairs, border_char='*', columns=None):
    column_names = list({pair[0] for pair in list_of_pairs})
    
    # Print header (empty space) row
    print(f"{'':<15}", end=' ')
    
    # Print all column categories
    for col in column_names:
        print(f"{col:>15}", end=' ')
        
    print()

    # Print border row
    for _ in range(len(column_names) + 1):
        print(f"{border_char*15:>15}", end=' ')
    
    print()

    for row in column_names:
        if columns is None or row in columns:
            print(f"{row:<15}", end=' ')
        else:
            continue
        
        for col in column_names:
            if row == col:
                print(" "*15, end=' ')
            else:
                # Get coefficient value from the list_of_pairs
                coefficient = next((pair[2] for pair in list_of_pairs if pair[0] == row and pair[1] == col), None)
                if columns is not None:
                    coefficient = next((pair[2] for pair in list_of_pairs if pair[0] == row and pair[1] == col and row in columns), None)                
                if coefficient is not None:
                    print(f"{coefficient:>15.4f}", end=' ')
                else:
                    print(" "*15, end=' ')
        print()

    # Print border row
    for _ in range(len(column_names) + 1):
        print(f"{border_char*15:>15}", end=' ')
