In [1]:
import pandas as pd

# 1. Read the raw file with no parsing (just one column)
with open("Dataset.csv", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Remove every double quote and the surrounding whitespace from each line,
# then discard lines that end up empty (for example a trailing newline at the
# end of the file) so they are not turned into bogus header/data rows.
cleaned_lines = [line.replace('"', '').strip() for line in lines]
cleaned_lines = [line for line in cleaned_lines if line]
if not cleaned_lines:
    raise ValueError("Dataset.csv contains no header line")

# 2. First non-empty line is the header
# NOTE: fields are split on every comma, so a value that itself contains a
# comma is not supported by this loader.
header = cleaned_lines[0].split(',')

# 3. The rest are the rows
data = [line.split(',') for line in cleaned_lines[1:]]

# 4. Create the cleaned DataFrame (every value is still a string here)
df = pd.DataFrame(data, columns=header)

# 5. dataset overview
# print(df.shape)
# print(df.columns) # list all columns names
# print(df.isnull().sum())
# df.head()
# (df.isnull().sum() / len(df) * 100).sort_values(ascending=False)
# print(df.info()) # overview about the dataset
# print(df.dtypes) # Check data types of columns
# print(df.columns.tolist()) # check column names

# 6. Cast the lab-measurement columns from object (string) to numeric dtype.
#    Values that cannot be parsed as numbers become NaN instead of raising.
numeric_columns = [
    "lbxsal", "lbdsalsi", "lbxsatsi", "lbxsassi", "lbxsapsi",
    "lbxsbu", "lbdsbusi", "lbxsca", "lbdscasi", "lbxsck", "lbxsch",
    "lbdschsi", "lbxsc3si", "lbxscr", "lbdscrsi", "lbxsgtsi", "lbxsgl",
    "lbdsglsi", "lbxsir", "lbdsirsi", "lbxsldsi", "lbxsph", "lbdsphsi",
    "lbxstb", "lbdstbsi", "lbxstp", "lbdstpsi", "lbxsua", "lbdsuasi",
    "lbxsnasi", "lbxsksi", "lbxsclsi", "lbxsossi", "lbxsgb", "lbdsgbsi",
    "lbxstr", "lbdstrsi",
]

# Convert all listed columns in one column-wise pass.
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors="coerce")

# 'seqn' is only an identifier, so keep a copy of the frame without it.
data = df.drop("seqn", axis="columns")

# 7. Normal ranges
# Maps each lab column to its (low, high) reference interval; a value below
# `low` or above `high` is treated as abnormal by the normalization step.
# Insertion order matters: it becomes the column order of the normalized table.
# NOTE(review): reference intervals differ between laboratories and
# populations - confirm every interval against the source the study follows.
normal_ranges = {
    # NOTE(review): in the NHANES biochemistry profile 'lbxsal' / 'lbdsalsi'
    # are albumin (g/dL and g/L), not sodium. They previously carried the
    # sodium interval (135, 145), identical to 'lbxsnasi', which would flag
    # ordinary albumin values as low. Verify against the codebook in use.
    'lbxsal': (3.4, 5.4),
    'lbdsalsi': (34, 54),
    'lbxsassi': (10, 40),
    'lbxsapsi': (44, 147),
    'lbxsbu': (7, 20),
    'lbdsbusi': (2.5, 7.1),
    'lbxsca': (8.5, 10.2),
    'lbdscasi': (2.12, 2.55),
    'lbxsck': (22, 198),
    'lbxsch': (0, 200),
    'lbdschsi': (0, 5.17),
    'lbxsc3si': (22, 29),
    'lbxscr': (0.74, 1.35),
    'lbdscrsi': (65.4, 119.3),
    'lbxsgtsi': (9, 48),
    'lbxsgl': (70, 99),
    'lbdsglsi': (3.9, 5.5),
    'lbxsir': (60, 170),
    'lbdsirsi': (10.7, 30.4),
    'lbxsldsi': (140, 280),
    'lbxsph': (2.5, 4.5),
    'lbdsphsi': (0.81, 1.45),
    'lbxstb': (0.1, 1.2),
    'lbdstbsi': (1.71, 20.5),
    'lbxstp': (6.0, 8.3),
    'lbdstpsi': (60, 83),
    'lbxsua': (3.5, 7.2),
    'lbdsuasi': (208, 428),
    'lbxsnasi': (135, 145),
    'lbxsksi': (3.5, 5.1),
    'lbxsclsi': (98, 107),
    'lbxsossi': (275, 295),
    'lbxsgb': (2.0, 3.5),
    'lbdsgbsi': (20, 35),
    'lbxstr': (0, 150),
    'lbdstrsi': (0, 1.7),
    # NOTE(review): 'lbxsatsi' is the NHANES code for alanine aminotransferase
    # (ALT, U/L), not an iron-binding measure. The lower bound was 20, which
    # would mark many unremarkable ALT values as low - confirm the interval.
    'lbxsatsi': (7, 55),
}

# Function to normalize
def normalize(val, low, high):
    """Classify *val* against the inclusive reference interval [low, high].

    Returns -1 when val is below low, 1 when it is above high and 0 when it
    lies inside the interval (both bounds count as normal).

    A missing value (None / NaN, e.g. produced by ``errors="coerce"``) is
    returned as NaN. NaN compares false against both bounds, so without this
    guard a missing measurement would be reported as 0, i.e. "normal".
    """
    if pd.isna(val):
        return float("nan")
    if val < low:
        return -1
    if val > high:
        return 1
    return 0

# Table of the flags returned by normalize(), one column per entry of
# normal_ranges (columns appear in the dictionary's order).
# NOTE(review): only the first 3 rows of df are classified (head(3));
# confirm that this sample limit is intended before using the result for
# anything beyond a preview.
normalized_df = pd.DataFrame()

for column, bounds in normal_ranges.items():
    lower, upper = bounds
    sample = df[column].head(3)
    # The bounds are passed as extra positional arguments to normalize().
    normalized_df[column] = sample.apply(normalize, args=(lower, upper))

# Show result
# print(normalized_df)

# normalized_df.to_latex("dataset_table.tex", index=False)

# 8. data preprocessing - transactions
# Function to label values
def get_label(col, val):
    """Return the item label for one flagged result, or None if not flagged.

    A flag of -1 yields ``"<col>↓"``, a flag of 1 yields ``"<col>↑"``; any
    other value (including 0) produces None.
    """
    for flag, arrow in ((-1, "↓"), (1, "↑")):
        if val == flag:
            return f"{col}{arrow}"
    return None

# Create transactions: one list per row of normalized_df, holding only the
# labels of the flagged columns.
transactions = [
    [
        label
        for label in (
            get_label(name, record[name]) for name in normalized_df.columns
        )
        if label  # get_label() returns None for values that are not flagged
    ]
    for _, record in normalized_df.iterrows()
]

# Check first few
for sample_transaction in transactions[:5]:
    print(sample_transaction)



NameError: name 'apriori' is not defined