In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split

# Function to handle missing values
def handle_missing_values(dataset):
    if dataset.isnull().any().any():
        numerical_cols = dataset.select_dtypes(include=['number']).columns
        categorical_cols = dataset.select_dtypes(exclude=['number']).columns

        imputer_numeric = SimpleImputer(strategy='mean')
        dataset[numerical_cols] = imputer_numeric.fit_transform(dataset[numerical_cols])

        imputer_categorical = SimpleImputer(strategy='most_frequent')
        dataset[categorical_cols] = imputer_categorical.fit_transform(dataset[categorical_cols])

    return dataset

# Function to categorize the dependent variable
def categorize_column(dataset, dependent_column_name, num_intervals):
    # Ensure the dependent column exists in the dataset
    if dependent_column_name not in dataset.columns:
        raise ValueError(f"Dependent column '{dependent_column_name}' does not exist in the dataset.")

    # Fill NaN values with the mean of the column
    dataset_filled = dataset.fillna(dataset[dependent_column_name].mean())

    # Extract the specified dependent column
    value_ag_filled = dataset_filled[dependent_column_name]

    # Check if the column contains numeric data
    if not pd.api.types.is_numeric_dtype(value_ag_filled):
        raise ValueError(f"The '{dependent_column_name}' column must contain numeric data.")

    # Scale to the target range [0, 1] using np.interp
    value_ag_scaled_filled = np.interp(value_ag_filled, (value_ag_filled.min(), value_ag_filled.max()), (0, 1))

    # Create intervals and assign labels for scaled values
    data_inter_filled = pd.cut(value_ag_scaled_filled, bins=num_intervals, labels=[chr(ord('A') + i) for i in range(num_intervals)])

    # Store intervals in the dataset
    dataset['Category'] = data_inter_filled

    return dataset

# Function to calculate Jaccard similarity
def calculate_jaccard_similarity(Data_binary, one_hot_encoded):
    num_rows_binary = Data_binary.shape[0]
    num_rows_encoded = one_hot_encoded.shape[0]
    result_matrix = np.zeros((num_rows_binary, num_rows_encoded))

    for i in range(num_rows_binary):
        for j in range(num_rows_encoded):
            set_row_binary = set(Data_binary.iloc[i, 1:])
            set_row_encoded = set(one_hot_encoded.iloc[j, :])
            intersection_size = len(set_row_binary.intersection(set_row_encoded))
            union_size = len(set_row_binary.union(set_row_encoded))
            similarity = intersection_size / union_size if union_size != 0 else 0
            result_matrix[i, j] = similarity

    return pd.DataFrame(result_matrix, columns=one_hot_encoded.index)

# Transform matrix for Jaccard similarity
def transform_matrix(matrix):
    transformed_matrix = matrix.copy()
    transformed_matrix[transformed_matrix > 0.50] = 1
    transformed_matrix[transformed_matrix <= 0.50] = 0
    return transformed_matrix

# Objective function for LMVM
def objective_function(weights, SX, OS):
    P = np.exp(SX.dot(weights))
    P /= np.sum(P)
    loss = -np.sum(OS * np.log(P + 1e-10))
    return loss

# Gradient function for LMVM
def gradient_function(weights, SX, OS):
    P = np.exp(SX.dot(weights))
    P /= np.sum(P)
    gradients = SX.T @ (P - OS)
    return gradients

# Optimize weights using LMVM
def optimize_weights(SX, OS):
    initial_weights = np.random.normal(0, 0.01, size=SX.shape[1])
    result = minimize(
        fun=objective_function,
        x0=initial_weights,
        args=(SX, OS),
        method='L-BFGS-B',
        jac=gradient_function
    )
    return result.x

# Function to calculate weights for each category (using LMVM)
def calculate_weights_for_category(category, Data_binary, one_hot_encoded):
    jaccard_similarity_matrix = calculate_jaccard_similarity(Data_binary, one_hot_encoded)
    transformed_matrix = transform_matrix(jaccard_similarity_matrix)
    SXmat = csr_matrix(transformed_matrix)

    # Optimize weights using LMVM
    weights = optimize_weights(SXmat, np.ones(SXmat.shape[0]) / SXmat.shape[0])

    return weights

# Function to calculate P(x/c) after weights are optimized
def calculate_x_by_c(X, one_hot_encoded, category_weights):
    jaccard_result = calculate_jaccard_similarity(X, one_hot_encoded)
    x_by_c = pd.DataFrame(index=X.index)

    for index, row in X.iterrows():
        try:
            category = row['Category']
            jaccard_row = jaccard_result.iloc[index].values.reshape(1, -1)

            category_info = category_weights.get(category)
            if category_info is None:
                raise ValueError(f"No weights found for category: {category}")
            weights = category_info['weights']
            weights = np.expand_dims(weights, axis=0)

            # Perform matrix multiplication directly with weights
            row_value = np.dot(jaccard_row, weights.T).squeeze()

            # Store result
            x_by_c.loc[index, 'P(x/c)'] = row_value

        except ValueError as e:
            print(f"Error in row {index}: {e}")
            raise

    return x_by_c

# Calculate category probabilities
def calculate_category_probabilities(data_frame, category_column):
    category_counts = data_frame[category_column].value_counts()
    category_probabilities = category_counts / len(data_frame)

    return category_counts, pd.DataFrame({
        category_column: category_probabilities.index,
        'Probability': category_probabilities.values
    })

# Calculate maximum posterior category
def calculate_max_posterior_category(binary_dataset, x_by_c_values, record_prob_per_category):
    predicted_categories = []
    for index, row_values in x_by_c_values.iterrows():
        p_x_c = row_values['P(x/c)']
        category_probs = p_x_c * record_prob_per_category['Record_Prob_Per_Category']
        predicted_category = record_prob_per_category['Category'][category_probs.idxmax()]
        predicted_categories.append(predicted_category)

    binary_dataset['Predicted_Category'] = predicted_categories
    return binary_dataset

# Calculate RMSE for the predictions
def calculate_rmse(X_test, intervals_df):
    prediction = calculate_max_posterior_category(Data_binary, x_by_c_values, record_prob_per_category)
    predicted_column = prediction['Predicted_Category']

    sum_squared_error = 0.0
    for i, row in X_test.iterrows():
        actual_category = row['Category']
        predicted_category = predicted_column[i]

        actual_max_value = intervals_df[intervals_df['Category'] == actual_category]['Max_Value'].values[0]
        predicted_max_value = intervals_df[intervals_df['Category'] == predicted_category]['Max_Value'].values[0]

        squared_difference = (predicted_max_value - actual_max_value) ** 2
        sum_squared_error += squared_difference

    mean_squared_error = sum_squared_error / len(X_test)
    rmse = np.sqrt(mean_squared_error)
    return rmse

# Main execution
dataset = input("Enter the path to the dataset CSV file: ")
concrete = pd.read_csv(dataset)

# Specify the dependent column correctly
dependent = 'T20BOLT'

# Check the structure of the DataFrame
print("Columns in concrete DataFrame:", concrete.columns)

# Categorize the dependent variable
num_intervals = 10
concrete = categorize_column(concrete, dependent, num_intervals)

# Prepare the data for training
X = concrete.drop(dependent, axis=1)
y = concrete[dependent]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Load one-hot encoded frequent itemsets
frequent_itemset_file_path = input("Enter the path to the frequent itemset CSV file: ")
column_names = ['Frequent_Itemsets_CHARM']
freq_item = pd.read_csv(frequent_itemset_file_path, names=column_names)
one_hot_encoded = pd.get_dummies(freq_item['Frequent_Itemsets_CHARM'].str.split(expand=True), prefix='', prefix_sep='')

# Calculate weights for each category
categories_weights = {}
categories = X_train['Category'].unique()

for category in categories:
    weights = calculate_weights_for_category(category, X_train, one_hot_encoded)
    categories_weights[category] = {'weights': weights}

# Calculate P(x/c) for the test set
x_by_c_values = calculate_x_by_c(X_test, one_hot_encoded, categories_weights)

# Calculate category probabilities and record probabilities
category_counts, category_probabilities_df = calculate_category_probabilities(X_train, 'Category')
record_prob_per_category = pd.DataFrame({
    'Category': np.sort(X_train['Category'].unique()),
    'Record_Prob_Per_Category': 1 / len(X_train) / category_probabilities_df['Probability']
})

# Calculate RMSE
result


Columns in concrete DataFrame: Index(['RUN', 'SPEED1', 'TOTAL', 'SPEED2', 'NUMBER2', 'SENS', 'TIME',
       'T20BOLT'],
      dtype='object')


IndexError: single positional indexer is out-of-bounds