In [1]:
import json
from striprtf.striprtf import rtf_to_text

# Read the RTF-wrapped configuration; the file handle is only needed for the
# raw read, so the RTF markup is stripped after the file has been closed.
with open('algoparams_from_ui.json.rtf') as json_file:
    content = json_file.read()

file = rtf_to_text(content)
print(file)


{
    "session_name": "test",
    "session_description": "test",
    "design_state_data": {

      "session_info" : {
        "project_id": "1",
        "experiment_id": "kkkk-11",
        "dataset":"iris_modified.csv",
        "session_name": "test",
        "session_description": "test"
        },

      "target": {
        "prediction_type": "Regression",
        "target": "petal_width",
        "type":"regression",
        "partitioning": true
      },
      "train": {
        "policy": "Split the dataset",
        "time_variable": "sepal_length",
        "sampling_method": "No sampling(whole data)",
        "split": "Randomly",
        "k_fold": false,
        "train_ratio": 0,
        "random_seed": 0
      },
      "metrics": {
        "optomize_model_hyperparameters_for": "AUC",
        "optimize_threshold_for": "F1 Score",
        "compute_lift_at": 0,
        "cost_matrix_gain_for_true_prediction_true_result": 1,
        "cost_matrix_gain_for_true_prediction_false_result": 0,

In [2]:
# Parse the RTF-stripped text as JSON; `data` is the session configuration
# that the cells below read from.
data = json.loads(file)

In [3]:
#Read target and models
import pandas as pd

# Name of the column to predict. Later cells pass `target_column` to
# `dataset.drop(...)` / `dataset[...]`, so it has to be the column label
# itself rather than the whole "target" config section.
target_column = data['design_state_data']['target']['target']
# Per-algorithm settings, keyed by algorithm name.
algorithms = data['design_state_data']['algorithms']

In [4]:
# Load the data to model into a DataFrame.
# NOTE(review): the config's session_info lists "iris_modified.csv" as the
# dataset, while this reads 'iris.csv' — confirm which file is intended.
dataset = pd.read_csv('iris.csv')

In [17]:
import hashlib
from nltk.tokenize import word_tokenize
from collections import defaultdict

In [16]:
def tokenize(text):
    """Split ``text`` into tokens with ``word_tokenize`` and lowercase each one.

    Lowercasing keeps the hashes built from these tokens case-insensitive.
    """
    return [word.lower() for word in word_tokenize(text)]

In [22]:
def dataframe_hash(df, hash_columns=0):
    """Replace the strings in a DataFrame's object columns with SHA-256 digests.

    Every string found in an ``object``-dtype column is tokenized with
    ``tokenize``, the tokens are concatenated without a separator, and the
    UTF-8 bytes of that string are hashed. Columns of any other dtype are
    copied through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object columns should be hashed.
    hash_columns : int, default 0
        When positive, only the first ``hash_columns`` columns of the result
        are returned; otherwise every column is returned.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same column order as ``df``; hashed cells hold
        hexadecimal digest strings.
    """
    # Work out the text columns once instead of once per column.
    text_columns = set(df.select_dtypes(include=['object']).columns)

    hash_values = {}
    for col, values in df.to_dict(orient='list').items():
        if col not in text_columns:
            # For non-text columns, keep the original values.
            hash_values[col] = values
            continue

        hashed = []
        for value in values:
            if not isinstance(value, str):
                # Missing entries (None/NaN) and other non-string objects
                # cannot be tokenized; pass them through untouched instead
                # of crashing inside the tokenizer.
                hashed.append(value)
                continue
            tokens = tokenize(value)
            digest = hashlib.sha256(''.join(tokens).encode('utf-8')).hexdigest()
            hashed.append(digest)
        # Assigning the list explicitly keeps the column even when it is empty.
        hash_values[col] = hashed

    hashed_df = pd.DataFrame(hash_values)

    # If hash_columns is specified, keep only that many leading columns.
    if hash_columns > 0:
        hashed_df = hashed_df[list(hashed_df.columns)[:hash_columns]]

    return hashed_df





In [24]:
# Apply the per-feature preprocessing listed under "feature_handling".
if 'feature_handling' in data['design_state_data']:
    print("Yes")
    feature_handling = data['design_state_data']['feature_handling']

    for k, v in feature_handling.items():
        print(k)

        feature_name = v['feature_name']
        feature_details = v['feature_details']

        if 'missing_values' in feature_details:
            impute_strategy = feature_details['impute_with']
            # Start from the configured constant (this is also what the
            # "custom" strategy uses) and override it for mean imputation.
            impute_value = feature_details['impute_value']

            if impute_strategy == "Average of values":
                impute_value = dataset[feature_name].mean()

            # Assign the filled column back: `dataset[col].fillna(..., inplace=True)`
            # operates on a selected column and is not guaranteed to write
            # through to `dataset` (it does nothing under pandas Copy-on-Write).
            dataset[feature_name] = dataset[feature_name].fillna(impute_value)

        if 'text_handling' in feature_details:
            df = pd.DataFrame(dataset[feature_name])
            # NOTE(review): the hashed frame is computed but never stored back
            # into `dataset` — confirm whether it should replace the column.
            hashed_df = dataframe_hash(df, hash_columns=1)

Yes
sepal_length
sepal_width
petal_length
petal_width
species


In [31]:
# Count the missing values remaining in each column.
dataset.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [32]:
# Summary statistics (count, mean, std, quartiles, min/max) per numeric column.
dataset.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [26]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, FeatureHasher
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, f_regression

In [None]:
# Reduce the feature set according to the "feature_reduction" config section.
# Exactly one method is applied; `dataset` is rebound to the reduced frame
# (selected/derived feature columns followed by the target column).
# NOTE(review): apart from the correlation branch, all non-target columns are
# handed to scikit-learn as-is, so presumably they must already be numeric.
if 'feature_reduction' in data['design_state_data']:
    target_column = data['design_state_data']['target']['target']
    c = data['design_state_data']['feature_reduction']
    features = dataset.drop(target_column, axis=1)

    if 'No_reduction' in c:
        # NOTE(review): despite the name, this branch keeps only the k best
        # features by univariate F-test — confirm this is intended.
        k0 = c['No_reduction']['num_of_features_to_keep']
        k_best = SelectKBest(f_regression, k=k0)
        k_best.fit(features, dataset[target_column])
        selected_feature_indices = k_best.get_support(indices=True)
        selected_feature_names = features.columns[selected_feature_indices]
        dataset = dataset[selected_feature_names.tolist() + [target_column]]

    elif 'Corr with Target' in c:
        k0 = c['Corr with Target']['num_of_features_to_keep']
        # Correlation is only defined for numeric columns.
        corr_matrix = dataset.select_dtypes(include='number').corr()
        corr_with_target = corr_matrix[target_column].drop(target_column)
        # k0 is a feature count, not a correlation threshold: keep the k0
        # features with the largest absolute correlation to the target.
        selected_features = corr_with_target.abs().nlargest(k0).index.tolist()
        dataset = dataset[selected_features + [target_column]]

    elif 'Tree-based' in c:
        depth = c['Tree-based']['depth_of_trees']
        num = c['Tree-based']['num_of_trees']
        k0 = c['Tree-based']['num_of_features_to_keep']
        model = RandomForestRegressor(n_estimators=num, max_depth=depth)
        model.fit(features, dataset[target_column])
        # Rank features by the fitted forest's importances and keep the top
        # k0; label the importances with column names so the selection is by
        # name rather than by position.
        feature_importances = pd.Series(model.feature_importances_, index=features.columns)
        selected_features = feature_importances.nlargest(k0).index
        dataset = dataset[selected_features.tolist() + [target_column]]

    elif 'Principal Component Analysis' in c:
        k0 = c['Principal Component Analysis']['num_of_features_to_keep']
        pca = PCA(n_components=k0)
        reduced_features = pca.fit_transform(features)
        # Reuse the original index so the components line up row-for-row with
        # the target when the two are concatenated.
        reduced_dataset = pd.DataFrame(
            reduced_features,
            columns=[f'PC{i+1}' for i in range(k0)],
            index=dataset.index,
        )
        dataset = pd.concat([reduced_dataset, dataset[target_column]], axis=1)
        

In [37]:
# Separate the features (X) from the target (Y). The target column name comes
# from the config so this cell stays in step with the rest of the notebook
# instead of hard-coding 'petal_width'.
target_name = data['design_state_data']['target']['target']
X = dataset.drop(columns=[target_name])
Y = dataset[target_name]

In [38]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [47]:
from sklearn.ensemble import (
    ExtraTreesRegressor,
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
)
from sklearn.linear_model import (
    ElasticNet,
    Lasso,
    LinearRegression,
    LogisticRegression,
    Ridge,
    SGDClassifier,
    SGDRegressor,
)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier, XGBRegressor

In [None]:
def _build_model(algorithm, config):
    """Create the unfitted estimator configured for one "algorithms" entry.

    Parameters
    ----------
    algorithm : str
        Key of the entry in the "algorithms" section of the config.
    config : dict
        Settings stored under that key; only the keys read below are used.

    Returns
    -------
    estimator or None
        A scikit-learn / XGBoost estimator, or ``None`` when ``algorithm`` is
        not one of the names handled here.

    Notes
    -----
    Parameter values follow current scikit-learn naming (>= 1.1), e.g.
    ``'log_loss'``, ``'squared_error'`` and ``'absolute_error'``.
    """
    if algorithm == 'RandomForestClassifier':
        return RandomForestClassifier(
            n_estimators=config['min_trees'],
            max_depth=config['max_depth'],
            min_samples_split=config['min_samples_per_leaf_min_value'],
            min_samples_leaf=config['min_samples_per_leaf_max_value'],
            n_jobs=config['parallelism']
        )

    if algorithm == 'RandomForestRegressor':
        return RandomForestRegressor(
            n_estimators=config['min_trees'],
            max_depth=config['max_depth'],
            min_samples_split=config['min_samples_per_leaf_min_value'],
            min_samples_leaf=config['min_samples_per_leaf_max_value'],
            n_jobs=config['parallelism']
        )

    if algorithm == 'GBTClassifier':
        return GradientBoostingClassifier(
            n_estimators=config['num_of_BoostingStages'][0],
            learning_rate=config['learningRate'][0],
            subsample=config['min_subsample'],
            max_depth=config['max_depth'],
            random_state=config['random_state'],
            # Deviance vs. exponential is a choice of `loss`, not `criterion`;
            # 'log_loss' is the current name for the deviance loss.
            loss='log_loss' if config['use_deviance'] else 'exponential'
        )

    if algorithm == 'GBTRegressor':
        return GradientBoostingRegressor(
            n_estimators=config['num_of_BoostingStages'][0],
            learning_rate=config['learningRate'][0],
            subsample=config['min_subsample'],
            max_depth=config['max_depth']
        )

    if algorithm == 'LinearRegression':
        return LinearRegression(
            n_jobs=config['parallelism']
        )

    if algorithm == 'LogisticRegression':
        use_l1 = config['use_l1_regularization']
        use_elastic_net = config['use_elastic_net_regularization']
        return LogisticRegression(
            n_jobs=config['parallelism'],
            max_iter=config['max_iter'],
            C=config['min_regparam'],
            penalty='l1' if use_l1 else 'l2',
            # 'auto' is not a valid solver name, and the default solver does
            # not support an L1 penalty; 'saga' handles both L1 and L2.
            solver='saga' if (use_l1 or use_elastic_net) else 'lbfgs'
        )

    # NOTE(review): the three linear models below pass `max_regparam` as the
    # convergence tolerance `tol` — confirm this mapping is intended.
    if algorithm == 'RidgeRegression':
        return Ridge(
            max_iter=config['max_iter'],
            alpha=config['min_regparam'],
            tol=config['max_regparam']
        )

    if algorithm == 'LassoRegression':
        return Lasso(
            max_iter=config['max_iter'],
            alpha=config['min_regparam'],
            tol=config['max_regparam']
        )

    if algorithm == 'ElasticNetRegression':
        return ElasticNet(
            max_iter=config['max_iter'],
            alpha=config['min_regparam'],
            l1_ratio=config['min_elasticnet'],
            tol=config['max_regparam']
        )

    if algorithm == 'xg_boost':
        # NOTE(review): this always builds a classifier and maps
        # `use_gradient_boosted_tree` onto `use_label_encoder` — confirm both
        # against the intended prediction type and XGBoost version.
        return XGBClassifier(
            n_estimators=config['max_num_of_trees'],
            learning_rate=config['learningRate'][1],
            max_depth=config['max_depth_of_tree'][1],
            subsample=config['sub_sample'][0],
            colsample_bytree=config['col_sample_by_tree'][0],
            random_state=config['random_state'],
            use_label_encoder=False if config['use_gradient_boosted_tree'] else True
        )

    # NOTE(review): both decision trees pass the `use_random` flag as
    # `random_state` — confirm that a flag is meant to act as the seed.
    if algorithm == 'DecisionTreeClassifier':
        return DecisionTreeClassifier(
            max_depth=config['max_depth'],
            min_samples_split=config['min_samples_per_leaf'][0],
            min_samples_leaf=config['min_samples_per_leaf'][1],
            criterion='gini' if config['use_gini'] else 'entropy',
            splitter='best' if config['use_best'] else 'random',
            random_state=config['use_random']
        )

    if algorithm == 'DecisionTreeRegressor':
        return DecisionTreeRegressor(
            max_depth=config['max_depth'],
            min_samples_split=config['min_samples_per_leaf'][0],
            min_samples_leaf=config['min_samples_per_leaf'][1],
            # 'mse'/'mae' were renamed to 'squared_error'/'absolute_error'.
            # NOTE(review): the criterion is keyed on `use_best`, the same
            # flag that picks the splitter — confirm.
            criterion='squared_error' if config['use_best'] else 'absolute_error',
            splitter='best' if config['use_best'] else 'random',
            random_state=config['use_random']
        )

    if algorithm == 'SVM':
        if config['linear_kernel']:
            kernel = 'linear'
        elif config['polynomial_kernel']:
            kernel = 'poly'
        else:
            kernel = 'rbf'
        return SVC(
            C=config['c_value'][0],
            kernel=kernel,
            degree=3 if config['polynomial_kernel'] else 0,
            # Map each flag to the gamma mode of the same name.
            gamma='auto' if config['auto'] else 'scale',
            tol=config['tolerance'],
            max_iter=config['max_iterations']
        )

    if algorithm == 'SGD':
        # A logistic loss is only available on the classifier; the regressor
        # uses squared error. `l2_ratio` is not an SGD parameter and is
        # therefore not passed.
        use_logistics = config['use_logistics']
        sgd_class = SGDClassifier if use_logistics else SGDRegressor
        return sgd_class(
            loss='log_loss' if use_logistics else 'squared_error',
            penalty='elasticnet' if config['use_elastic_net_regularization'] else 'l2',
            alpha=config['alpha_value'][0],
            max_iter=config['max_iterations'],
            tol=config['tolerance'],
            l1_ratio=config['use_l1_regularization'],
            learning_rate='constant',
            eta0=0.01,  # You might need to adjust this based on your requirements
            early_stopping=True if config['use_elastic_net_regularization'] else False
        )

    if algorithm == 'KNN':
        return KNeighborsClassifier(
            n_neighbors=config['k_value'][0],
            weights='distance' if config['distance_weighting'] else 'uniform',
            algorithm=config['neighbour_finding_algorithm'],
            p=config['p_value']
        )

    if algorithm == 'extra_random_trees':
        return ExtraTreesRegressor(
            n_estimators=config['num_of_trees'][0],
            max_features=config['feature_sampling_statergy'],
            max_depth=config['max_depth'][0],
            min_samples_split=config['min_samples_per_leaf'][0],
            min_samples_leaf=config['min_samples_per_leaf'][1],
            n_jobs=config['parallelism']
        )

    if algorithm == 'neural_network':
        return MLPRegressor(
            hidden_layer_sizes=config['hidden_layer_sizes'],
            activation=config['activation'],
            alpha=config['alpha_value'][0],
            max_iter=config['max_iterations'],
            tol=config['convergence_tolerance'],
            solver=config['solver'],
            learning_rate_init=config['initial_learning_rate'],
            # The string 'None' is not a valid batch size. No explicit size
            # is read from the config here, so 'auto' is used either way.
            batch_size='auto'
        )

    return None


# Build, tune and evaluate every algorithm marked as selected in the config.
if 'algorithms' in data['design_state_data']:
    model_list = data['design_state_data']['algorithms']

    model_results = []

    for algorithm, config in model_list.items():
        if not config['is_selected']:
            continue

        model_name = config['model_name']
        model = _build_model(algorithm, config)
        if model is None:
            # Never fall through with an undefined or stale estimator.
            print(f"Skipping unsupported algorithm: {algorithm}")
            continue

        parameters = { #Add the custom parameters for the given model

        }
        # The search runs once per selected model (inside the loop) and only
        # on the training split, so the held-out rows stay unseen.
        grid_search = GridSearchCV(estimator=model, param_grid=parameters, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, Y_train)

        model_results.append({
            'model_name': model_name,
            'best_model': grid_search.best_estimator_,
            'best_params': grid_search.best_params_,
            'best_score': grid_search.best_score_
        })

    for result in model_results:
        model_name = result['model_name']
        best_model = result['best_model']
        best_params = result['best_params']
        best_score = result['best_score']

        print(f"Running {model_name} with best parameters: {best_params}")
        # GridSearchCV refits the best estimator on the data it was given
        # (X_train/Y_train), so it can predict directly. `predict` takes no
        # `axis` argument.
        predictions = best_model.predict(X_test)
        mse = mean_squared_error(Y_test, predictions)
        print(f"Mean Squared Error for {model_name}: {mse}")

          