In [160]:
import json
import warnings

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif, f_regression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [161]:
import json

# Parse the UI-exported parameter file into a plain dict and echo it back,
# pretty-printed, so the nesting can be inspected before it is used.
config_path = '/content/algoparams_from_ui.json.json'
with open(config_path, 'r') as config_file:
    config = json.load(config_file)
print(json.dumps(config, indent=4))


{
    "session_name": "test",
    "session_description": "test",
    "design_state_data": {
        "session_info": {
            "project_id": "1",
            "experiment_id": "kkkk-11",
            "dataset": "IRIS.csv",
            "session_name": "test",
            "session_description": "test"
        },
        "target": {
            "prediction_type": "Regression",
            "target": "petal_width",
            "type": "regression",
            "partitioning": true
        },
        "train": {
            "policy": "Split the dataset",
            "time_variable": "sepal_length",
            "sampling_method": "No sampling(whole data)",
            "split": "Randomly",
            "k_fold": false,
            "train_ratio": 0,
            "random_seed": 0
        },
        "metrics": {
            "optomize_model_hyperparameters_for": "AUC",
            "optimize_threshold_for": "F1 Score",
            "compute_lift_at": 0,
            "cost_matrix_gain_for_true_predictio

In [162]:
# The dataset file name lives under design_state_data -> session_info -> dataset.
session_info = config['design_state_data']['session_info']
dataset_path = session_info['dataset']
# The path is used exactly as written in the config (resolved by pandas/the OS).
df = pd.read_csv(dataset_path)

1) Read the target and type of regression to be run


In [163]:
# Pull the "target" section; missing levels fall back to an empty dict
# so the lookups in the next cell return None instead of raising.
design_state = config.get('design_state_data', {})
target_data = design_state.get('target', {})

In [164]:
# Read the three target settings in one pass; each is None when its key is absent.
prediction_type, target_column, target_type = (
    target_data.get(key) for key in ('prediction_type', 'target', 'type')
)

In [165]:
# Echo the parsed target settings as "label value" lines.
for label, value in (
    ("Prediction Type:", prediction_type),
    ("Target Column:", target_column),
    ("Regression Type:", target_type),
):
    print(label, value)

Prediction Type: Regression
Target Column: petal_width
Regression Type: regression


2) Read the features (which are column names in the CSV), figure out which missing-value imputation needs to be applied, and apply it to the columns loaded in a dataframe

In [166]:
# Per-feature settings, keyed by feature name; empty dict when the section is absent.
design_state_data = config.get('design_state_data', {})
feature_handling = design_state_data.get('feature_handling', {})

In [167]:
# Fill missing values column by column, following each selected feature's
# "impute_with" setting. Features that are not selected, or whose strategy is
# neither "Average of values" nor "custom", are left untouched.
for feature_name, feature_details in feature_handling.items():
    if not feature_details.get('is_selected', False):
        continue

    feature_detail_data = feature_details.get('feature_details', {})
    imputation_strategy = feature_detail_data.get('impute_with')

    if imputation_strategy == "Average of values":
        # Replace gaps with the column mean (computed over the whole dataframe).
        fill_value = df[feature_name].mean()
    elif imputation_strategy == "custom":
        # Replace gaps with the configured constant, defaulting to 0.
        fill_value = feature_detail_data.get('impute_value', 0)
    else:
        continue

    df[feature_name] = df[feature_name].fillna(fill_value)

In [168]:
# Show the first rows of the dataframe after imputation as plain text.
head_rows = df.head()
print(head_rows)

   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


3) Compute feature reduction based on input. See the screenshot below where there can be No Reduction, Corr with Target, Tree-based, PCA. Please make sure you write code so that all options can work. If we rerun your code with a different Json it should work if we switch No Reduction to say PCA.

In [154]:
from sklearn.decomposition import PCA

In [155]:
# Step 3: Feature Reduction
# Build the pipeline's "reducer" step from the feature_reduction section of the
# config. The task names four options -- "No Reduction", "Corr with Target",
# "Tree-based" and "PCA" -- and each one is mapped to a scikit-learn transformer
# (or to "passthrough", which makes the Pipeline skip the step), so switching
# the method in the JSON is enough to change the behaviour.
feature_reduction_config = config['design_state_data']['feature_reduction']
reduction_method = feature_reduction_config['feature_reduction_method']

# Correlation scores and tree models differ between regression and
# classification, so look up the prediction type straight from the config.
_target_config = config.get('design_state_data', {}).get('target', {})
_is_regression = str(_target_config.get('prediction_type', 'Regression')).lower() == 'regression'

if reduction_method in ('PCA', 'Corr with Target', 'Tree-based'):
    # Only the reducing methods need the feature count; it may be stored as a
    # string in the JSON, hence the int() conversion.
    n_components = int(feature_reduction_config['num_of_features_to_keep'])

if reduction_method == 'PCA':
    reducer = PCA(n_components=n_components)
elif reduction_method == 'Corr with Target':
    # Univariate selection: f_regression ranks features by their linear
    # correlation with the target; f_classif is the classification counterpart.
    reducer = SelectKBest(
        score_func=f_regression if _is_regression else f_classif,
        k=n_components,
    )
elif reduction_method == 'Tree-based':
    # Keep the n most important features according to a random forest.
    # threshold=-inf disables the importance cut-off so max_features alone decides.
    # NOTE(review): the config may carry tree count/depth settings for this
    # option; they are not visible here -- confirm and pass them to the forest.
    _forest_class = RandomForestRegressor if _is_regression else RandomForestClassifier
    reducer = SelectFromModel(
        _forest_class(random_state=42),
        max_features=n_components,
        threshold=-float('inf'),
    )
else:
    if reduction_method != 'No Reduction':
        # Unknown names keep the previous fall-through behaviour, but no longer silently.
        warnings.warn(
            f"Unrecognised feature_reduction_method {reduction_method!r}; "
            "no feature reduction will be applied."
        )
    reducer = "passthrough"

4) Parse the JSON and make the model objects (using sklearn) that can handle what is required in the “prediction_type” specified in the JSON (see #1, where “prediction_type” is specified). Keep in mind not to pick models that don’t apply to the prediction_type specified

In [194]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

In [195]:
# Separate predictors from the response: X holds every column except the
# configured target, y holds the target column itself.
X = df.drop(target_column, axis=1)   # independent variables
y = df.loc[:, target_column]         # dependent variable

In [196]:
# Encode a non-numeric target (classification problem) as integer class labels.
# Checking "not numeric" rather than dtype == 'object' also covers pandas
# category and string dtypes, which the object-only test would let through
# unencoded. fit_transform returns a NumPy array, so y stops being a Series
# after encoding; the check still works on that array if the cell is re-run.
if not pd.api.types.is_numeric_dtype(y):
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

In [197]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split size and seed are hard-coded here rather than taken
# from the "train" section of the config -- confirm that is intended.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [198]:
# Partition the predictor columns by dtype: object columns are treated as
# categorical, everything else as numeric.
object_dtype = ['object']
categorical_columns = X.select_dtypes(include=object_dtype).columns
numeric_columns = X.select_dtypes(exclude=object_dtype).columns

Encoding the data columns and scaling them

In [199]:
# Column-wise preprocessing applied inside the model pipeline:
#   * numeric columns     -> mean imputation, then standard scaling
#   * categorical columns -> most-frequent imputation, then one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),  # Impute numeric features with mean
            ('scaler', StandardScaler()),  # Scale numeric features
        ]), numeric_columns),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute categorical features with most frequent
            # handle_unknown='ignore': a category that appears only in a
            # validation fold or in the test set is encoded as all zeros
            # instead of raising during cross-validation / prediction.
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ]), categorical_columns),
    ],
    # Always emit a dense array so whichever reducer follows (e.g. PCA)
    # receives the same kind of input regardless of how sparse the one-hot
    # block turns out to be.
    sparse_threshold=0,
)

Created a Pipeline

In [200]:
# Chain preprocessing, the configured feature reducer and the estimator so
# that every step is re-fitted inside each cross-validation fold.
# NOTE(review): the estimator is fixed to LinearRegression here regardless of
# the prediction_type read from the config.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('reducer', reducer),            # "passthrough" or the selected reducer
    ('model', LinearRegression()),   # regression estimator
]
pipeline = Pipeline(steps=pipeline_steps)

In [203]:
# Search space for the grid search. The "model__" prefix routes the
# parameter to the pipeline step named 'model'.
fit_intercept_options = [True, False]
param_grid = {'model__fit_intercept': fit_intercept_options}

In [204]:
# 5-fold cross-validated grid search over param_grid, scored by negative MSE
# (scikit-learn maximises scores, hence the negated error).
search_settings = {'cv': 5, 'scoring': 'neg_mean_squared_error', 'verbose': 1}
grid_search = GridSearchCV(pipeline, param_grid, **search_settings)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


In [205]:
# Report the winning hyperparameters, then predict on the held-out test rows
# with the pipeline refitted on the full training split.
print("Best Parameters:", grid_search.best_params_)
best_pipeline = grid_search.best_estimator_
y_pred = best_pipeline.predict(X_test)

Best Parameters: {'model__fit_intercept': True}


In [206]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the held-out predictions: mean squared error and R^2.
test_mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", test_mse)
test_r2 = r2_score(y_test, y_pred)
print("R2 Score:", test_r2)

Mean Squared Error: 0.02934029441894716
R2 Score: 0.9538424636384967
