### Import dependencies

In [6]:
import numpy as np
import pandas as pd
from azureml.core import Workspace

### Load Train and Test Data

In [7]:
#Load Secrets from key-vault 
keyVaultName = "mlops-with-azure"
KVUri = f"https://mlops-with-azure.vault.azure.net"

In [8]:
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient

credential = DefaultAzureCredential()
KVUri = f"https://mlops-with-azure.vault.azure.net"

secret_client = SecretClient(vault_url=KVUri, credential=credential)
secret = secret_client.get_secret("model-train-path")

print(secret.name)
model_train_path = secret.value

HttpResponseError: (Forbidden) Caller is not authorized to perform action on resource.
If role assignments, deny assignments or role definitions were changed recently, please observe propagation time.
Caller: appid=18a66f5f-dbdf-4c17-9dd7-1634712a9cbe;oid=9b525743-7417-4049-961e-9274eb72fd94;iss=https://sts.windows.net/6f10680d-f7ed-44b5-b77b-b9ff3f5ea798/
Action: 'Microsoft.KeyVault/vaults/secrets/getSecret/action'
Resource: '/subscriptions/aac067db-0216-47f9-a4fb-fb27eb1556ea/resourcegroups/mlops-with-azure/providers/microsoft.keyvault/vaults/mlops-with-azure/secrets/model-train-path'
Assignment: (not found)
DenyAssignmentId: null
DecisionReason: null 
Vault: mlops-with-azure;location=centralindia

Code: Forbidden
Message: Caller is not authorized to perform action on resource.
If role assignments, deny assignments or role definitions were changed recently, please observe propagation time.
Caller: appid=18a66f5f-dbdf-4c17-9dd7-1634712a9cbe;oid=9b525743-7417-4049-961e-9274eb72fd94;iss=https://sts.windows.net/6f10680d-f7ed-44b5-b77b-b9ff3f5ea798/
Action: 'Microsoft.KeyVault/vaults/secrets/getSecret/action'
Resource: '/subscriptions/aac067db-0216-47f9-a4fb-fb27eb1556ea/resourcegroups/mlops-with-azure/providers/microsoft.keyvault/vaults/mlops-with-azure/secrets/model-train-path'
Assignment: (not found)
DenyAssignmentId: null
DecisionReason: null 
Vault: mlops-with-azure;location=centralindia

Inner error: {
    "code": "ForbiddenByRbac"
}

In [4]:
df_train = pd.read_csv("https://mlopsazure7843230036.blob.core.windows.net/propensitymodeldatasets/testing_sample.csv")
df_train.head()

HTTPError: HTTP Error 409: Public access is not permitted on this storage account.

In [None]:
credential = DefaultAzureCredential()
KVUri = f"https://mlops-with-azure.vault.azure.net"

secret_client = SecretClient(vault_url=KVUri, credential=credential)
secret = secret_client.get_secret("model-test-path")

print(secret.name)
model_test_path = secret.value


df_test = pd.read_csv(model_test_path)
df_test.head()

### Check for Basic data checks

In [None]:
print(df_train.shape)
print('')
print(df_test.shape)

In [None]:
#Variable info
print(df_train.info())

In [None]:
#Variable info
print(df_test.info())

### EDA

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
corr = df_train.corr()
plt.figure(figsize=(16, 14))
sns.heatmap(corr, vmax=0.5, center=0,
            square=True, linewidths=2, cmap='Blues')
plt.savefig("heatmap.png")
plt.show()

In [None]:
df_train.corr()['ordered']

In [None]:
# Drop columns with High Corr to avoid Multicollinearity
df_train = df_train.drop(['checked_delivery_detail'], axis=1)

In [None]:
df_train.columns

### Feature Selection
- Variance Threshold check
- Select k best for top n features

In [None]:
id_target_col_list = ['ordered', 'UserID']
col_for_feature_selection = df_train.columns.difference(id_target_col_list)
print(len(col_for_feature_selection))

In [None]:
from sklearn.feature_selection import VarianceThreshold

var_thr = VarianceThreshold(threshold = 0.005) #Removing both constant and quasi-constant
var_thr.fit(df_train[col_for_feature_selection])

# Get the support mask
support = var_thr.get_support()

print("Support:", support)

In [None]:
# List of columns to be retained
print(len(df_train.columns))
print('')
retained_columns = df_train[col_for_feature_selection].columns[support]

# List of columns to be removed
remove_col_list = [col for col in col_for_feature_selection if col not in retained_columns]
print("Columns to be removed:", remove_col_list)

In [None]:
print(df_train.shape)
print('')
df_train = df_train.drop(remove_col_list, axis=1, errors='ignore')
print(df_train.shape)

In [None]:
df_train.columns

### Feature Selection Using Select K Best

In [None]:
#Select top n Features
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

def select_kbest_features(df, target_col,n):
  """
  Selects the top n features from the DataFrame using the SelectKBest algorithm.

  Args:
    df: The DataFrame to select features from.
    n: The number of features to select.

  Returns:
    A list of the top n features.
  """


  selector = SelectKBest(k=n)
  selected_features = selector.fit_transform(df, target_col)
  
  mask = selector.get_support()
  top_n_features = df.columns[mask]

  return top_n_features

In [None]:
id_col_list = ['UserID']
target_col = df_train['ordered']
top_n_col_list = select_kbest_features(df_train.drop(id_col_list,axis=1),target_col, 10)
print(len(top_n_col_list))
top_n_col_list

In [None]:
#Convert to list
top_n_col_list = top_n_col_list.tolist()

type(top_n_col_list)

In [None]:
cols_for_model_df_list = id_col_list + top_n_col_list
print(len(cols_for_model_df_list))
print('')

In [None]:
df_feature_eng_output = df_train[cols_for_model_df_list]
df_feature_eng_output.head()

In [None]:
df_feature_eng_output['ordered'].value_counts( )

### Save the output to Blob storage