In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder  # For preprocessing input data
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
data = pd.read_csv(r'C:/Users/HP/Desktop/data_CodeCrafters/Data/Master/Mock_Data.csv')

In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   transaction_id           1000 non-null   int64 
 1   transaction_date         1000 non-null   object
 2   transaction_amount       1000 non-null   int64 
 3   merchant_category        1000 non-null   object
 4   card_type                1000 non-null   object
 5   transaction_location     1000 non-null   object
 6   cardholder_age           1000 non-null   int64 
 7   cardholder_gender        1000 non-null   object
 8   transaction_description  1000 non-null   object
 9   account_balance          1000 non-null   int64 
 10  calander_income          1000 non-null   int64 
dtypes: int64(5), object(6)
memory usage: 86.1+ KB


In [26]:
transaction_id = data['transaction_id']
    # Define columns to be scaled, excluding 'transaction_id'
    
#Convert to datetime format
data['transaction_date'] = pd.to_datetime(data['transaction_date'],format='%d-%m-%Y')

# Extract components
data['transaction_year'] = data['transaction_date'].dt.year
data['transaction_month'] = data['transaction_date'].dt.month
data['transaction_day'] = data['transaction_date'].dt.day

# Drop the transaction_date column
data = data.drop('transaction_date', axis=1)
numerical_cols = [
    'transaction_amount', 'cardholder_age', 'account_balance', 'calander_income','transaction_year','transaction_month','transaction_day'
]

categorical_cols = [
    'merchant_category', 'card_type', 'transaction_location', 'cardholder_gender', 
]

# Create a temporary DataFrame for scaling
temp_data = data[numerical_cols].copy()

scaler = StandardScaler() # Initialize the StandardScaler
temp_data = pd.DataFrame(scaler.fit_transform(temp_data), columns=numerical_cols) # Scale numerical columns

# Encode categorical columns
encoder = LabelEncoder() # Initialize the LabelEncoder
for col in categorical_cols:
    data[col] = encoder.fit_transform(data[col]) # Encode categorical columns

# Rejoin transaction_id and scaled numerical columns
data = data.drop(columns=numerical_cols) # Drop original numerical columns
data = pd.concat([data, temp_data], axis=1) # Concatenate scaled numerical columns back
# data['transaction_id'] = transaction_id # Reassign transaction_id


# # Convert text descriptions into numerical features
# Initialize the TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=100)

# Transform the transaction_description column
description_features = tfidf.fit_transform(data['transaction_description'])

# Convert to dataframe
description_data = pd.DataFrame(description_features.toarray(), columns=tfidf.get_feature_names_out())

# Rejoin the data and drop original column
data = pd.concat([data, description_data], axis=1)
data = data.drop('transaction_description', axis=1, inplace=False)

In [29]:
data.head()

Unnamed: 0,transaction_id,merchant_category,card_type,transaction_location,cardholder_gender,transaction_amount,cardholder_age,account_balance,calander_income,transaction_year,...,ultrices,ut,vel,velit,venenatis,vestibulum,vitae,vivamus,volutpat,vulputate
0,1,3,0,1,2,0.743856,-1.403021,-0.754383,0.967126,1.439195,...,0.0,0.231837,0.0,0.0,0.0,0.0,0.0,0.0,0.308267,0.0
1,2,6,0,7,1,1.306674,0.505341,1.534423,-1.156623,-0.444568,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,7,3,7,0,0.999558,0.232718,-1.093191,0.758704,-0.444568,...,0.0,0.294438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,7,1,6,0,-1.616071,-1.471177,-1.841461,-0.735905,-0.444568,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.274229,0.0
4,5,1,1,9,2,1.110613,0.709808,-1.654051,-1.101064,1.439195,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Print each column name and its type
data.columns = data.columns.astype('str')
for col in data.columns:
#     print(f"Column name: {col}, Type: {type(col)}")
    if not isinstance(col, str):  # Check if the column name is not a string
        print(col)
        

In [7]:
# np.any(np.isnan(data))
np.any(np.isinf(data))

False

In [8]:
from sklearn.model_selection import train_test_split
X = data
X_train, X_temp = train_test_split(X, test_size=0.4, random_state=42)  # 60% train, 40% temp
X_test, X_temp = train_test_split(X_temp, test_size=0.625, random_state=42)  # 0.25 * 0.4 = 0.45 for test
X_val, X_superval= train_test_split(X_temp, test_size=0.4, random_state=42)

In [9]:
X_train = X_train.values
X_test = X_test.values

In [10]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, silhouette_score
def perform_hyperparameter_tuning(X_train):
    # Define the grid of hyperparameters to search over
    param_grid = {
        'n_estimators': [100, 50, 70],  # Number of base estimators in ensemble
        'max_samples': [0.1, 0.05, 0.75],  # Maximum number of samples to draw from the dataset
        'max_features':[0.05,0.5,0.7], #Number of Features to draw from dataset to train each base estimator
        'contamination': [0.01, 0.05, 0.02],  # Proportion of outliers in the sample
        'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
    }
    
    # Initialize an IsolationForest Ensemble
    model = IsolationForest(random_state=42)
 
     # Use a custom scorer, for example, negative mean squared error
    def scorer(estimator, X):
        cluster_labels = estimator.fit_predict(X)
        return silhouette_score(X,cluster_labels)
    # scorer = make_scorer(mean_squared_error, greater_is_better=False)
    
    # Initialize a GridSearchCV to search for the best hyperparameters
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, 
                               cv=3, n_jobs=-1, verbose=2,scoring=scorer)
    
    # Fit the GridSearchCV to the training data
    grid_search.fit(X_train)
    
    # Retrieve the best model with the optimal hyperparameters
    best_model = grid_search.best_estimator_
    
    # Print the best hyperparameters found
    print("Best hyperparameters:", grid_search.best_params_)
    
    # Return the best model
    return best_model, grid_search.best_params_

In [11]:
best_model, best_params = perform_hyperparameter_tuning(X_train)

Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best hyperparameters: {'bootstrap': False, 'contamination': 0.01, 'max_features': 0.7, 'max_samples': 0.05, 'n_estimators': 100}


In [12]:
best_model.fit(X_train)

IsolationForest(contamination=0.01, max_features=0.7, max_samples=0.05,
                random_state=42)

In [13]:
pred = best_model.predict(X_test)

In [14]:
print("Unique labels in pred:", set(pred))

Unique labels in pred: {1, -1}


In [15]:
from sklearn.metrics import davies_bouldin_score
test_db_index = davies_bouldin_score(X_test, pred)
test_sh_index = silhouette_score(X_test,pred)

In [16]:
print(test_db_index)

4.452983544397346


In [17]:
print(test_sh_index)

0.28939692910960135
