In [8]:
import pandas as pd
import numpy as np
import re
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter
import joblib
from datetime import datetime

# Custom Tokenizer
def custom_tokenizer(text):
    """Split *text* into lowercase alphabetic and numeric sub-tokens.

    The text is lowercased and cut into ``\\w+`` runs; every run is then cut
    again into maximal ``[a-z]+`` and ``\\d+`` pieces, so ``"user_id2"``
    yields ``['user', 'id', '2']``. Missing values (anything ``pd.isna``
    reports as NA) produce an empty list.
    """
    if pd.isna(text):
        return []

    pieces = []
    for word in re.findall(r'\w+', str(text).lower()):
        # Underscores and any non a-z / non-digit characters act as separators.
        pieces.extend(re.findall(r'[a-z]+|\d+', word))
    return pieces

# Custom TfidfVectorizer
class CustomTfidfVectorizer(TfidfVectorizer):
    """``TfidfVectorizer`` whose documents are tokenized by ``custom_tokenizer``."""

    def build_analyzer(self):
        """Return the callable that maps one raw document to its token list."""
        def analyze(doc):
            return custom_tokenizer(doc)

        return analyze

def clean_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Derive ``ID``, ``Column`` and ``Description`` from ``'Original Column'``.

    ``'Original Column'`` is expected to hold strings such as
    ``"4. Unnamed: 2"`` or ``"1. Age (years)"``. The number before the dot
    becomes ``ID``, the text up to the first ``:``, ``/`` or ``(`` becomes
    ``Column`` and whatever follows that delimiter becomes ``Description``.

    Args:
        df: Frame with an ``'Original Column'`` column. The three derived
            columns are added to this frame in place, and its string columns
            are stripped in place, before a new frame is built.

    Returns:
        A new frame restricted to (and ordered as) ``dataset_index``,
        ``name``, ``area``, ``Original Column``, ``ID``, ``Column``,
        ``Description`` -- whichever of those exist.
    """
    original = df['Original Column']

    # 1-3. ``expand=False`` makes ``str.extract`` return a Series, so the
    #      assignments below never depend on DataFrame column alignment.
    df['ID'] = original.str.extract(r'(\d+)\.', expand=False)
    df['Column'] = original.str.extract(r'\d+\.\s*([^/(:]+)', expand=False)
    df['Description'] = original.str.extract(
        r'\d+\.\s*[^/(:]+\s*[:(/](.*)', expand=False
    )

    # 4. Where no description was found, fall back to the first parenthesised
    #    group. Assigning the one-column DataFrame that ``str.extract`` returns
    #    by default would be aligned on its column label (0) rather than on
    #    'Description', so a Series is required here.
    missing = df['Description'].isnull()
    if missing.any():
        df.loc[missing, 'Description'] = original[missing].str.extract(
            r'\(([^)]+)\)', expand=False
        )

    # 5. Trim whitespace from text columns. Non-string cells are passed
    #    through untouched (``.str.strip()`` would turn them into NaN).
    def _strip(value):
        return value.strip() if isinstance(value, str) else value

    for col in df.columns:
        if col == 'dataset_index':
            continue
        dtype = df[col].dtype
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            df[col] = df[col].map(_strip)

    # 6. Replace empty / whitespace-only strings with pd.NA. ``infer_objects``
    #    is called without the ``copy`` keyword, which not every pandas
    #    version accepts.
    df = df.replace(r'^\s*$', pd.NA, regex=True)
    df = df.infer_objects()

    # 7. Keep Description as object dtype so the string assignment below
    #    cannot hit an incompatible-dtype warning.
    df['Description'] = df['Description'].astype(object)

    # 8. Drop the parentheses left over from the extraction: "(" becomes a
    #    space, ")" is removed. Only non-null descriptions are touched.
    has_description = df['Description'].notnull()
    df.loc[has_description, 'Description'] = (
        df.loc[has_description, 'Description']
        .astype(str)
        .str.replace('(', ' ', regex=False)
        .str.replace(')', '', regex=False)
    )

    # 9. Reorder columns so that ID comes before Column.
    columns_order = ['dataset_index', 'name', 'area', 'Original Column', 'ID', 'Column', 'Description']
    return df[[col for col in columns_order if col in df.columns]]

def replace_abbreviations(text, abbreviations_dict):
    """Tokenize *text* and expand every token found in *abbreviations_dict*.

    Tokens are looked up by their lowercase form; tokens without an entry are
    kept exactly as they appeared. The result is the tokens joined by single
    spaces, so any character that no alternative matches (commas, slashes
    between words, ...) is dropped.

    NOTE(review): alternatives are tried left to right, so the leading
    word alternative normally consumes ``mg`` before the ``mg/dL`` alternative
    gets a chance; the unit and superscript alternatives look mostly shadowed.
    """
    token_pattern = (
        r'\b\w+(?:[-=]\w+)*\b'          # Words, with hyphens/equals (like 0-4, 1=NE)
        r'|[=]'                         # Standalone equal signs
        r'|[°º][CFK]'                   # °C, ºF, etc.
        r'|[°º\-]+'                     # Standalone degree/hyphens
        r'|[a-zA-Z]+/[a-zA-Z]+(?:[²³μµ]?)'   # mg/dL, kg/m², µg/m³, cm², etc.
        r'|\d+[²³]?'                    # 10², 5³ (numbers with superscripts)
        r'|%'                           # Percent sign as its own token
    )
    return " ".join(
        abbreviations_dict.get(piece.lower(), piece)
        for piece in re.findall(token_pattern, text)
    )

def preprocess_columns(df, abbreviations_dict):
    """Build ``CleanedColumn`` and expand abbreviations in ``Description``.

    Works on *df* in place and returns the same frame. Only non-null cells
    are processed:

    * ``Column``: every ``#`` is replaced by the word ``number``.
    * ``CleanedColumn``: ``split_camel_case`` of ``Column``, with any
      remaining ``-`` / ``_`` turned into spaces.
    * ``Description``: passed through ``replace_abbreviations``.
    """
    def expand(description):
        return replace_abbreviations(description, abbreviations_dict)

    named = df['Column'].notna()
    df.loc[named, 'Column'] = df.loc[named, 'Column'].str.replace('#', 'number', regex=False)

    # Split identifiers such as "firstName" into lowercase words.
    as_text = df.loc[named, 'Column'].astype(str)
    df.loc[named, 'CleanedColumn'] = as_text.apply(split_camel_case)

    # Remove '-' and '_' characters only where a cleaned name exists.
    cleaned = df['CleanedColumn'].notna()
    df.loc[cleaned, 'CleanedColumn'] = df.loc[cleaned, 'CleanedColumn'].str.replace('[-_]', ' ', regex=True)

    # Apply the abbreviations dictionary to Description.
    described = df['Description'].notna()
    df.loc[described, 'Description'] = df.loc[described, 'Description'].apply(expand)

    return df

def split_camel_case(s):
    """Turn an identifier into space-separated lowercase words.

    The string is first split on whitespace, ``.``, ``_`` and ``-``. Each
    chunk is then handled as follows:

    * chunks that are entirely uppercase, entirely digits, or uppercase
      letters followed by digits are lowercased as a whole;
    * other chunks are cut at lower->upper and letter<->digit transitions
      (``firstName`` -> ``first name``, ``col2`` -> ``col 2``). An
      upper->lower transition does not cut, so ``HTTPServer`` stays one word.

    ``pH`` (any casing) is returned as ``ph``.
    """
    if s.lower() == 'ph':
        return 'ph'

    pieces = []
    for chunk in re.split(r'[\s._\-]+', s):
        if not chunk:
            continue

        # Acronyms, numbers and acronym+number chunks are kept in one piece.
        if chunk.isupper() or chunk.isdigit() or re.match(r'^[A-Z]{2,}\d+$', chunk):
            pieces.append(chunk.lower())
            continue

        # Walk the characters, starting a new segment at every boundary.
        segments = []
        buffer = ''
        for ch in chunk:
            previous = buffer[-1] if buffer else ''
            at_boundary = bool(buffer) and (
                (previous.islower() and ch.isupper())
                or (previous.isdigit() and ch.isalpha())
                or (previous.isalpha() and ch.isdigit())
            )
            if at_boundary:
                segments.append(buffer)
                buffer = ch
            else:
                buffer += ch
        if buffer:
            segments.append(buffer)

        # When the last two segments are both uppercase, the trailing run of
        # uppercase segments is glued back together into a single piece.
        cut = len(segments) - 1
        if (
            len(segments) >= 2
            and all(seg.isupper() for seg in segments[-2:])
            and len(''.join(segments[-2:])) >= 2
        ):
            while cut > 0 and segments[cut].isupper():
                cut -= 1

        leading = segments[:cut + 1]
        trailing = ''.join(segments[cut + 1:])
        pieces.extend(seg.lower() for seg in leading if seg)
        if trailing:
            pieces.append(trailing.lower())

    return ' '.join(pieces)

def match_with_plural(token, dictionary):
    """Look *token* up in *dictionary*, also trying simple singular forms.

    The exact token is tried first. Otherwise the plural endings are undone
    in this order, returning the first hit:

    * ``ies`` -> ``y``   (families -> family), token longer than 3 chars
    * ``es``  -> removed (classes -> class),   token longer than 2 chars
    * ``s``   -> removed (chlorides -> chloride), token longer than 2 chars

    Returns the dictionary value, or ``None`` when nothing matches.
    """
    if token in dictionary:
        return dictionary[token]

    # (plural suffix, singular replacement, length the token must exceed)
    plural_rules = (
        ('ies', 'y', 3),
        ('es', '', 2),
        ('s', '', 2),
    )
    for suffix, replacement, floor in plural_rules:
        if not (token.endswith(suffix) and len(token) > floor):
            continue
        candidate = token[:-len(suffix)] + replacement
        if candidate in dictionary:
            return dictionary[candidate]

    return None

def apply_analysis(df, target_words_dict, description_words_dict, abbreviations_dict, all_datasets_info, DB_or_dataset):
    """Tag every row with the keyword/format found in its name and description.

    Four columns are added to *df* in place: ``ColumnKeyword`` /
    ``ColumnFormat`` (derived from ``CleanedColumn``) and
    ``DescriptionKeyword`` / ``DescriptionFormat`` (derived from
    ``Description``). Rows with a missing ``CleanedColumn`` or
    ``Description`` keep ``None`` in the respective pair.

    Column names are matched in up to three passes, stopping at the first
    hit; dictionary order decides which keyword wins within a pass:

    1. each key of *target_words_dict* as a whole word (``name`` only needs
       to end a word, ``id`` also matches a raw column name ending in
       ``ID``);
    2. the same whole-word search after ``replace_abbreviations``;
    3. token by token, via ``match_with_plural`` on the expanded abbreviation
       and then on the token itself.

    A ``%`` token overrides whatever the passes found, provided ``'%'`` is a
    key of *target_words_dict*.

    Args:
        df: Frame with ``CleanedColumn``, ``Column``, ``dataset_index`` and
            ``Description`` columns.
        target_words_dict: keyword -> format, used for column names.
        description_words_dict: keyword -> format, used for descriptions.
        abbreviations_dict: abbreviation -> expansion.
        all_datasets_info: Frame describing the tables. When it has both
            ``primary_key`` and ``foreign_keys`` columns, key columns are
            tagged ``id`` / ``IDcolumn`` without any keyword search.
        DB_or_dataset: data-source code; ``'d'`` disables the table-name
            based ``city_name`` / ``country_name`` / ``state_name`` handling.

    Returns:
        The same *df*, with the four columns filled in.
    """
    special_cases = {
        'city': 'city',
        'country': 'country',
        'state': 'state',
        'province': 'state'  # province is treated as state
    }
    is_dataset_mode = DB_or_dataset.lower() == 'd'

    def get_special_name_format(table_name, column_name):
        # For datasets the table name is just a number, so only database
        # tables are inspected for a city/country/state hint.
        if not is_dataset_mode and column_name.lower() == 'name':
            table_name_str = str(table_name).lower()
            for case, format_type in special_cases.items():
                if case in table_name_str:
                    return f"{format_type}_name", target_words_dict[format_type]
        return 'name', target_words_dict['name']

    def set_column_match(idx, keyword, format_type):
        df.at[idx, 'ColumnKeyword'] = keyword
        df.at[idx, 'ColumnFormat'] = format_type

    # Primary / foreign keys per table, when that information is available.
    has_pk_fk_info = 'primary_key' in all_datasets_info.columns and 'foreign_keys' in all_datasets_info.columns
    pk_fk_dict = {}
    if has_pk_fk_info:
        for _, info in all_datasets_info.iterrows():
            pk = info['primary_key'] if pd.notna(info['primary_key']) else None
            fks = info['foreign_keys'].split(', ') if pd.notna(info['foreign_keys']) else []
            # Each foreign key is written as "column -> target"; keep the column.
            pk_fk_dict[info['index']] = {'pk': pk, 'fks': [fk.split(' -> ')[0] for fk in fks]}

    # Keywords are tried in dictionary order.
    formats_ordered_list = list(target_words_dict.keys())

    # Compile every pattern once instead of once per row and word. Keywords
    # are escaped because they are literal text, not regex fragments: '%' or
    # '(' in a key must not change (or break) the pattern. This also matches
    # how the description patterns below are built.
    word_patterns = {
        word: re.compile(rf'\b{re.escape(word)}\b', re.IGNORECASE)
        for word in formats_ordered_list
    }
    # 'name' must not be followed by a word character or a hyphen. The class
    # has to be [\w-]; writing it with a doubled backslash inside a raw string
    # yields the three literal characters backslash, 'w' and '-'.
    name_pattern = re.compile(r'(name)(?![\w-])', re.IGNORECASE)
    token_pattern = re.compile(r'\b\w+\b|%')

    # Initialize new columns
    df['ColumnKeyword'] = None
    df['ColumnFormat'] = None
    df['DescriptionKeyword'] = None
    df['DescriptionFormat'] = None

    # ---- Column-name analysis ----
    for count, (i, row) in enumerate(df.iterrows(), start=1):
        if count % 1000 == 0:
            print(f"✅ Processed {count} rows...")

        std_col_name = row['CleanedColumn']
        col_name = row['Column']
        table_name = row['dataset_index']

        # Skip if 'CleanedColumn' is missing
        if pd.isnull(std_col_name):
            continue

        std_col_name = split_camel_case(std_col_name)

        # Primary and foreign keys are ID columns by definition.
        if table_name in pk_fk_dict:
            keys = pk_fk_dict[table_name]
            if col_name == keys['pk'] or col_name in keys['fks']:
                set_column_match(i, 'id', 'IDcolumn')
                continue

        found = False

        # Pass 1: whole-word search on the cleaned column name.
        for word in formats_ordered_list:
            if word == 'name':
                keyword, format_type = get_special_name_format(table_name, col_name)
                if keyword != 'name':
                    set_column_match(i, keyword, format_type)
                    found = True
                    break
                pattern = name_pattern
            elif word == 'id' and col_name.endswith('ID'):
                # Uppercase 'ID' at the end of the raw column name.
                set_column_match(i, 'id', target_words_dict.get('id', 'ID column'))
                found = True
                break
            else:
                pattern = word_patterns[word]

            if pattern.search(std_col_name):
                set_column_match(i, word, target_words_dict[word])
                found = True
                break

        # Pass 2: expand abbreviations and search again.
        if not found:
            replaced_text = replace_abbreviations(std_col_name, abbreviations_dict)
            for word in formats_ordered_list:
                if word_patterns[word].search(replaced_text):
                    set_column_match(i, word, target_words_dict[word])
                    found = True
                    break

        tokens = token_pattern.findall(row['CleanedColumn'].lower())

        # Pass 3: token-level lookup with plural support, first on the
        # expanded abbreviation, then on the token itself.
        if not found:
            for token in tokens:
                expanded_token = abbreviations_dict.get(token, None)
                if expanded_token:
                    match = match_with_plural(expanded_token, target_words_dict)
                    if match:
                        set_column_match(i, expanded_token, match)
                        break
                match = match_with_plural(token, target_words_dict)
                if match:
                    set_column_match(i, token, match)
                    break

        # A '%' token takes priority over any keyword found above.
        if '%' in tokens and '%' in target_words_dict:
            set_column_match(i, '%', target_words_dict['%'])

    # ---- Description analysis ----
    # (word, format, compiled pattern, allow plain substring fallback)
    description_rules = []
    for word, analysis in description_words_dict.items():
        escaped = re.escape(word)
        if word in ('is', 'has'):
            # 'is' / 'has' only count when they open the description.
            rule = (word, analysis, re.compile(r'^\s*' + escaped + r'\b', re.IGNORECASE), False)
        else:
            # Keywords with non-alphanumeric characters (e.g. '%') rarely sit
            # on a word boundary, so they may also match as a substring.
            rule = (word, analysis, re.compile(rf'\b{escaped}\b', re.IGNORECASE), not word.isalnum())
        description_rules.append(rule)

    for count, (i, row) in enumerate(df.iterrows(), start=1):
        if count % 1000 == 0:
            print(f"✅ Processed {count} rows...")

        description = row['Description']
        # Skip if 'Description' is missing
        if pd.isnull(description):
            continue

        description_lower = description.lower()
        for word, analysis, pattern, substring_ok in description_rules:
            if pattern.search(description) or (substring_ok and word.lower() in description_lower):
                df.at[i, 'DescriptionKeyword'] = word
                df.at[i, 'DescriptionFormat'] = analysis
                break

    return df

def get_top_features(instance, feature_names, clf, top_n=3):
    """Return the *top_n* strongest features of a single vectorized row.

    Args:
        instance: One-row matrix exposing ``toarray()``; only its first row
            is read.
        feature_names: Names aligned with the columns of *instance*.
        clf: Accepted for call compatibility; not used.
        top_n: Maximum number of features to return.

    Returns:
        ``(name, value)`` pairs with a value above zero, largest value first;
        equal values are ordered by name, descending.
    """
    row_values = instance.toarray()[0]
    scored = [
        (value, name)
        for value, name in zip(row_values, feature_names)
        if value > 0
    ]
    scored.sort(reverse=True)
    return [(name, value) for value, name in scored[:top_n]]

def custom_resample(X, y, min_samples=6, random_state=None):
    """Pad rare classes by duplication until each has *min_samples* rows.

    Classes that already have at least *min_samples* rows are passed through
    unchanged. Smaller classes keep every original row and receive just
    enough randomly chosen duplicates to reach *min_samples*; drawing the
    whole class with replacement instead could leave some of the few
    original rows out.

    Args:
        X: Feature matrix, sparse or dense; it is converted to CSR.
        y: Labels aligned with the rows of *X* (list, array or Series).
        min_samples: Minimum number of rows per class in the output.
        random_state: Optional seed for the duplicate selection. ``None``
            uses NumPy's global random state.

    Returns:
        ``(X_resampled, y_resampled)``: rows grouped by class in order of
        first appearance, and the matching label array.
    """
    X = X.tocsr() if sp.issparse(X) else sp.csr_matrix(X)

    # An ndarray makes ``labels == class_label`` element-wise for any input type.
    labels = np.asarray(y)
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    blocks = []
    y_resampled = []
    for class_label, count in Counter(labels.tolist()).items():
        class_indices = np.flatnonzero(labels == class_label)
        if count < min_samples:
            extra = rng.choice(class_indices, size=min_samples - count, replace=True)
            chosen = np.concatenate([class_indices, extra])
        else:
            chosen = class_indices

        blocks.append(X[chosen])
        y_resampled.extend([class_label] * len(chosen))

    return sp.vstack(blocks), np.array(y_resampled)

def efficient_smote(X, y, sampling_strategy='auto', k_neighbors=5):
    """Oversample *X*, *y* with SMOTE after padding classes that are too small.

    SMOTE interpolates between a sample and its ``k_neighbors`` nearest
    neighbours of the same class, so every class needs at least
    ``k_neighbors + 1`` rows. ``custom_resample`` duplicates rows of smaller
    classes first.

    Args:
        X: Sparse feature matrix; converted to CSR when necessary.
        y: Labels aligned with the rows of *X*.
        sampling_strategy: Forwarded to ``SMOTE``.
        k_neighbors: Forwarded to ``SMOTE``.

    Returns:
        ``(X_resampled, y_resampled)`` as returned by ``SMOTE.fit_resample``.
    """
    # Convert to CSR format if it's not already
    if not isinstance(X, sp.csr_matrix):
        X = X.tocsr()

    # Tie the padding size to k_neighbors: a fixed 6 only works for
    # k_neighbors <= 5. The floor of 6 keeps the previous behaviour for
    # smaller values; a non-integer k_neighbors (e.g. an estimator object)
    # also falls back to 6.
    if isinstance(k_neighbors, (int, np.integer)):
        min_samples = max(6, int(k_neighbors) + 1)
    else:
        min_samples = 6

    X_padded, y_padded = custom_resample(X, y, min_samples=min_samples)

    smote = SMOTE(sampling_strategy=sampling_strategy, k_neighbors=k_neighbors)
    return smote.fit_resample(X_padded, y_padded)

def train_evaluate_save_model(model, model_name, X_train, X_test, y_train, y_test, df, vectorizer, weights, feature_names , output_file_path_text):
    """Grid-search *model*, report on the test split and export predictions.

    Steps:
        1. 3-fold ``GridSearchCV`` over the grid registered for *model_name*.
        2. Print the classification report of the best estimator on the
           test split.
        3. Dump the best estimator to ``{model_name}_model.joblib``.
        4. Predict every row of *df* and add ``predicted_format_*``,
           ``top_features_*`` and ``DIF_*`` columns to *df* in place.
        5. Write the frame, sorted with mismatches first, to
           ``{output_file_path_text}_{model_name}.xlsx``.

    Args:
        model: Unfitted estimator.
        model_name: Key into the parameter grids; also used in file and
            column names. An unknown name raises ``KeyError``.
        X_train, X_test, y_train, y_test: Train / test split.
        df: Frame with ``combined`` (text) and ``FinalFormat`` (label).
        vectorizer: Fitted vectorizer used to transform ``df['combined']``.
        weights: Per-feature weights, either a 1-D array or a sparse
            matrix whose diagonal holds them.
        feature_names: Feature names aligned with the vectorizer output.
        output_file_path_text: Prefix of the Excel output file.

    Returns:
        ``(best_model, report)``: the refitted best estimator and the
        classification report text.
    """
    print(f"\nTraining {model_name}...")

    param_grid = {
        'RandomForestClassifier': {
            'n_estimators': [100, 200],
            'max_depth': [None, 30],
            'min_samples_split': [2, 5],
        },
        'LogisticRegression': {
            'C': [0.1, 1],
            'solver': ['lbfgs'],
            'max_iter': [20000],
            'warm_start': [True]
        },
        'GradientBoostingClassifier': {
            'n_estimators': [100],
            'learning_rate': [0.1],
            'max_depth': [3],
        },
        'KNeighborsClassifier': {
            'n_neighbors': [3, 5],
            'weights': ['uniform', 'distance'],
        },
        'LinearSVC': {
            'C': [0.1, 1],
            'max_iter': [20000],
            'dual': ['auto']
        }
    }

    grid_search = GridSearchCV(model, param_grid[model_name], cv=3, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")

    y_pred = best_model.predict(X_test)
    report = classification_report(y_test, y_pred, zero_division=1)
    print(f"Classification Report for {model_name}:")
    print(report)

    # Save the model
    joblib.dump(best_model, f'{model_name}_model.joblib')

    # Weighted feature matrix for every row of df. CSR allows the cheap row
    # slicing used for the top-feature extraction below.
    feature_scale = weights.diagonal() if sp.issparse(weights) else weights
    X_full = sp.csr_matrix(vectorizer.transform(df['combined']).multiply(feature_scale))

    predicted_col = f'predicted_format_{model_name}'
    dif_col = f'DIF_{model_name}'

    df[predicted_col] = best_model.predict(X_full)

    # Top features are read from the rows of X_full rather than by running
    # the vectorizer again on each row: same values, one transform in total,
    # and the same weight handling as the predictions above.
    top_features = [
        get_top_features(X_full[row_idx], feature_names, best_model)
        for row_idx in range(X_full.shape[0])
    ]
    df[f'top_features_{model_name}'] = pd.Series(top_features, index=df.index, dtype=object)

    # DIF is 1 when prediction and label differ; the two numerical formats
    # are treated as interchangeable.
    numerical_formats = ('numerical', 'numerical>=0')

    def compare_formats(actual, predicted):
        if actual in numerical_formats and predicted in numerical_formats:
            return 0
        return int(actual != predicted)

    df[dif_col] = [
        compare_formats(actual, predicted)
        for actual, predicted in zip(df['FinalFormat'], df[predicted_col])
    ]

    # Mismatches first, then grouped by predicted and actual format.
    df_sorted = df.sort_values(by=[dif_col, predicted_col, 'FinalFormat'], ascending=[False, True, True])

    # Save the DataFrame to an Excel file
    output_file_path = f'{output_file_path_text}_{model_name}.xlsx'
    df_sorted.to_excel(output_file_path, index=False)
    print(f"DataFrame for {model_name} saved to {output_file_path}")

    return best_model, report

# Data-source code -> (banner, datasets workbook, columns workbook,
#                      analysed-columns workbook, prediction output prefix)
_DATA_SOURCES = {
    'd': ("Using Datasets", "FiftyDatasets.xlsx", "AllColumnsFromFiftyDatasets.xlsx",
          'AnalysedColumns.xlsx', 'AnalysedColumns_with_Predictions'),
    'dt': ("Using Database tables", "AllDatasetsInfo.xlsx", "AllColumnsInfo.xlsx",
           'AnalysedColumnsDB.xlsx', 'AnalysedColumnsDB_with_Predictions'),
    'ds': ("Using Datasets and Database tables", "AllDataSourcesInfo.xlsx", "AllAttributes_andColumnsInfo.xlsx",
           'AnalysedColumnsDS.xlsx', 'AnalysedColumnsDS_with_Predictions'),
    'k': ("Using Kaggle Datasets", "kaggle_datasets_with_domain_and_match.xlsx", "kaggle_headers.xlsx",
          'AnalysedColumnsK.xlsx', 'AnalysedColumnsK_with_Predictions'),
    'kc': ("Using Kaggle Datasets Corrections", "kaggle_datasets_with_domain_and_match.xlsx", "kaggle_headers_10k.xlsx",
           'AnalysedColumnsKc.xlsx', 'AnalysedColumnsKc_with_Predictions'),
    's': ("Using Sato Datasets", "datasets_viznet.xlsx", "columns_sato_only.xlsx",
          'AnalysedColumnsSato.xlsx', 'AnalysedColumnsS_with_Predictions'),
    'sf': ("Using Sato Filtered Datasets", "datasets_viznet.xlsx", "filtered_columns_sato_only.xlsx",
           'AnalysedColumnsSatoFiltered.xlsx', 'AnalysedColumnsSF_with_Predictions'),
    'v': ("Using Viznet Filtered Datasets", "datasets_viznet.xlsx", "filtered_columns_viznet_all.xlsx",
          'AnalysedColumnsViznetFiltered.xlsx', 'AnalysedColumnsV_with_Predictions'),
}


def _load_formats_dictionary(path):
    """Read ``'key':'value',`` lines into a dict, keeping file order."""
    formats = {}
    with open(path, "r") as file:
        for line in file:
            entry = line.rstrip(",\n")
            if not entry.strip():
                continue  # tolerate blank lines
            # Split on the first colon only, so a value may contain ':'.
            key, value = entry.split(":", 1)
            formats[key.strip("'")] = value.strip("'")
    return formats


def _load_abbreviations(path):
    """Read ``abbr:full form`` lines into a dict."""
    abbreviations = {}
    with open(path, "r") as file:
        for line in file:
            entry = line.strip()
            if not entry:
                continue  # tolerate blank lines
            abbr, full_form = entry.split(":", 1)
            abbreviations[abbr.strip()] = full_form.strip()
    return abbreviations


def _load_header_counts(path):
    """Read a tab-separated ``header<TAB>count`` file, skipping its first line."""
    counts = {}
    with open(path, 'r') as file:
        next(file, None)  # skip the header line; an empty file yields {}
        for line in file:
            entry = line.strip()
            if not entry:
                continue  # tolerate blank lines
            header, count = entry.split('\t')
            counts[header.lower()] = int(count)
    return counts


def main():
    """Run the whole pipeline for one interactively chosen data source.

    Reads the workbooks and dictionaries from the current directory, cleans
    and analyses the column names, trains and evaluates five classifiers on
    SMOTE-balanced TF-IDF features, and writes models, reports, prediction
    workbooks and the vectorizer back to the current directory.

    Raises:
        ValueError: If the entered data-source code is not recognised, or if
            the number of feature weights differs from the number of
            features.
    """
    print(f"It started on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Surrounding whitespace is dropped so that "v " is accepted as "v".
    DB_or_dataset = input("Insert if using Datasets (d), Database.tables (dt), the two Data Sources (ds), Kaggle Datasets (k), Kaggle Datasets to correct (kc), Sato Datasets (s), Sato Filtered (sf), or All Viznet Datasets (v): ").strip()

    print(DB_or_dataset)

    source = _DATA_SOURCES.get(DB_or_dataset.lower())
    if source is None:
        raise ValueError("Invalid input. Please enter 'd', 'dt', 'ds', 'k', 'kc', 's', 'sf' or 'v'.")

    banner, datasets_path, columns_path, analysed_columns_file_path, output_file_path_text = source
    print(banner)
    datasets_xlsx = pd.read_excel(datasets_path)
    columns_xlsx = pd.read_excel(columns_path)

    print(datasets_xlsx.head())

    columns_xlsx = columns_xlsx.rename(columns={'index': 'dataset_index'})

    # Load the dataset
    df = columns_xlsx
    print(f"Original df shape: {df.shape}")

    # Load dictionaries. The same formats dictionary drives both the
    # column-name analysis and the description analysis.
    dictionary = _load_formats_dictionary("formats_dictionary.txt")
    abbreviations_dict = _load_abbreviations("abbreviations_dictionary.txt")

    target_words_dict = dictionary
    description_words_dict = dictionary

    # Clean and preprocess columns
    df = clean_columns(df)

    print('AFTER CLEAN', df.head())
    df = preprocess_columns(df, abbreviations_dict)

    print(df)

    # Apply analysis
    df = apply_analysis(df, target_words_dict, description_words_dict, abbreviations_dict, datasets_xlsx, DB_or_dataset)

    # Load the FinalFormat labels; the assignment aligns on the row index.
    analysed_df = pd.read_excel(analysed_columns_file_path)
    df['FinalFormat'] = analysed_df['FinalFormat']

    # Prepare features for machine learning
    df['combined'] = df['CleanedColumn'].fillna('') + ' [SEP] ' + df['Description'].fillna('')
    df['combined'] += ' [SEP] ' + df['ColumnKeyword'].fillna('') + ' [SEP] ' + df['DescriptionKeyword'].fillna('')

    # Load DB headers distribution file
    db_headers = _load_header_counts('DBheaders+formats_dict.txt')

    # Extract features using Custom TF-IDF with vocabulary restricted to DB headers
    vectorizer = CustomTfidfVectorizer(vocabulary=list(db_headers.keys()))
    X = vectorizer.fit_transform(df['combined']).tocsr()

    # Modify TF-IDF scores explicitly based on DB headers
    feature_names = vectorizer.get_feature_names_out()
    weights = np.array([db_headers.get(feat, 1) for feat in feature_names])

    # Ensure weights is a 1D array with the same number of elements as X has columns
    if weights.shape[0] != X.shape[1]:
        raise ValueError(f"Number of weights ({weights.shape[0]}) does not match number of features in X ({X.shape[1]})")

    # Multiply each column of X by its corresponding weight
    X = X.multiply(weights)

    # Labels as strings (a missing label becomes the class 'nan').
    y = df['FinalFormat'].astype(str)

    # Apply efficient SMOTE
    X_resampled, y_resampled = efficient_smote(X, y)

    print("Class distribution after SMOTE:")
    print(pd.Series(y_resampled).value_counts())

    # NOTE(review): the split happens after oversampling, so the test split
    # contains duplicated and synthetic rows derived from rows that are also
    # in the training split. The reported scores are therefore optimistic;
    # consider splitting first and oversampling only the training part.
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

    # List of models to train
    models = [
        (RandomForestClassifier(class_weight='balanced', random_state=42), 'RandomForestClassifier'),
        (LogisticRegression(class_weight='balanced', random_state=42, warm_start=True), 'LogisticRegression'),
        (GradientBoostingClassifier(random_state=42), 'GradientBoostingClassifier'),
        (KNeighborsClassifier(), 'KNeighborsClassifier'),
        (LinearSVC(class_weight='balanced', random_state=42, dual='auto'), 'LinearSVC')
    ]

    # Train, evaluate, and save each model
    for model, model_name in models:
        best_model, report = train_evaluate_save_model(model, model_name, X_train, X_test, y_train, y_test, df, vectorizer, weights, feature_names, output_file_path_text)

        # Save classification report
        with open(f'{model_name}_classification_report.txt', 'w') as f:
            f.write(report)

    # Save the vectorizer for future use
    joblib.dump(vectorizer, 'vectorizer.joblib')
    print("Vectorizer saved for future use")

    print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

It started on: 2025-06-09 11:30:53
v
Using Viznet Filtered Datasets
         index  name  area  url
0  SATO_000001   NaN   NaN  NaN
1  SATO_000002   NaN   NaN  NaN
2  SATO_000003   NaN   NaN  NaN
3  SATO_000004   NaN   NaN  NaN
4  SATO_000005   NaN   NaN  NaN
Original df shape: (74909, 5)
AFTER CLEAN   dataset_index  name         area        Original Column ID  \
0   SATO_000001   NaN  Sato-Viznet                1.   ID  1   
1   SATO_000001   NaN  Sato-Viznet           2.   Country  2   
2   SATO_000001   NaN  Sato-Viznet        3.   Unnamed: 2  3   
3   SATO_000001   NaN  Sato-Viznet  4.   Complete-o-meter  4   
4   SATO_000001   NaN  Sato-Viznet           5.   Courses  5   

             Column Description  
0                ID         NaN  
1           Country         NaN  
2           Unnamed           2  
3  Complete-o-meter         NaN  
4           Courses         NaN  
      dataset_index  name         area        Original Column ID  \
0       SATO_000001   NaN  Sato-Viznet   

STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best parameters for LogisticRegression: {'C': 0.1, 'max_iter': 20000, 'solver': 'lbfgs', 'warm_start': True}
Classification Report for LogisticRegression:
               precision    recall  f1-score   support

 E-mailformat       1.00      1.00      1.00      3472
     IDcolumn       1.00      1.00      1.00      3472
     IPformat       1.00      0.99      0.99      3472
    URLformat       1.00      0.98      0.99      3472
          age       1.00      1.00      1.00      3472
        angle       1.00      1.00      1.00      3472
       binary       1.00      0.97      0.98      3472
bloodpressure       1.00      1.00      1.00      3472
  categorical       0.98      0.95      0.97      3472
         city       0.99      1.00      0.99      3472
      country       1.00      1.00      1.00      3472
         date       1.00      0.94      0.97      3472
     datetime       0.98      1.00      0.99      3472
          day       1.00      1.00      1.00      3472
    heartrate      



Best parameters for KNeighborsClassifier: {'n_neighbors': 3, 'weights': 'uniform'}
Classification Report for KNeighborsClassifier:
               precision    recall  f1-score   support

 E-mailformat       1.00      0.99      1.00      3472
     IDcolumn       1.00      1.00      1.00      3472
     IPformat       0.98      0.99      0.99      3472
    URLformat       0.99      1.00      0.99      3472
          age       1.00      1.00      1.00      3472
        angle       1.00      1.00      1.00      3472
       binary       0.99      0.97      0.98      3472
bloodpressure       1.00      1.00      1.00      3472
  categorical       0.98      0.94      0.96      3472
         city       0.99      0.99      0.99      3472
      country       1.00      1.00      1.00      3472
         date       0.91      0.97      0.94      3472
     datetime       0.99      0.91      0.95      3472
          day       1.00      1.00      1.00      3472
    heartrate       1.00      1.00      1.0



Best parameters for LinearSVC: {'C': 0.1, 'dual': 'auto', 'max_iter': 20000}
Classification Report for LinearSVC:
               precision    recall  f1-score   support

 E-mailformat       1.00      1.00      1.00      3472
     IDcolumn       1.00      1.00      1.00      3472
     IPformat       1.00      0.99      0.99      3472
    URLformat       0.99      1.00      1.00      3472
          age       1.00      1.00      1.00      3472
        angle       1.00      1.00      1.00      3472
       binary       1.00      0.97      0.98      3472
bloodpressure       1.00      1.00      1.00      3472
  categorical       0.98      0.94      0.96      3472
         city       0.99      0.99      0.99      3472
      country       1.00      1.00      1.00      3472
         date       0.89      0.95      0.92      3472
     datetime       0.99      0.89      0.94      3472
          day       1.00      1.00      1.00      3472
    heartrate       1.00      1.00      1.00      3472
     