In [None]:
import numpy as np
import pandas as pd

def preprocess_data(df, rare_threshold=0.05):
    """Rename, subset, recode and coarsen the raw incident table.

    Steps, in order:

    1. Rename the raw columns to short names (``iyear`` -> ``Year``, ...).
    2. Keep only the twelve analysis columns listed in ``selected_columns``.
    3. Replace missing ``Dead`` values with 0 and derive the binary
       ``Lethal`` column (0 when ``Dead == 0``, otherwise 1).
    4. Recode selected labels in ``Attack``, ``Target``, ``Group``,
       ``Province`` and ``Weapon``; labels without a mapping are kept as-is.
    5. In each categorical column, replace every category whose share of the
       non-missing rows is below ``rare_threshold`` with ``'Other<Column>'``.

    The raw column names look like a Global Terrorism Database export --
    TODO confirm against the data source.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. It is not modified; a new frame is returned.
    rare_threshold : float, default 0.05
        Relative-frequency cut-off below which a category is collapsed.

    Returns
    -------
    pandas.DataFrame
        The twelve selected columns followed by ``Lethal``.

    Raises
    ------
    KeyError
        If one of the selected columns is missing after renaming.
    """
    column_mapping = {
        "iyear": "Year",
        "imonth": "Month",
        "iday": "Day",
        "extended": "Extended",
        "country_txt": "Country",
        "region_txt": "Region",
        "provstate": "Province",
        "city": "City",
        "success": "Success",
        "multiple": "Multiple",
        "suicide": "Suicide",
        "attacktype1_txt": "Attack",
        "gname": "Group",
        "targtype1_txt": "Target",
        "natlty1_txt": "Nationality",
        "weaptype1_txt": "Weapon",
        "nkill": "Dead",
    }
    selected_columns = ['Year', 'Month', 'Day', 'Country', 'Region', 'Province', 'City',
                        'Attack', 'Target', 'Group', 'Weapon', 'Dead']

    # Explicit copy: the columns assigned below must land on an independent
    # frame, not on a slice of the renamed one (avoids pandas'
    # chained-assignment warning and any ambiguity about what is written).
    df = df.rename(columns=column_mapping)[selected_columns].copy()

    # Missing death counts are treated as zero deaths.
    df['Dead'] = df['Dead'].fillna(0)

    # Binary outcome: 1 if at least one death was recorded.
    df['Lethal'] = np.where(df['Dead'] == 0, 0, 1)

    value_mappings = {
        'Attack': {
            "Bombing/Explosion": "BombAttack",
            "Hostage Taking (Kidnapping)": "HostageKidnapAttack",
            "Facility/Infrastructure Attack": "InfrastructureAttack",
            "Armed Assault": "ArmedAssaultAttack",
            "Unarmed Assault": "UnarmedAssaultAttack",
            "Hostage Taking (Barricade Incident)": "HostageBarricadeAttack",
        },
        'Target': {
            "Private Citizens & Property": "Private",
            "Government (Diplomatic)": "GovtDip",
            "Journalists & Media": "JournalistsMedia",
            "Government (General)": "GovtGen",
            "Airports & Aircraft": "AirportsAircraft",
            "Educational Institution": "EduIns",
            "Violent Political Party": "VPPTarget",
            "Religious Figures/Institutions": "RelFigIns",
            "Unknown": "UnknownTarget",
            "Food or Water Supply": "FoodWaterSup",
            "Terrorists/Non-State Militia": "TNSMTarget",
            "Abortion Related": "Abortion",
        },
        'Group': {
            # NOTE(review): the original key carried a "Group." prefix, which
            # looks like a leftover from a dummy-variable column name. The
            # unprefixed label is added so the raw group name is recoded too;
            # the prefixed key is kept for backward compatibility -- confirm
            # against the data.
            "Islamic State of Iraq and the Levant (ISIL)": "ISIS",
            "Group.Islamic State of Iraq and the Levant (ISIL)": "ISIS",
            "Tehrik-i-Taliban Pakistan (TTP)": "TTP",
            "Revolutionary Armed Forces of Colombia (FARC)": "FARC",
            "M-19 (Movement of April 19)": "M19",
            "National Liberation Army of Colombia (ELN)": "ELN",
            "Unknown": "OtherGroup",
            "Tupac Amaru Revolutionary Movement (MRTA)": "MRTA",
            "Shining Path (SL)": "ShiningPath",
            "Salafist Group for Preaching and Fighting (GSPC)": "GSPC",
            "Islamic Salvation Front (FIS)": "FIS",
            "Algerian Islamic Extremists": "Algerian_Islamic_Extremists",
            "Al-Qaida in the Islamic Maghreb (AQIM)": "AQIM",
            "Armed Islamic Group (GIA)": "GIA",
            "Farabundo Marti National Liberation Front (FMLN)": "FMLN",
            "Liberation Tigers of Tamil Eelam (LTTE)": "LTTE",
        },
        'Province': {
            "North-West Frontier Province": "NWFP",
            "Federally Administered Tribal Areas": "FATA",
            "Khyber Pakhtunkhwa": "Khyber_Pakhtunkhwa",
            "Al Anbar": "Al_Anbar",
            "Tizi Ouzou": "Tizi_Ouzou",
            "North Central": "NorthCentral",
            "Valle del Cauca": "ValledelCauca",
            "Bogota": "BogotaProvince",
        },
        'Weapon': {
            "Unknown": "OtherWeapon",
        },
    }

    # map() yields NaN for labels without a mapping; fillna() restores the
    # original label in those positions, so only mapped labels change.
    for column, mapping in value_mappings.items():
        df[column] = df[column].map(mapping).fillna(df[column])

    # Collapse infrequent categories. value_counts(normalize=True) ignores
    # missing values, so shares are relative to the non-missing rows.
    categorical_columns = ['Attack', 'Target', 'Group', 'Province', 'Weapon', 'Country', 'City']
    for column in categorical_columns:
        shares = df[column].value_counts(normalize=True)
        rare_categories = shares[shares < rare_threshold].index
        df[column] = np.where(df[column].isin(rare_categories), 'Other' + column, df[column])

    return df


In [None]:
import numpy as np
import warnings
from scipy.stats import pearsonr
from sklearn.feature_selection import VarianceThreshold

def preprocess_dataframe_three(df, target_column, correlation_threshold=0.7, variance_threshold=0.1):
    """Drop rows with missing values, then drop columns flagged by three filters.

    A column is removed when any of the following holds:

    * its absolute Pearson correlation with ``target_column`` is at least
      ``correlation_threshold`` (``target_column``, ``'Lethal'`` and
      ``'Year'`` are never tested here);
    * its absolute pairwise correlation with any other column is at least
      ``correlation_threshold`` -- both members of such a pair are removed;
    * ``VarianceThreshold(threshold=variance_threshold)`` does not select it.

    NOTE(review): only the first filter skips the target. The pairwise and
    variance filters run over every column, so ``target_column`` itself can
    still be removed by them -- confirm whether callers rely on that.

    All columns are assumed to be numeric -- TODO confirm; ``pearsonr``,
    ``DataFrame.corr`` and ``VarianceThreshold`` are applied to them directly.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; not modified.
    target_column : str
        Column used as the reference for the first filter.
    correlation_threshold : float, default 0.7
        Absolute-correlation cut-off used by the first two filters.
    variance_threshold : float, default 0.1
        Threshold handed to ``VarianceThreshold``.

    Returns
    -------
    pandas.DataFrame
        ``df`` without rows containing NA and without the flagged columns.
    """
    # Remove NA values
    df = df.dropna()

    # Filter 1: strong linear relationship with the target. The target itself
    # is skipped (its self-correlation is 1 and would always flag it); the
    # historical hard-coded skips are kept for backward compatibility.
    skipped = {target_column, 'Lethal', 'Year'}
    linear_features = []
    for column in df.columns:
        if column in skipped:
            continue
        with warnings.catch_warnings():
            # Runtime warnings raised while computing the coefficient are
            # silenced; an undefined coefficient comes back as NaN.
            warnings.filterwarnings('ignore', category=RuntimeWarning)
            correlation, _ = pearsonr(df[column], df[target_column])
        # A NaN coefficient compares False and is therefore never flagged.
        if abs(correlation) >= correlation_threshold:
            linear_features.append(column)

    # Filter 2: strong pairwise correlation. The matrix is symmetric, so a
    # column is flagged when any off-diagonal entry in its column reaches the
    # threshold; NaN entries compare False.
    correlation_matrix = df.corr()
    strong = correlation_matrix.abs().to_numpy() >= correlation_threshold
    np.fill_diagonal(strong, False)
    correlated_features = set(correlation_matrix.columns[strong.any(axis=0)])

    # Filter 3: zero and near-zero variance
    selector = VarianceThreshold(threshold=variance_threshold)
    selector.fit(df)
    low_variance_features = df.columns[~selector.get_support()]

    # Combine all features to be removed
    features_to_remove = set(linear_features) | correlated_features | set(low_variance_features)

    # Return processed DataFrame with all flagged features removed
    return df.drop(columns=list(features_to_remove))



In [None]:
from boruta import BorutaPy
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import random
import numpy as np

def run_boruta_iterations(data, target, n_iterations=10, random_seed=42, file_prefix='boruta'):
    """Fit Boruta repeatedly and track which features are selected every time.

    Each iteration builds a fresh ``RandomForestClassifier(n_estimators=100)``
    wrapped in ``BorutaPy``, fits it on ``data.values`` / ``target.values``,
    writes the per-feature table to ``<file_prefix>_iteration_<k>.csv`` and
    prints the table, the selected features and the running intersection.

    Side effects: seeds Python's ``random`` and NumPy's global RNG with
    ``random_seed``, sets ``np.int`` / ``np.float`` / ``np.bool`` on the NumPy
    module, writes one CSV per iteration and prints to stdout.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table; its column names label the results.
    target : pandas.Series
        Target values (``.values`` is passed to Boruta).
    n_iterations : int, default 10
        Number of Boruta fits.
    random_seed : int, default 42
        Seed for both global RNGs.
    file_prefix : str, default 'boruta'
        Prefix of the CSV files written per iteration.

    Returns
    -------
    tuple
        ``(results, selected_variables, common_strings)``: the list of
        per-iteration tables, the list of per-iteration selected feature
        names, and the set of names selected in every iteration.
    """
    # Seed both global generators.
    np.random.seed(random_seed)
    random.seed(random_seed)

    # Boruta still refers to np.int / np.float / np.bool; point those names
    # at the builtins so it keeps working.
    for alias, builtin_type in (('int', int), ('float', float), ('bool', bool)):
        setattr(np, alias, builtin_type)

    tables = []
    chosen_per_round = []
    always_chosen = set()

    for round_no in range(1, n_iterations + 1):
        # Fresh forest and Boruta wrapper for every round.
        forest = RandomForestClassifier(n_estimators=100)  # Choose your desired estimator
        selector = BorutaPy(estimator=forest)
        selector.fit(data.values, target.values)

        # Per-feature outcome of this round.
        table = pd.DataFrame({
            'Feature': data.columns,
            'Rank': selector.ranking_,
            'Support': selector.support_
        })
        tables.append(table)

        # Names of the features Boruta confirmed in this round.
        chosen = table.loc[selector.support_, 'Feature'].tolist()
        chosen_per_round.append(chosen)

        # The first round initialises the intersection; later rounds narrow it.
        if round_no > 1:
            always_chosen.intersection_update(chosen)
        else:
            always_chosen.update(chosen)

        # Persist this round's table.
        table.to_csv(f'{file_prefix}_iteration_{round_no}.csv', index=False)

        # Report this round.
        print(f"Iteration {round_no} results:")
        print(table)
        print(f"Selected variables in iteration {round_no}:")
        print(chosen)
        print(f"Common strings after iteration {round_no}:")
        print(always_chosen)
        print()

    return tables, chosen_per_round, always_chosen


In [None]:
from sklearn.linear_model import LassoCV
from sklearn.model_selection import cross_val_score
import numpy as np

def generate_lasso_iterations_one(X, y):
    """Fit ``LassoCV`` ten times and report the features with non-zero weight.

    For each of the ten iterations a seed is drawn from NumPy's global RNG
    (itself seeded with 42), a ``LassoCV(cv=10)`` model is fitted on
    ``X`` / ``y``, its 10-fold ``cross_val_score`` mean is printed, and every
    feature whose coefficient is non-zero is printed and recorded.

    NOTE(review): the per-iteration ``random_state`` presumably only matters
    when LassoCV uses random coordinate selection; with the defaults used here
    all ten fits may be identical -- confirm against the scikit-learn docs.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; ``X.columns`` supplies the feature names.
    y : array-like
        Target values.

    Returns
    -------
    tuple
        ``(vectors, common_features)``: one list of selected feature names per
        iteration, and the set of names present in every list.
    """
    np.random.seed(42)  # fixed seed so the drawn per-iteration seeds repeat
    vectors = []

    for iteration in range(1, 11):
        seed = np.random.randint(1000)
        lasso = LassoCV(random_state=seed, cv=10)

        # Fit on the full data, then score the same configuration by 10-fold CV.
        lasso.fit(X, y)
        cv_scores = cross_val_score(lasso, X, y, cv=10)

        print(f"Iteration {iteration} - Average Cross-Validation Score: {np.mean(cv_scores):.3f}")

        print(f"Iteration {iteration} - Optimal Coefficients:")
        # Keep only features the lasso did not shrink to exactly zero.
        surviving = [(name, weight) for name, weight in zip(X.columns, lasso.coef_) if weight != 0]
        for name, weight in surviving:
            print(f"{name}: {weight}")

        vectors.append([name for name, _ in surviving])
        print()

    # Features that survived in every iteration.
    common_features = set.intersection(*(set(names) for names in vectors))
    print("Common Features in All Vectors:")
    for name in common_features:
        print(name)

    return vectors, common_features

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import joblib

def caret_model_proportion(data, target_col, Output, Output_Two, Output_Three, test_size=0.2):
    """Fit three classifiers on a stratified split and report test metrics.

    The data is split once (stratified on ``target_col``, ``random_state=123``).
    For every combination of a sampling label and a model class
    (``LogisticRegression``, ``GradientBoostingClassifier``,
    ``RandomForestClassifier``) a fresh estimator with default settings is
    fitted on the training part, saved with ``joblib.dump`` to
    ``<Output>_<label>_<ClassName>Output.pkl`` and evaluated on the test part.

    The sampling label is only used to build the result key and the file
    name; it does not change how the model is trained.

    The metric code unpacks a 2x2 confusion matrix, so a binary target is
    required.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the features and the target column.
    target_col : str
        Name of the target column.
    Output : str
        Prefix of the pickle files written for each fitted model.
    Output_Two, Output_Three
        Accepted for interface compatibility; not used.
    test_size : float, default 0.2
        Share of rows held out for testing.

    Returns
    -------
    tuple of dict
        ``(model_outputs, performance_metrics)``, both keyed by
        ``"<label>_<ClassName>"``. ``model_outputs`` holds one independently
        fitted estimator per key; ``performance_metrics`` holds the
        row-normalised confusion matrix and accuracy, precision, recall,
        F1 and specificity, all formatted as percentage strings.
    """
    # Set the seed for reproducibility
    np.random.seed(123)

    model_outputs = {}
    performance_metrics = {}

    sampling_methods = ["none", "boot", "LGOCV", "cv", "repeatedcv"]
    # Classes rather than instances: every run gets its own estimator, so the
    # entries stored in model_outputs are not aliases of one refitted object.
    model_classes = [LogisticRegression, GradientBoostingClassifier, RandomForestClassifier]

    # Split the data into train and test sets using stratified random sampling
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(target_col, axis=1),
        data[target_col],
        test_size=test_size,
        stratify=data[target_col],
        random_state=123
    )

    for sampling_method in sampling_methods:
        for model_class in model_classes:
            # Re-seed before each fit so every run starts from the same RNG state.
            np.random.seed(123)

            model = model_class()
            model.fit(X_train, y_train)

            run_name = sampling_method + "_" + model_class.__name__
            model_outputs[run_name] = model

            # Persist the fitted model.
            output_file = Output + "_" + run_name + "Output.pkl"
            joblib.dump(model, output_file)

            y_pred = model.predict(X_test)
            cm = confusion_matrix(y_test, y_pred)

            # Row-normalise the confusion matrix (rows sum to 100%) and
            # format every cell as a percentage string.
            row_sums = cm.sum(axis=1, keepdims=True)
            cm_percent = np.round((cm / row_sums) * 100, decimals=2)
            cm_percent_formatted = np.array(
                ["{:.2f}%".format(value) for value in cm_percent.ravel()]
            ).reshape(cm_percent.shape)

            accuracy = accuracy_score(y_test, y_pred) * 100
            precision = precision_score(y_test, y_pred) * 100
            recall = recall_score(y_test, y_pred) * 100
            f1 = f1_score(y_test, y_pred) * 100

            # Binary layout of the raveled matrix: tn, fp, fn, tp.
            tn, fp, fn, tp = cm.ravel()
            specificity = (tn / (tn + fp)) * 100

            performance_metrics[run_name] = {
                'Confusion Matrix': cm_percent_formatted,
                'Accuracy': "{:.2f}%".format(accuracy),
                'Precision': "{:.2f}%".format(precision),
                'Recall': "{:.2f}%".format(recall),
                'F1-Score': "{:.2f}%".format(f1),
                'Specificity': "{:.2f}%".format(specificity)
            }

    return model_outputs, performance_metrics

