In [None]:
import sys
sys.path.append('../libraries/')
sys.path.append('results/')

import pandas as pd
import matplotlib.pyplot as plt
import shap
import libraries.model_tools as mt

from shap import KernelExplainer, Explanation

import os

In [None]:
# Display the kernel's current working directory (handy to check before the
# os.chdir call in the next cell changes it).
os.getcwd()

In [None]:
# NOTE(review): hard-coded absolute path to one developer's checkout; os.chdir
# raises if this directory does not exist. The relative paths used further down
# in this cell (e.g. "results/advert_flags.csv") resolve against it.
os.chdir('/Users/christostrydom/github_repos/tinyhands/application/lji_social_media/')

# Red-flag columns of the advert CSV that are used as model features
# (X = advert_flags[DATA_COLUMNS] below). load_and_preprocess_data coerces each
# of them to int. Entries that are commented out are left out of the feature set.
# NOTE(review): presumably the content and order must match what the saved
# pipeline was trained on -- confirm before editing this list.
DATA_COLUMNS = [
    "assure_prompt",
    "bypass_prompt",
    "callback_request_prompt",
    "false_organization_prompt",
    "gender_specific_prompt",
    "illegal_activities_prompt",
    "immediate_hiring_prompt",
    "language_switch_prompt",
    # "multiple_applicants_prompt",
    # "multiple_jobs_prompt",
    "multiple_provinces_prompt",
    "no_education_skilled_prompt",
    "no_location_prompt",
    "quick_money_prompt",
    "recruit_students_prompt",
    # "requires_references",
    "suspicious_email_prompt",
    "target_specific_group_prompt",
    "unprofessional_writing_prompt",
    "unrealistic_hiring_number_prompt",
    "unusual_hours_prompt",
    "vague_description_prompt",
    "wrong_link_prompt",
]


def load_and_preprocess_data(file_path, columns=None):
    """
    Load the advert-flag CSV and turn the red-flag columns into integer features.

    Steps:
      1. Read the CSV and keep only rows whose ``monitor_score`` is present and
         is not the literal string ``"unknown"``.
      2. Replace ``"yes"``/``"no"`` with 1/0 everywhere in the frame.
      3. Coerce every flag column to numeric; rows in which any flag cannot be
         parsed are dropped, and the flag columns are cast to ``int``.

    Row labels from the CSV are preserved (the index is not reset), so the
    returned frame's index has gaps wherever rows were filtered out. Use
    positional access (``.iloc``) when lining rows up with positional arrays.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.
    columns : sequence of str, optional
        Flag columns to convert. Defaults to the module-level ``DATA_COLUMNS``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The cleaned frame (all original columns kept) and its ``monitor_score``
        column, returned as-is (it is not converted to a numeric dtype here).

    Raises
    ------
    KeyError
        If ``monitor_score`` or any requested flag column is missing.
    """
    flag_columns = list(DATA_COLUMNS if columns is None else columns)

    model_data = pd.read_csv(file_path)
    score = model_data["monitor_score"]
    model_data = model_data[(score != "unknown") & score.notna()]
    model_data = model_data.replace({"yes": 1, "no": 0})

    # Coerce all flag columns in one pass; anything non-numeric becomes NaN.
    flags = model_data[flag_columns].apply(pd.to_numeric, errors="coerce")
    parseable = flags.notna().all(axis=1)

    # Work on an explicit copy so the column assignments below never write
    # into a view of the filtered frame (avoids SettingWithCopyWarning).
    model_data = model_data.loc[parseable].copy()
    for col in flag_columns:
        model_data[col] = flags.loc[parseable, col].astype(int)

    return model_data, model_data["monitor_score"]



# Load the previously trained pipeline through the project helper.
unique_model_filename = "redflag_model_2024-10-18T14_28_09_548540.pkl"  # Update with the actual filename
trained_pipeline = mt.load_model(unique_model_filename)

file_path = "results/advert_flags.csv"
advert_flags, y = load_and_preprocess_data(file_path)
X = advert_flags[DATA_COLUMNS]

# Apply every pipeline step except the last one (assumed to be the estimator)
# so the explainer sees the same inputs the estimator itself receives.
X_transformed = trained_pipeline[:-1].transform(X)

# Wrap the transformed data in a DataFrame so SHAP can label the features.
# NOTE(review): no index is passed here, so unless transform() already returns
# a DataFrame the rows are labelled 0..n-1, whereas X keeps the CSV row labels
# that survived filtering. Line rows up by position (.iloc), not by label.
X_transformed_df = pd.DataFrame(X_transformed, columns=X.columns[:X_transformed.shape[1]])

# KernelExplainer around the final estimator's predict(), using the whole
# transformed dataset as background data.
explainer = KernelExplainer(trained_pipeline[-1].predict, X_transformed_df)

# SHAP values for every row (100 sampled coalitions per row).
shap_values = explainer.shap_values(X_transformed_df, nsamples=100)

# Summary plot across the entire dataset.
plt.figure()
shap.summary_plot(shap_values, X_transformed_df)

# Single-prediction view: choose a row by position (e.g. row 0).
instance_index = 0
shap_values_instance = shap_values[instance_index]

# The waterfall plot starts from the explainer's expected value and shows the
# feature values next to the bars, so the Explanation must carry base_values
# and data in addition to the SHAP values themselves.
plt.figure()
shap.plots.waterfall(
    Explanation(
        values=shap_values_instance,
        base_values=explainer.expected_value,
        data=X_transformed_df.iloc[instance_index].to_numpy(),
        feature_names=list(X_transformed_df.columns),
    ),
    max_display=20,
)

# Show the plots
plt.show()

In [None]:
# Inspect the cleaned advert row at position 3 (positional access, independent
# of the index labels).
advert_flags.iloc[3]

In [None]:
# Display the cleaned advert frame returned by load_and_preprocess_data.
advert_flags

In [None]:
# Global picture first: summary plot of the SHAP values over every row.
plt.figure()
shap.summary_plot(shap_values, features=X_transformed_df)

# Then drill into a single prediction, selected by row position.
instance_index = 3
shap_values_instance = shap_values[instance_index]

# Bundle that row's SHAP values with the explainer's expected value and the
# row's transformed feature values so the waterfall plot can be drawn.
shap_values_instance_explanation = shap.Explanation(
    shap_values_instance,
    base_values=explainer.expected_value,
    data=X_transformed_df.iloc[instance_index],
    feature_names=X_transformed_df.columns,
)

# Waterfall plot for the selected row.
plt.figure()
shap.plots.waterfall(shap_values_instance_explanation, max_display=20)

# Render any figure that is still pending.
plt.show()

In [None]:
# Rank features by their mean absolute SHAP value (averaged over rows),
# largest first.
feature_importances = pd.DataFrame(
    {
        "Feature": X_transformed_df.columns,
        "Importance": abs(shap_values).mean(axis=0),
    }
).sort_values("Importance", ascending=False)

print("Feature Importances:", feature_importances, sep="\n")

In [None]:
# Select the row to inspect by position (here row 3) and pull its SHAP values;
# the cells below read both names.
instance_index = 3
shap_values_instance = shap_values[instance_index]

In [None]:
# Untransformed feature values of the row at position `instance_index`.
X.iloc[instance_index]

In [None]:
# Transformed feature values at the same position, for comparison with the
# untransformed row.
X_transformed_df.iloc[instance_index]

In [None]:
# Create an Explanation object for the individual instance: its SHAP values,
# the explainer's expected value as the baseline, the row's transformed feature
# values, and the feature names. The plotting cell below reads this object.
shap_values_instance_explanation = Explanation(
    values=shap_values_instance,
    base_values=explainer.expected_value,
    data=X_transformed_df.iloc[instance_index],
    feature_names=X_transformed_df.columns
)

In [None]:
# Print the transformed row whose index label is 2.
# NOTE(review): X_transformed_df is built without an explicit index, so this is
# presumably the third row by position -- confirm.
print(X_transformed_df.loc[2])

In [None]:
# Force plot for the selected instance: shows how each feature pushes the
# prediction away from the base value. With matplotlib=True SHAP creates its
# own figure, so no plt.figure() call precedes it (one would only leave an
# empty stray figure in the output).
shap.force_plot(
    base_value=shap_values_instance_explanation.base_values,
    shap_values=shap_values_instance_explanation.values,
    features=shap_values_instance_explanation.data,
    feature_names=shap_values_instance_explanation.feature_names,
    matplotlib=True
)

# Waterfall plot for the same instance, drawn into a fresh figure.
plt.figure()
shap.plots.waterfall(shap_values_instance_explanation, max_display=20)

# Show the plots
plt.show()

In [None]:
# Untransformed features of the third row, selected by position. X keeps the
# CSV row labels that survived filtering, so X.loc[2] could point at a
# different advert than the third transformed row (or raise KeyError if CSV
# row 2 was dropped); .iloc keeps the two views aligned.
X.iloc[2]

In [None]:
# List the adverts whose illegal_activities_prompt flag is 1, together with
# their monitor_score.
advert_flags.loc[advert_flags['illegal_activities_prompt']==1, ['illegal_activities_prompt', 'monitor_score']]

In [None]:
instance_index = 3  # Example index (a row position, as in shap_values[instance_index])

# Compare the same row in both frames by position. X keeps the CSV row labels
# left after filtering, while the SHAP values and the transformed frame are
# addressed by position, so label-based .loc on X could read a different advert.
print("Original value in X:", X["gender_specific_prompt"].iloc[instance_index])
print("Transformed value in X_transformed_df:", X_transformed_df["gender_specific_prompt"].iloc[instance_index])


In [None]:
raw_data = pd.read_csv("results/advert_flags.csv")
# `instance_index` is a position within the filtered data; X.index maps it back
# to the row label that advert has in the unfiltered CSV, so the raw lookup
# addresses the same advert even when earlier rows were dropped.
print("Raw data value for gender_specific_prompt:", raw_data.loc[X.index[instance_index], "gender_specific_prompt"])
