<a href="https://colab.research.google.com/github/LeonHauch/bachelorarbeit/blob/main/BA_TextData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-specific helper module that provides the file-upload call used below
from google.colab import files


# Start the upload; the next cell reads "cyberbullying_tweets.csv" from the working directory
uploaded = files.upload()


Saving cyberbullying_tweets.csv to cyberbullying_tweets.csv


In [None]:
import pandas as pd

# Load the raw tweet dataset uploaded in the previous cell
df = pd.read_csv("cyberbullying_tweets.csv")

# General analysis: keep the first rows for inspection
head_data = df.head()

# `DataFrame.info()` prints its summary and returns None, so it is called for
# its printed output only instead of binding a meaningless None to a name.
df.info()

# Relative frequency of every label
label_distribution = df['cyberbullying_type'].value_counts(normalize=True)

# Last expression of the cell: rendered as the cell output
label_distribution


In [None]:
pip install pandas matplotlib seaborn


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Remove duplicates. The explicit copy makes `data_cleaned` an independent frame,
# which guards the column assignment below against pandas' SettingWithCopyWarning.
data_cleaned = df.drop_duplicates().copy()

# Create a column for tweet length (characters). `.str.len()` yields NaN for
# missing texts instead of raising, unlike `apply(len)`.
data_cleaned['tweet_length'] = data_cleaned['tweet_text'].str.len()

# Visualize class distribution
class_distribution = data_cleaned['cyberbullying_type'].value_counts()
class_distribution.plot(kind='bar', color=['skyblue', 'salmon'])
plt.title('Distribution of Cyberbullying Classes')
plt.xlabel('Cyberbullying Type')
plt.ylabel('Number of Instances')
plt.xticks(rotation=0)
plt.show()

# Visualize the distribution of tweet lengths for each class
plt.figure(figsize=(12, 8))
for label in class_distribution.index:
    subset = data_cleaned[data_cleaned['cyberbullying_type'] == label]
    # `fill=True` is the current spelling of the deprecated `shade=True`
    sns.kdeplot(subset['tweet_length'], label=label, fill=True)

plt.title('Distribution of Tweet Lengths by Cyberbullying Type')
plt.xlabel('Tweet Length')
plt.ylabel('Density')
plt.legend()
plt.show()


In [None]:
# Data-quality checks on the loaded frame

# Count of missing entries in every column
missing_values = df.isna().sum()

# Count of rows that repeat an earlier row exactly
duplicates = df.duplicated(keep='first').sum()

# Show both results together as the cell output
missing_values, duplicates


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources the preprocessing step relies on:
#   punkt / punkt_tab - tokenizer data for nltk.word_tokenize
#                       (recent NLTK releases look up 'punkt_tab' instead of 'punkt')
#   stopwords         - word lists used for stop-word removal
#   wordnet           - data used by WordNetLemmatizer
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Preprocessing function
def _get_preprocessing_resources():
    """Return the shared ``(stop-word set, lemmatizer)`` pair, creating it on first use.

    Neither object depends on the text being processed, so they are built once
    and cached on the function object instead of being rebuilt for every tweet.
    """
    cached = getattr(_get_preprocessing_resources, '_cached', None)
    if cached is None:
        cached = (set(stopwords.words('english')), WordNetLemmatizer())
        _get_preprocessing_resources._cached = cached
    return cached


def preprocess_text(text):
    """Normalise one tweet for vectorisation.

    Steps: lowercase, strip punctuation/special characters/digits, tokenize,
    drop English stop words, lemmatize every remaining token.

    Args:
        text: The tweet as a string.

    Returns:
        The processed tokens joined by single spaces (may be an empty string
        when no token survives).
    """
    stopwords_list, lemmatizer = _get_preprocessing_resources()

    # Convert to lowercase
    text = text.lower()
    # Remove punctuation, special characters, and digits
    # (underscores survive because they are part of \w)
    text = re.sub(r'[^\w\s]|[\d]', '', text)
    # Tokenization
    tokens = nltk.word_tokenize(text)
    # Remove stop words, then lemmatize what is left
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stopwords_list
    )

# Run every tweet through preprocess_text; the result replaces the text column of `df`,
# so re-running this cell processes already-processed text again.
df['tweet_text'] = df['tweet_text'].map(preprocess_text)

# Preview the first preprocessed rows (rendered as the cell output)
df.head()


In [None]:
# Take a copy of the preprocessed frame so later steps do not modify `df`
df_preprocessed = df.copy()

# Remove rows that contain NaN values
df_preprocessed = df_preprocessed.dropna()

# Print the column/dtype summary. `DataFrame.info()` returns None, so it is
# called for its printed output only rather than being stored and displayed.
df_preprocessed.info()

# Show the first rows as the cell output
head_data = df_preprocessed.head()
head_data


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score


# Split the texts BEFORE vectorising: fitting a vectorizer on the full corpus
# would let vocabulary and IDF statistics of the test tweets leak into training.
texts = df_preprocessed['tweet_text']
y = df_preprocessed['cyberbullying_type']
texts_train, texts_test, y_train, y_test = train_test_split(
    texts, y, test_size=0.2, random_state=42
)

# Prepare the vectorizers
tfidf_vectorizer = TfidfVectorizer()
count_vectorizer = CountVectorizer()

# Prepare the models (the random forest is seeded so the table is reproducible)
models = {
    'SVC': SVC(),
    'RandomForest': RandomForestClassifier(random_state=42),
    'kNN': KNeighborsClassifier()  # kNN added
}

results = []

# Iterate over every combination of vectorizer and model
for vectorizer_name, vectorizer in [('TF-IDF', tfidf_vectorizer), ('CountVectorizer', count_vectorizer)]:
    # Learn the vocabulary on the training texts only, then reuse it for the test texts
    X_train = vectorizer.fit_transform(texts_train)
    X_test = vectorizer.transform(texts_test)

    for model_name, model in models.items():
        # fit() retrains the estimator from scratch, so reusing the instance is safe
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        results.append({
            'Vectorizer': vectorizer_name,
            'Model': model_name,
            'Accuracy': accuracy,
            'F1 Score': f1
        })

# Show the results
results_df = pd.DataFrame(results)
display(results_df)


In [None]:
!pip install lime shap

In [None]:
# LIME implementation; the CountVectorizer's feature space is not limited here
from sklearn.svm import SVC
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Assuming df_preprocessed and other necessary dataframes are already loaded and preprocessed

# Vectorize using CountVectorizer
count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(df_preprocessed['tweet_text'])
y = df_preprocessed['cyberbullying_type']

# Split the data
X_train_count, X_test_count, y_train, y_test = train_test_split(X_count, y, test_size=0.2, random_state=42)

# Create and train the SVC model (probability=True is required for predict_proba)
svc = SVC(probability=True)
svc.fit(X_train_count, y_train)

# Custom prediction function for LIME
def custom_predict(texts):
    """Return the SVC class probabilities for an iterable of raw text strings.

    LIME passes perturbed texts; they are vectorised with the fitted
    `count_vectorizer` before being scored by `svc`.
    """
    transformed_data = count_vectorizer.transform(texts)
    return svc.predict_proba(transformed_data)

# LIME explainer. The class names must be listed in the column order of
# `svc.predict_proba`, i.e. `svc.classes_`; `y.unique()` is in order of first
# appearance and would attach the wrong names to the explained classes.
explainer = LimeTextExplainer(class_names=list(svc.classes_))

# Seed for the instance sampling, so that references to "instance 5" etc. stay reproducible
SAMPLE_SEED = 42

# 10 instances from the whole dataset; at least one per category
sample_instances = []
for category in y.unique():
    sample = df_preprocessed[df_preprocessed['cyberbullying_type'] == category].sample(1, random_state=SAMPLE_SEED)
    sample_instances.append(sample['tweet_text'].values[0])

# If we have fewer than 10 categories, fill the rest with random samples
n_missing = 10 - len(sample_instances)
if n_missing > 0:
    sample_instances.extend(
        df_preprocessed['tweet_text'].sample(n_missing, random_state=SAMPLE_SEED).tolist()
    )




In [None]:
y = df_preprocessed['cyberbullying_type']

# Seed for the instance sampling: later cells refer to instances by position
# (e.g. the 5th and 8th), which is only meaningful if the draw is reproducible.
SAMPLE_SEED = 42

# 10 instances from the whole dataset; at least one per category
sample_instances = []
for category in y.unique():
    sample = df_preprocessed[df_preprocessed['cyberbullying_type'] == category].sample(1, random_state=SAMPLE_SEED)
    sample_instances.append(sample['tweet_text'].values[0])

# If we have fewer than 10 categories, fill the rest with random samples.
# They are drawn in one call: repeating a seeded single-row draw would return the same row each time.
n_missing = 10 - len(sample_instances)
if n_missing > 0:
    sample_instances.extend(
        df_preprocessed['tweet_text'].sample(n_missing, random_state=SAMPLE_SEED).tolist()
    )

In [None]:
from sklearn.metrics import accuracy_score

# Score the trained SVC on the held-out test split
y_pred = svc.predict(X_test_count)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the accuracy with four decimals
print(f"Accuracy of the SVC classifier: {accuracy:.4f}")


In [None]:
 # Generate LIME explanations
# One explanation per sampled tweet. Every class is requested as a "top label",
# so the class with the highest predicted probability is always covered.
lime_explanations = [
    explainer.explain_instance(
        instance,
        custom_predict,
        num_features=15,
        top_labels=len(explainer.class_names),
    )
    for instance in sample_instances
]


In [None]:
# Display LIME explanations for all instances in sample_instances
for i, (instance, exp) in enumerate(zip(sample_instances, lime_explanations)):
    # Predict the class using the SVC classifier for the instance
    predicted_class = svc.predict(count_vectorizer.transform([instance]))[0]

    # Fetch the ground-truth label from the dataframe.
    # NOTE(review): the lookup is by text, so if several rows share the same
    # preprocessed text the label of the first matching row is reported.
    actual_label = df_preprocessed[df_preprocessed['tweet_text'] == instance]['cyberbullying_type'].values[0]

    # Index of the class with the highest probability stored on the explanation
    lime_probs = exp.predict_proba
    top_class_index = lime_probs.argmax()
    # The probabilities come from svc.predict_proba, whose columns follow
    # svc.classes_; naming the class from there is correct regardless of the
    # order of the explainer's class_names list.
    top_class = svc.classes_[top_class_index]

    # Display the information
    print(f"Instance {i+1}:")
    print("Original Text:", instance)
    print("Predicted Class by SVC:", predicted_class)
    print("Actual Label:", actual_label)
    print("Class with Highest LIME Predicted Probability:", top_class)
    print("\nLIME Explanation for Class:", top_class)

    # Display the LIME explanation matching the class with the highest predicted probability
    exp.show_in_notebook(labels=(top_class_index,), text=True)
    print("\n" + "="*50 + "\n")



In [None]:
# Alternative implementation with at most 4000 features in the CountVectorizer, to reduce SHAP's execution time
from sklearn.svm import SVC
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
# Create another CountVectorizer with limited features
limited_count_vectorizer = CountVectorizer(max_features=4000)
# Fit it on the whole preprocessed tweet column and build the corresponding count matrix
X_count_limited = limited_count_vectorizer.fit_transform(df_preprocessed['tweet_text'])
# Labels taken from the same frame as the texts above
y_count_limited = df_preprocessed['cyberbullying_type']

In [None]:
# Hold out 20% of the limited-feature data for testing (fixed seed for a repeatable split)
(
    X_train_count_limited,
    X_test_count_limited,
    y_train_limited,
    y_test_limited,
) = train_test_split(
    X_count_limited,
    y_count_limited,
    test_size=0.2,
    random_state=42,
)

# Second SVC, trained on the limited feature set; probability=True enables predict_proba
svc_limited = SVC(probability=True)
svc_limited.fit(X_train_count_limited, y_train_limited)


In [None]:
from sklearn.metrics import accuracy_score

# Test-set predictions of the limited-feature SVC
y_pred_limited = svc_limited.predict(X_test_count_limited)

# Accuracy on the held-out split, reported with two decimals
accuracy = accuracy_score(y_true=y_test_limited, y_pred=y_pred_limited)
print(f"Accuracy with limited features: {accuracy:.2f}")


In [None]:

# Using the previously initialized limited_count_vectorizer and svc_limited

# Custom prediction function for LIME with limited features
def custom_predict_limited(texts):
    """Return the class probabilities of `svc_limited` for an iterable of raw texts.

    The texts are vectorised with `limited_count_vectorizer` before scoring.
    """
    transformed_data = limited_count_vectorizer.transform(texts)
    return svc_limited.predict_proba(transformed_data)

# LIME for limited features. The class names must follow the column order of
# `svc_limited.predict_proba` (i.e. `svc_limited.classes_`); the order of first
# appearance in the label column would mislabel the explained classes.
explainer_limited = LimeTextExplainer(class_names=list(svc_limited.classes_))

# Generate LIME explanations using the same sample_instances for the limited features
lime_explanations_limited = []
for instance in sample_instances:  # Using the same sample_instances
    exp = explainer_limited.explain_instance(instance, custom_predict_limited, num_features=15, top_labels=len(explainer_limited.class_names))
    lime_explanations_limited.append(exp)

# Display LIME explanations for all instances in sample_instances
for i, instance in enumerate(sample_instances):  # Again, using the same sample_instances
    exp = lime_explanations_limited[i]

    # Predict the class using the SVC classifier with limited features for the instance
    predicted_class = svc_limited.predict(limited_count_vectorizer.transform([instance]))[0]

    # Fetch the ground-truth label from the dataframe.
    # NOTE(review): the lookup is by text; with duplicate texts the first matching row wins.
    actual_label = df_preprocessed[df_preprocessed['tweet_text'] == instance]['cyberbullying_type'].values[0]

    # Class with the highest probability stored on the explanation
    lime_probs = exp.predict_proba
    top_class_index = lime_probs.argmax()
    top_class = explainer_limited.class_names[top_class_index]

    # Display the information
    print(f"Instance {i+1}:")
    print("Original Text:", instance)
    print("Predicted Class by SVC with limited features:", predicted_class)
    print("Actual Label:", actual_label)
    print("Class with Highest LIME Predicted Probability:", top_class)
    print("\nLIME Explanation for Class:", top_class)

    # Display the LIME explanation matching the class with the highest predicted probability
    exp.show_in_notebook(labels=(top_class_index,), text=True)
    print("\n" + "="*50 + "\n")


In [None]:
# Prediction callback handed to LIME for the limited-feature model
def custom_predict_limited(texts):
    """Vectorise `texts` with `limited_count_vectorizer` and return `svc_limited`'s class probabilities."""
    return svc_limited.predict_proba(limited_count_vectorizer.transform(texts))

In [None]:
# List every sampled tweet with its 1-based position, each followed by a separator line
print("Sample Instances:")
for position, instance in enumerate(sample_instances, start=1):
    print(f"Instance {position}:\n{instance}\n" + "="*50)


In [None]:
from sklearn.metrics import accuracy_score

# Predictions of the limited-feature SVC for the held-out split
y_pred = svc_limited.predict(X_test_count_limited)

# Share of correct predictions, reported with two decimals
accuracy = accuracy_score(y_true=y_test_limited, y_pred=y_pred)
print(f"Accuracy of the SVC with limited features: {accuracy:.2f}")


In [None]:
import shap
import signal

# Time limit for the SHAP computation, in seconds (2000 s is roughly 33 minutes).
# This is the value the alarm was actually armed with; the former "10 minutes"
# comment did not match it.
SHAP_TIMEOUT_SECONDS = 2000

# Function to handle the alarm timeout
def handler(signum, frame):
    """SIGALRM handler: abort the running computation by raising TimeoutError."""
    raise TimeoutError("SHAP computation took too long!")

# Set the signal function handler
signal.signal(signal.SIGALRM, handler)

# Arm the alarm
signal.alarm(SHAP_TIMEOUT_SECONDS)

try:
    # Initialize the SHAP explainer with 20 sampled training rows as background data
    explainer_shap = shap.KernelExplainer(svc_limited.predict_proba, shap.sample(X_train_count_limited, 20))

    # Select the fifth instance
    selected_instance = [sample_instances[4]]
    selected_features = limited_count_vectorizer.transform(selected_instance)

    # Get SHAP values for the selected instance
    shap_values = explainer_shap.shap_values(selected_features)

    predicted_class = svc_limited.predict(selected_features)[0]
    # NOTE(review): the label is looked up by text; with duplicate texts the first matching row wins.
    actual_label = df_preprocessed[df_preprocessed['tweet_text'] == selected_instance[0]]['cyberbullying_type'].values[0]

    # Explain the class the model considers most likely (the same choice the LIME
    # cells make) instead of a hard-coded class index 1.
    class_index = int(svc_limited.predict_proba(selected_features)[0].argmax())

    print("Instance 5:")
    print("Original Text:", selected_instance[0])
    print("Predicted Class by SVC with limited features:", predicted_class)
    print("Actual Label:", actual_label)

    # Display the SHAP force plot.
    # Assumes shap_values is a list with one array per class, as the original
    # indexing did - TODO confirm for the installed SHAP version.
    shap.force_plot(explainer_shap.expected_value[class_index], shap_values[class_index][0], feature_names=limited_count_vectorizer.get_feature_names_out(), matplotlib=True)

except TimeoutError:
    print("Computation took too long and was terminated.")

finally:
    # Always cancel the alarm, even if an unexpected error interrupted the block,
    # so a pending SIGALRM cannot fire in a later cell.
    signal.alarm(0)



In [None]:
# Train a new SVC to counter overfitting, then run SHAP again
import signal
from sklearn.metrics import accuracy_score

# Adjusted SVC with regularization and linear kernel
svc_adjusted = SVC(C=0.5, kernel='linear', probability=True)
svc_adjusted.fit(X_train_count_limited, y_train_limited)

# Checking accuracy on test set
y_pred = svc_adjusted.predict(X_test_count_limited)
print("Adjusted SVC Accuracy:", accuracy_score(y_test_limited, y_pred))

# Compute SHAP values for instance 5
import shap

# Function to handle timeout
def handler(signum, frame):
    """SIGALRM handler: abort the running computation by raising TimeoutError."""
    raise TimeoutError("SHAP computation took too long!")

signal.signal(signal.SIGALRM, handler)
signal.alarm(600)  # 10 minutes

try:
    # Vectorise the fifth sampled tweet once and reuse it below
    adjusted_features = limited_count_vectorizer.transform([sample_instances[4]])

    explainer_shap_adjusted = shap.KernelExplainer(svc_adjusted.predict_proba, shap.sample(X_train_count_limited, 40))
    shap_values_adjusted = explainer_shap_adjusted.shap_values(adjusted_features)

    # Explain the class the adjusted model considers most likely instead of a hard-coded class index 1
    class_index = int(svc_adjusted.predict_proba(adjusted_features)[0].argmax())

    # Assumes shap_values_adjusted is a list with one array per class, as the
    # original indexing did - TODO confirm for the installed SHAP version.
    shap.force_plot(explainer_shap_adjusted.expected_value[class_index], shap_values_adjusted[class_index][0], feature_names=limited_count_vectorizer.get_feature_names_out(), matplotlib=True)
except TimeoutError:
    print("Computation took too long and was terminated.")
finally:
    # Cancel the alarm on every exit path so it cannot fire in a later cell
    signal.alarm(0)


In [None]:
# Compare the execution time of LIME and SHAP, using instances 5 and 8 as examples
import time
import pandas as pd

# Instances to focus on
focus_instances = [4, 7]  # Indices for 5th and 8th instances

# Lists to store execution times
lime_times = []
shap_times = []

# Measure LIME execution time for the focus instances.
# perf_counter() is a monotonic, high-resolution clock meant for measuring durations.
for idx in focus_instances:
    start_time = time.perf_counter()

    instance = sample_instances[idx]
    exp = explainer_limited.explain_instance(instance, custom_predict_limited, num_features=15, top_labels=len(explainer_limited.class_names))

    lime_times.append(time.perf_counter() - start_time)

# Measure SHAP execution time for the focus instances
for idx in focus_instances:
    start_time = time.perf_counter()

    instance = [sample_instances[idx]]
    shap_values = explainer_shap.shap_values(limited_count_vectorizer.transform(instance))

    shap_times.append(time.perf_counter() - start_time)

# Create a table to compare execution times visually.
# It gets its own name: rebinding `df` here would replace the tweet dataset and
# break any earlier cell that is re-run afterwards.
execution_times_df = pd.DataFrame({
    'Instance': [str(idx + 1) for idx in focus_instances],  # 1-based labels derived from the indices
    'LIME Execution Time (s)': lime_times,
    'SHAP Execution Time (s)': shap_times
})

print(execution_times_df)


In [None]:
# Similarity: how do instances of the same class behave?
# LIME for the 'religion' category. The explanations below are produced with
# custom_predict_limited (svc_limited), so the class names must follow
# svc_limited.classes_, the column order of its predict_proba output.
explainer_religion = LimeTextExplainer(class_names=list(svc_limited.classes_))

# 5 instances for the 'religion' category (seeded so the selection is reproducible)
sample_instances_religion = df_preprocessed[df_preprocessed['cyberbullying_type'] == 'religion'].sample(5, random_state=42)['tweet_text'].tolist()

# Apply LIME to these instances
lime_explanations_religion = []
for instance in sample_instances_religion:
    exp = explainer_religion.explain_instance(instance, custom_predict_limited, num_features=15, top_labels=len(explainer_religion.class_names))
    lime_explanations_religion.append(exp)

# Display LIME explanations for these instances
for i, instance in enumerate(sample_instances_religion):
    exp = lime_explanations_religion[i]

    # Predict with the same model that is being explained (the limited-feature SVC);
    # using the unrestricted `svc` here would report a different model's prediction.
    predicted_class = svc_limited.predict(limited_count_vectorizer.transform([instance]))[0]
    # NOTE(review): the label is looked up by text; with duplicate texts the first matching row wins.
    actual_label = df_preprocessed[df_preprocessed['tweet_text'] == instance]['cyberbullying_type'].values[0]

    # Class with the highest probability stored on the explanation
    lime_probs = exp.predict_proba
    top_class_index = lime_probs.argmax()
    top_class = explainer_religion.class_names[top_class_index]

    # Display the information
    print(f"Instance {i+1}:")
    print("Original Text:", instance)
    print("Predicted Class by SVC:", predicted_class)
    print("Actual Label:", actual_label)
    print("Class with Highest LIME Predicted Probability:", top_class)
    print("\nLIME Explanation for Class:", top_class)

    # Display the LIME explanation matching the class with the highest predicted probability
    exp.show_in_notebook(labels=(top_class_index,), text=True)
    print("\n" + "="*50 + "\n")


In [None]:
# SHAP equivalent of the similarity analysis, for the 'age' category
import shap
import signal

# Set up timeout function to handle long-running SHAP computations
def handler(signum, frame):
    """SIGALRM handler: abort the running computation by raising TimeoutError."""
    raise TimeoutError("SHAP computation took too long!")
signal.signal(signal.SIGALRM, handler)

# The loop below iterates over `sample_instances_age`, which no visible cell
# defines; draw the five 'age' tweets here (seeded for reproducibility) so the
# cell also works on a fresh kernel.
sample_instances_age = df_preprocessed[df_preprocessed['cyberbullying_type'] == 'age'].sample(5, random_state=42)['tweet_text'].tolist()

# Initialize SHAP explainer with the model using limited features
explainer_shap_age = shap.KernelExplainer(svc_limited.predict_proba, shap.sample(X_train_count_limited, 100))

for i, instance in enumerate(sample_instances_age):
    # Set an alarm for 10 minutes
    signal.alarm(600)

    try:
        instance_features = limited_count_vectorizer.transform([instance])

        # Get SHAP values for the instance
        shap_values = explainer_shap_age.shap_values(instance_features)

        predicted_class = svc_limited.predict(instance_features)[0]
        actual_label = 'age'  # Since we're only considering the 'age' category

        # Explain the class the model considers most likely (as the LIME cells do)
        # instead of a hard-coded class index 1.
        class_index = int(svc_limited.predict_proba(instance_features)[0].argmax())

        print(f"Instance {i+1}:")
        print("Original Text:", instance)
        print("Predicted Class by SVC with limited features:", predicted_class)
        print("Actual Label:", actual_label)

        # Display the SHAP force plot.
        # Assumes shap_values is a list with one array per class, as the original
        # indexing did - TODO confirm for the installed SHAP version.
        shap.force_plot(explainer_shap_age.expected_value[class_index], shap_values[class_index][0], feature_names=limited_count_vectorizer.get_feature_names_out(), matplotlib=True)
        print("\n" + "="*50 + "\n")

    except TimeoutError:
        print(f"Computation for instance {i+1} took too long and was terminated.")
        print("\n" + "="*50 + "\n")

    finally:
        # Cancel the alarm on every exit path so it cannot fire during a later step
        signal.alarm(0)


In [None]:
# Number of top features listed per instance
N_TOP_FEATURES = 5

# 5 instances from the category "religion" (seeded so the selection is reproducible)
sample_instances_religion = df_preprocessed[df_preprocessed['cyberbullying_type'] == 'religion'].sample(5, random_state=42)['tweet_text'].tolist()

# Generate LIME explanations using the sample_instances_religion
lime_explanations_religion = []
for instance in sample_instances_religion:
    exp = explainer_limited.explain_instance(instance, custom_predict_limited, num_features=15, top_labels=len(explainer_limited.class_names))
    lime_explanations_religion.append(exp)

# To store the top features for each instance
top_features_list_lime_religion = []

for i, exp in enumerate(lime_explanations_religion):
    # Get the (feature, weight) list for the class with the highest predicted probability
    top_class_index = exp.predict_proba.argmax()
    feature_tuples = exp.as_list(label=top_class_index)

    # Extract only the feature names
    top_features = [feature[0] for feature in feature_tuples[:N_TOP_FEATURES]]
    # Very short tweets can yield fewer features than requested; pad with None so
    # every row has exactly N_TOP_FEATURES entries and the table below can be built.
    top_features += [None] * (N_TOP_FEATURES - len(top_features))
    top_features_list_lime_religion.append(top_features)

    # Display the LIME explanation for the instance
    print(f"Instance {i+1}:")
    exp.show_in_notebook(labels=(top_class_index,), text=True)
    print("\n" + "="*50 + "\n")

# Convert the list of top features into a pandas DataFrame for display
df_top_features_lime_religion = pd.DataFrame(top_features_list_lime_religion, columns=[f"Top Feature {i+1}" for i in range(N_TOP_FEATURES)])
df_top_features_lime_religion.index = [f"Instance {i+1}" for i in range(len(sample_instances_religion))]
display(df_top_features_lime_religion)


In [None]:
# Keep the first entry of sample_instances_religion under its own name
selected_instance_religion = sample_instances_religion[0]


In [None]:
import shap

# New KernelExplainer around svc_limited, with 20 sampled training rows as background data
background_data = shap.sample(X_train_count_limited, 20)
explainer_shap_new = shap.KernelExplainer(svc_limited.predict_proba, background_data)

# First religion sample, wrapped in a list and vectorised once for both calls below
first_instance_religion = [sample_instances_religion[0]]
first_instance_features = limited_count_vectorizer.transform(first_instance_religion)

# SHAP values for the first instance
shap_values_first_instance = explainer_shap_new.shap_values(first_instance_features)

# Summary plot of the SHAP values for the first instance
shap.summary_plot(
    shap_values_first_instance,
    first_instance_features,
    feature_names=limited_count_vectorizer.get_feature_names_out(),
)



In [None]:
# Second religion sample, wrapped in a list and vectorised once for both calls below
second_instance_religion = [sample_instances_religion[1]]
second_instance_features = limited_count_vectorizer.transform(second_instance_religion)

# SHAP values for the second instance
shap_values_second_instance = explainer_shap_new.shap_values(second_instance_features)

# Summary plot of the SHAP values for the second instance
shap.summary_plot(
    shap_values_second_instance,
    second_instance_features,
    feature_names=limited_count_vectorizer.get_feature_names_out(),
)

In [None]:
# Third religion sample, wrapped in a list and vectorised once for both calls below
third_instance_religion = [sample_instances_religion[2]]
third_instance_features = limited_count_vectorizer.transform(third_instance_religion)

# SHAP values for the third instance
shap_values_third_instance = explainer_shap_new.shap_values(third_instance_features)

# Summary plot of the SHAP values for the third instance
shap.summary_plot(
    shap_values_third_instance,
    third_instance_features,
    feature_names=limited_count_vectorizer.get_feature_names_out(),
)

In [None]:
# Fourth religion sample, wrapped in a list and vectorised once for both calls below
fourth_instance_religion = [sample_instances_religion[3]]
fourth_instance_features = limited_count_vectorizer.transform(fourth_instance_religion)

# SHAP values for the fourth instance
shap_values_fourth_instance = explainer_shap_new.shap_values(fourth_instance_features)

# Summary plot of the SHAP values for the fourth instance
shap.summary_plot(
    shap_values_fourth_instance,
    fourth_instance_features,
    feature_names=limited_count_vectorizer.get_feature_names_out(),
)

In [None]:
# Fifth religion sample, wrapped in a list and vectorised once for both calls below
fifth_instance_religion = [sample_instances_religion[4]]
fifth_instance_features = limited_count_vectorizer.transform(fifth_instance_religion)

# SHAP values for the fifth instance
shap_values_fifth_instance = explainer_shap_new.shap_values(fifth_instance_features)

# Summary plot of the SHAP values for the fifth instance
shap.summary_plot(
    shap_values_fifth_instance,
    fifth_instance_features,
    feature_names=limited_count_vectorizer.get_feature_names_out(),
)

In [None]:

# List to store the top 5 features for each instance
top_features_list_shap_religion = []

# List of all SHAP values for instances 1-5
shap_values_all_instances = [
    shap_values_first_instance,
    shap_values_second_instance,
    shap_values_third_instance,
    shap_values_fourth_instance,
    shap_values_fifth_instance,
]

shap_feature_names = limited_count_vectorizer.get_feature_names_out()

for instance_text, instance_shap_values in zip(sample_instances_religion, shap_values_all_instances):
    # Rank features for the class the model considers most likely for this tweet -
    # the same class the LIME top-feature table uses. Indexing with a fixed 0
    # would always rank features for the first class in svc_limited.classes_.
    instance_features = limited_count_vectorizer.transform([instance_text])
    class_index = int(svc_limited.predict_proba(instance_features)[0].argmax())

    # Convert SHAP values to DataFrame for easy manipulation.
    # Assumes the SHAP values are a list with one (1, n_features) array per class,
    # as the original indexing did - TODO confirm for the installed SHAP version.
    shap_df = pd.DataFrame(instance_shap_values[class_index], columns=shap_feature_names)

    # Extract the top 5 features with highest absolute SHAP values
    sorted_shap = shap_df.T.abs().sort_values(by=0, ascending=False)
    top_features = sorted_shap.index[:5].tolist()
    top_features_list_shap_religion.append(top_features)

# Convert the list of top features into a pandas DataFrame for display
df_top_features_shap_religion = pd.DataFrame(top_features_list_shap_religion, columns=[f"Top Feature {i+1}" for i in range(5)])
df_top_features_shap_religion.index = [f"Instance {i+1}" for i in range(len(sample_instances_religion))]
display(df_top_features_shap_religion)


In [None]:
# Comparing single instances under feature perturbation: LIME explanation of instance 15988
# Look up the tweet stored under index label 15988
instance_15988 = df_preprocessed.loc[15988, 'tweet_text']

# Explain it with the limited-feature model, requesting every class as a top label
exp_15988 = explainer_limited.explain_instance(
    instance_15988,
    custom_predict_limited,
    num_features=15,
    top_labels=len(explainer_limited.class_names),
)

# (feature, weight) pairs for the class with the highest predicted probability
top_class_index_15988 = exp_15988.predict_proba.argmax()
feature_tuples_15988 = exp_15988.as_list(label=top_class_index_15988)

# Names of the first five features in that list
top_features_15988 = [name for name, _weight in feature_tuples_15988[:5]]

# Show the text, the LIME visualisation and the feature names
print(f"Instance at index 15988:")
print("Original Text:", instance_15988)
exp_15988.show_in_notebook(labels=(top_class_index_15988,), text=True)
print("\nTop 5 Most Important Features:")
print(top_features_15988)
print("\n" + "="*50 + "\n")


In [None]:
import shap

# Filter instances from the category "religion"
religion_instances = df_preprocessed[df_preprocessed['cyberbullying_type'] == 'religion']

# Use the instances from the "religion" category as background samples for the SHAP explainer.
# NOTE(review): this passes every 'religion' row as background data; KernelExplainer
# run time grows with the background size, so consider shap.sample(...) if this is too slow.
background_samples = limited_count_vectorizer.transform(religion_instances['tweet_text'])

# Initialize the SHAP explainer using the religion instances as background
explainer_shap = shap.KernelExplainer(svc_limited.predict_proba, background_samples)

# Extract the instance at index 15988 from the dataframe
instance_15988 = df_preprocessed.loc[15988, 'tweet_text']
features_15988 = limited_count_vectorizer.transform([instance_15988])

# Get SHAP values for instance_15988
shap_values_15988 = explainer_shap.shap_values(features_15988)

# Column index of the class with the highest predicted probability.
# svc_limited.predict() returns the label string, which cannot be used to index
# expected_value / shap_values; the argmax of predict_proba is an integer index.
class_index = int(svc_limited.predict_proba(features_15988)[0].argmax())

# shap.waterfall_plot / shap.plots.waterfall expects a single-row Explanation
# object rather than separate expected-value / SHAP-value arguments.
# Assumes shap_values_15988 is a list with one array per class, as the original
# indexing did - TODO confirm for the installed SHAP version.
explanation_15988 = shap.Explanation(
    values=shap_values_15988[class_index][0],
    base_values=explainer_shap.expected_value[class_index],
    data=features_15988.toarray()[0],
    feature_names=list(limited_count_vectorizer.get_feature_names_out()),
)

# Display the waterfall plot for instance_15988 for the specific class
shap.plots.waterfall(explanation_15988)


In [None]:
# `results_tfidf` is not defined by any visible cell, so evaluating it raised a NameError.
# NOTE(review): presumably the TF-IDF rows of the comparison table were meant - confirm.
results_tfidf = results_df[results_df['Vectorizer'] == 'TF-IDF']
results_tfidf