In [1]:
# Vulnerability Detection Script

import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from joblib import dump, load
import os
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stopword_set = set(stopwords.words('english'))

# Load non-vulnerable whitelist terms
with open("nvw.txt", "r") as f:
    non_vulnerable_whitelist = set(line.strip().lower() for line in f if line.strip())

# ---------------------- Part 1: Load & Prepare Data ----------------------

def load_training_data():
    with open("improved_positive_training_data.txt", "r") as pos_file:
        positive_sentences = [line.strip() for line in pos_file if line.strip()]
    with open("negative_training_data (2).txt", "r") as neg_file:
        negative_sentences = [line.strip() for line in neg_file if line.strip()]

    data = positive_sentences + negative_sentences
    labels = [1]*len(positive_sentences) + [0]*len(negative_sentences)
    return data, labels

# ---------------------- Part 2: Logistic Regression with Gradient Descent ----------------------

class CustomLogisticRegression:
    def _init_(self, lr=0.1, epochs=1000):
        self.lr = lr
        self.epochs = epochs

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        self.theta = np.zeros(X.shape[1])
        for _ in range(self.epochs):
            z = np.dot(X, self.theta)
            h = self.sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / y.size
            self.theta -= self.lr * gradient

    def predict_proba(self, X):
        return self.sigmoid(np.dot(X, self.theta))

    def predict(self, X):
        return self.predict_proba(X) >= 0.5

# ---------------------- Part 3: Boyer-Moore Pattern Matching ----------------------

def preprocess_patterns(file_path):
    df = pd.read_csv(file_path, header=None)
    return df[0].str.lower().tolist()

def bad_char_heuristic(pattern):
    bad_char = [-1]*256
    for i in range(len(pattern)):
        bad_char[ord(pattern[i])] = i
    return bad_char

def suffixes(pattern):
    m = len(pattern)
    suff = [0] * m
    suff[m-1] = m
    g = m - 1
    f = 0
    for i in range(m-2, -1, -1):
        if i > g and suff[i+m-1-f] < i - g:
            suff[i] = suff[i+m-1-f]
        else:
            g = min(g, i)
            f = i
            while g >= 0 and pattern[g] == pattern[g+m-1-f]:
                g -= 1
            suff[i] = f - g
    return suff

def good_suffix_heuristic(pattern):
    m = len(pattern)
    suff = suffixes(pattern)
    good_suffix = [m]*m
    j = 0
    for i in range(m-1, -1, -1):
        if suff[i] == i+1:
            for j in range(m-1-i):
                if good_suffix[j] == m:
                    good_suffix[j] = m-1-i
    for i in range(m-1):
        good_suffix[m-1-suff[i]] = m-1-i
    return good_suffix

def boyer_moore_search(text, pattern):
    m = len(pattern)
    n = len(text)
    if m == 0:
        return False

    bad_char = bad_char_heuristic(pattern)
    good_suffix = good_suffix_heuristic(pattern)

    s = 0
    while s <= n - m:
        j = m - 1
        while j >= 0 and pattern[j] == text[s + j]:
            j -= 1
        if j < 0:
            return True
        else:
            s += max(good_suffix[j], j - bad_char[ord(text[s + j])])
    return False

def find_vulnerable_terms(input_text, terms):
    input_text_lower = input_text.lower()
    words = re.findall(r'\b\w+\b', input_text_lower)
    found = []
    for term in terms:
        if (
            term in words and
            term not in stopword_set and
            term not in non_vulnerable_whitelist
        ):
            found.append(term)
    return found





# ---------------------- Part 4: Masking Utility ----------------------

def mask_word(word):
    if len(word) <= 2:
        return "*" * len(word)
    return word[0] + "*" * (len(word)-2) + word[-1]

def mask_text(text, words):
    for word in words:
        pattern = re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE)
        text = pattern.sub(mask_word(word), text)
    return text

# ---------------------- Part 5: Runtime Interface ----------------------

def run_system():
    data, labels = load_training_data()
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
    X = vectorizer.fit_transform(data)
    y = np.array(labels)

    clf = LogisticRegression(max_iter=1000)
    clf.fit(X, y)

    blood_terms = preprocess_patterns("blood_relations.csv")

    while True:
        input_text = input("\nEnter a sentence (or type 'exit'): ").strip()
        if input_text.lower() == 'exit':
            break

        # Step 1: Check with Boyer-Moore
        matched = find_vulnerable_terms(input_text, blood_terms)
        if matched:
            print(f"Alert: This message contains {len(matched)} vulnerable word(s): {', '.join(matched)}")
            if input("Do you want to mask them? (y/n): ").lower() == 'y':
                print("Masked Output:", mask_text(input_text, matched))
            continue

        # Step 2: Use ML if no term matched
        vec_input = vectorizer.transform([input_text]).toarray()
        pred = clf.predict(vec_input)
        if pred[0] == 1:
            print("Model detected potential vulnerability in the message.")
            words = re.findall(r'\b\w+\b', input_text.lower())
            new_terms = [w for w in words if w not in blood_terms and w not in stopword_set and w not in non_vulnerable_whitelist]
            for word in new_terms:
                choice = input(f"Do you want to treat '{word}' as a vulnerable term and mask it? (y/n): ").lower()
                if choice == 'y':
                    print("Masked Output:", mask_text(input_text, [word]))
                    blood_terms.append(word)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
run_system()


Enter a sentence (or type 'exit'):  "Papa and Mumma love spending weekends with Bhai, Behn, Dadaji, Dadiji, Chachu, Chachi, Mamu, Mami, and all the cousins, where we share stories, eat together, laugh loudly, click photos, and create beautiful memories that keep our family bonds strong, making every moment spent with loved ones priceless and full of happiness."


Alert: This message contains 8 vulnerable word(s): bhai, papa, mumma, mamu, chachi, chachu, chachi, love


Do you want to mask them? (y/n):  y


Masked Output: "p**a and m***a l**e spending weekends with b**i, Behn, Dadaji, Dadiji, c****u, c****i, m**u, Mami, and all the cousins, where we share stories, eat together, laugh loudly, click photos, and create beautiful memories that keep our family bonds strong, making every moment spent with loved ones priceless and full of happiness."



Enter a sentence (or type 'exit'):  "Papa and Mumma love spending weekends with Bhai, Behen, Dada, Dadi, Chachu, Chachi, Mamu, Maami, and all the cousins, where we share stories, eat together, laugh loudly, click photos, and create beautiful memories that keep our family bonds strong, making every moment spent with loved ones priceless and full of happiness."


Alert: This message contains 12 vulnerable word(s): bhai, behen, papa, mumma, mamu, chachi, maami, dada, dadi, chachu, chachi, love


Do you want to mask them? (y/n):  y


Masked Output: "p**a and m***a l**e spending weekends with b**i, b***n, d**a, d**i, c****u, c****i, m**u, m***i, and all the cousins, where we share stories, eat together, laugh loudly, click photos, and create beautiful memories that keep our family bonds strong, making every moment spent with loved ones priceless and full of happiness."



Enter a sentence (or type 'exit'):  exit
