In [77]:

import os
import re

import matplotlib.pyplot as plt
import pandas as pd
from simplemma import lemmatize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


def create_lr_model(
    guideline: str,
    test_size: float = 0.2,
    random_state=None,
    data_dir=None,
):
    """Train a TF-IDF + logistic-regression classifier for one guideline.

    Reads ``formal_finding_results_guideline_<GUIDELINE>.csv``, splits it into
    train and test partitions, and fits the pipeline on the ``text`` column
    against the ``label`` column of the training partition.

    Args:
        guideline: Guideline identifier; upper-cased to build the CSV file name.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        random_state: Optional seed passed to both ``train_test_split`` and
            ``LogisticRegression``. ``None`` (the default) leaves both unseeded,
            so repeated calls may produce different splits.
        data_dir: Directory containing the guideline CSV files. ``None`` (the
            default) uses the original hard-coded project data directory.

    Returns:
        A ``(model, train_data, test_data)`` tuple: the fitted ``Pipeline`` and
        the two DataFrame partitions produced by the split.
    """
    gender_neutralizing_dict = {
        "he": "applicant",
        "she": "applicant",
        "husband": "spouse",
        "wife": "spouse",
    }
    # Built once per model rather than once per document. Word boundaries make
    # punctuation-adjacent forms ("wife,", "he's", "(she)") match as well,
    # which plain whitespace-token comparison would let through.
    gendered_word_re = re.compile(
        r"\b(" + "|".join(map(re.escape, gender_neutralizing_dict)) + r")\b"
    )

    def preprocess_text(text: str):
        """Lower-case, neutralize gendered words, lemmatize, drop short tokens."""
        neutral_text = gendered_word_re.sub(
            lambda match: gender_neutralizing_dict[match.group(0)],
            text.lower(),
        )
        # Whitespace tokens of length <= 2 are discarded before lemmatization.
        return " ".join(
            lemmatize(word, lang="en")
            for word in neutral_text.split()
            if len(word) > 2
        )

    file_name = f"formal_finding_results_guideline_{guideline.upper()}.csv"
    if data_dir is None:
        # Legacy default location, kept so existing callers behave as before.
        csv_path = (
            "C:\\Users\\11jul\\Documents\\security-clearance-classification"
            "\\src\\data\\" + file_name
        )
    else:
        csv_path = os.path.join(data_dir, file_name)

    data = pd.read_csv(csv_path)

    train_data, test_data = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    model = Pipeline(
        [
            (
                "tfidf",
                TfidfVectorizer(
                    ngram_range=(1, 3),
                    stop_words="english",
                    # A custom preprocessor replaces the vectorizer's own
                    # lower-casing step, so preprocess_text lower-cases itself.
                    preprocessor=preprocess_text,
                    min_df=0.01,
                    max_df=0.9,
                ),
            ),
            (
                "clf",
                LogisticRegression(
                    class_weight="balanced",
                    penalty="l2",
                    solver="liblinear",
                    random_state=random_state,
                ),
            ),
        ]
    )

    model.fit(train_data["text"], train_data["label"])

    return model, train_data, test_data

# Guidelines to model. Each dictionary key must match the guideline whose data
# the model was trained on.
# NOTE(review): "L" is not in this list, mirroring the original cell — confirm
# whether that guideline is intentionally skipped.
GUIDELINES = ("A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "M")

# Fitted pipelines and their train/test partitions, keyed by guideline letter.
lr, train_data, test_data = {}, {}, {}

for guideline in GUIDELINES:
    (
        lr[guideline],
        train_data[guideline],
        test_data[guideline],
    ) = create_lr_model(guideline)




In [79]:
# Write one CSV per guideline listing every TF-IDF feature with its
# logistic-regression coefficient, sorted ascending by coefficient.
for guideline, model in lr.items():
    feature_names = model["tfidf"].get_feature_names_out()
    coefficients = model["clf"].coef_[0]

    coef_table = pd.DataFrame({"feature": feature_names, "coef": coefficients})
    coef_table = coef_table.set_index("feature").sort_values("coef")
    coef_table.to_csv(f"{guideline}_lr_model.csv")

In [75]:
from sklearn.metrics import classification_report

# Evaluate the guideline B model on its held-out partition. The models and
# splits live in the `lr` / `test_data` dictionaries keyed by guideline letter.
print(
    classification_report(
        test_data["B"]["label"],
        lr["B"].predict(test_data["B"]["text"]),
    )
)

              precision    recall  f1-score   support

       False       0.58      0.66      0.62        32
        True       0.69      0.62      0.65        39

    accuracy                           0.63        71
   macro avg       0.63      0.64      0.63        71
weighted avg       0.64      0.63      0.63        71



In [59]:
# Probe the guideline B model with progressively longer hand-written
# statements to see how the predicted class probabilities respond.
lr["B"].predict_proba(["The applicant has contracts in russia",
                       "The applicant has contracts in russia and is a russian citizen",
                       "The applicant has contracts in russia and is a russian citizen and has a russian wife",
                       "The applicant has contracts in russia and is a russian citizen and has a russian wife and has a russian wife",
                       "The applicant has contracts in russia and is a russian citizen and has a russian wife and has a russian wife and has a russian wife",
                       "The applicant has contracts in russia and is a russian citizen and has a russian wife and has a russian wife and has a russian wife and has a russian wife",
                       "The applicant has contracts in russia and is a russian citizen and has a russian wife and has a russian wife and has a russian wife and has a russian wife and has a russian wife",
                       "The applicant has contracts in russia and is a russian citizen and has a russian wife and has a russian wife and has a russian wife and has a russian wife and has a russian wife and has a russian wife",
                       "The applicant has contracts in russia and is a russian citizen"])

array([[0.5, 0.5],
       [0.5, 0.5],
       [0.5, 0.5],
       [0.5, 0.5],
       [0.5, 0.5],
       [0.5, 0.5],
       [0.5, 0.5],
       [0.5, 0.5],
       [0.5, 0.5]])