In [84]:

from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from simplemma import lemmatize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

def preprocess_text(text: str):
    """Lower-case, gender-neutralize and lemmatize a document.

    The text is lower-cased and split on whitespace. Tokens that exactly
    equal one of the gendered words below are swapped for a neutral term,
    tokens of two characters or fewer are dropped, and each remaining token
    is passed through ``lemmatize(..., lang="en")``.

    Note: matching is on whole whitespace-delimited tokens, so a gendered
    word with punctuation attached (e.g. ``"he,"``) is left unchanged.

    Args:
        text: Raw document text.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    neutral_terms = {
        "he": "applicant",
        "she": "applicant",
        "husband": "spouse",
        "wife": "spouse",
    }

    # Substitute first so that short gendered tokens such as "he" become
    # "applicant" before the length filter below can discard them.
    tokens = [neutral_terms.get(token, token) for token in text.lower().split()]

    lemmas = []
    for token in tokens:
        if len(token) > 2:
            lemmas.append(lemmatize(token, lang="en"))

    return " ".join(lemmas)



# Default location of the per-guideline CSV files. This is the original
# machine-specific path, kept as the default so existing calls behave the
# same; pass ``data_dir`` to ``create_lr_model`` to load from elsewhere.
DEFAULT_DATA_DIR = "C:\\Users\\11jul\\Documents\\security-clearance-classification\\src\\data"


def create_lr_model(
    guideline: str,
    test_size: float = 0.2,
    data_dir: str = DEFAULT_DATA_DIR,
    random_state: int = 42,
):
    """Train a TF-IDF + logistic-regression pipeline for one guideline.

    Reads ``formal_finding_results_guideline_<GUIDELINE>.csv`` from
    ``data_dir``, splits it into train and test partitions, and fits the
    pipeline on the training partition's ``"text"`` and ``"label"`` columns.

    Args:
        guideline: Guideline identifier; upper-cased to build the file name.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        data_dir: Directory containing the guideline CSV files.
        random_state: Seed forwarded to ``train_test_split`` and
            ``LogisticRegression`` so repeated runs give the same split and
            model. Pass ``None`` to restore unseeded behaviour.

    Returns:
        A ``(model, train_data, test_data)`` tuple: the fitted ``Pipeline``
        and the two DataFrame partitions produced by the split.
    """
    csv_path = (
        Path(data_dir)
        / f"formal_finding_results_guideline_{guideline.upper()}.csv"
    )
    data = pd.read_csv(csv_path)

    # Seeded split: without a seed the held-out set (and every metric or
    # coefficient derived from it) changes on each execution.
    train_data, test_data = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    model = Pipeline(
        [
            (
                "tfidf",
                TfidfVectorizer(
                    ngram_range=(1, 3),
                    stop_words="english",
                    preprocessor=preprocess_text,
                    min_df=0.01,
                    max_df=0.9,
                ),
            ),
            (
                "clf",
                LogisticRegression(
                    class_weight="balanced",
                    penalty='l2',
                    solver="liblinear",
                    random_state=random_state,
                ),
            ),
        ]
    )

    model.fit(train_data["text"], train_data["label"])

    return model, train_data, test_data

# Guidelines to train a model for. The list mirrors the original cell, which
# has no entry for "L" -- presumably no data file exists for it; TODO confirm.
GUIDELINES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "M"]

# Fitted pipeline, training partition and test partition, keyed by guideline.
lr, train_data, test_data = {}, {}, {}

# Looping keeps each dictionary key identical to the guideline that was
# loaded. The hand-written assignments stored guideline I's model under "H"
# (overwriting H's model) and a guideline J model under "I".
for guideline in GUIDELINES:
    lr[guideline], train_data[guideline], test_data[guideline] = create_lr_model(
        guideline
    )




In [85]:
# Write one CSV per guideline listing every TF-IDF feature with its
# logistic-regression weight, sorted from lowest to highest coefficient.
for guideline, model in lr.items():
    feature_names = model["tfidf"].get_feature_names_out()
    # Only the first row of coef_ is exported (assumes a binary label, where
    # coef_ has a single row -- TODO confirm).
    weights = model["clf"].coef_[0]

    coefficients = pd.DataFrame({"feature": feature_names, "coef": weights})
    coefficients = coefficients.set_index("feature").sort_values("coef")
    coefficients.to_csv(f"{guideline}_lr_model.csv")

In [89]:
# Show the raw text of the first row in guideline A's test partition.
print(test_data["A"]["text"].values[0])

In his Answer to the SOR, Applicant admitted the sole allegation under Guideline
A  (SOR  ¶  1.a)  with  an  explanation,  and,  by  reference,  the  cross-allegation  under
Guideline  E  (SOR  ¶  2.a).  He  admitted  SOR  ¶¶  2.b  –  2.g,  also  with  explanations.  I
incorporate  his admissions  and  explanations  into  the  findings  of  fact.  After  a  thorough
and  careful  review  of  the  pleadings  and  the  record  evidence  submitted,  I  make  the
following  additional  findings  of  fact.  (I  also  incorporate  my  factual  findings  about  the
Government’s evidence, discussed above.)
Applicant is 43 years old. He graduated from high school in 1994. (Tr. 147; GE 1)
He has never married but has a longtime cohabitant. They have three children, ages 17,
15, and 13. (Tr. 126-127) He previously worked for a government contractor for several
years,  until  2001,  when  he  was  injured  on  the  job.  He  received  workmen’s
compensation, and he was unemployed from September 20

In [90]:
# Show the same guideline A test document after preprocess_text is applied.
print(preprocess_text(test_data["A"]["text"].values[0]))

his answer the sor, applicant admit the sole allegation under guideline (sor 1.a) with explanation, and, reference, the cross-allegation under guideline (sor 2.a). applicant admit sor 2.b 2.g, also with explanations. incorporate his admission and explanation into the finding fact. after thorough and careful review the pleading and the record evidence submitted, make the following additional finding fact. also incorporate factual finding about the government’s evidence, discuss above.) applicant year old. applicant graduate from high school 1994. (tr. 147; applicant have never marry but have longtime cohabitant. they have three children, age 17, 15, and 13. (tr. 126-127) applicant previously work for government contractor for several years, until 2001, when applicant be injure the job. applicant receive workmen’s compensation, and applicant be unemploy from september 2001 october 2013. during that time applicant be “stay-at-home dad” raise his children. (ge 13; tr. 129-131, 141) applica