# Baseline Logistic Regression Model

**Gathering Data**

In [1]:
# Add all imports related to data engineering
import json
import numpy as np
import pandas as pd

In [2]:
# Import the training json file, then close the file
file = open("raw_data/train.json")
raw_data = json.load(file)
file.close()

10


In [3]:
# Got the txt file from here: https://gist.github.com/deekayen/4148741

# Common words List to hold common words
common_word_list = ["\n\n", "\n", '.', 'for', 'and', 'to', 'of', 'with', 'are', 'on', 'a','the', 'i', 'at', 'was', 'he', 'she', 'in', 'it', 'as', 'with']

In [4]:
# A function to get rid of the 1000 most common used words
def common_word_drop(token_list, whitespace_list, label_list):
    for word in common_word_list:
        if word in token_list:
            indices = [i for i, x in enumerate(token_list) if x.lower() == word]
            token_list = [token_list[i] for i in range(len(token_list)) if i not in indices]
            whitespace_list = [whitespace_list[i] for i in range(len(whitespace_list)) if i not in indices]
            label_list = [label_list[i] for i in range(len(label_list)) if i not in indices]

    return token_list, whitespace_list, label_list

In [5]:
def engineer_data_for_model(data):

    # Get the first value from the data
    first_doc = data[0]

    # Get rid of common words
    tokens, white_spaces, labels = common_word_drop(first_doc['tokens'], first_doc['trailing_whitespace'], first_doc['labels'])

    # Create the initial dataframe from the above data
    first_doc_data = {
        "tokens": tokens,
        "trailing_whitespaces": white_spaces,
        "capitalized first char": [True if label[0].isupper() else False for label in tokens],
        "token length": [len(token) for token in tokens],
        "is_numeric": [True if token.isnumeric() else False for token in tokens],
        "PII label": [False if label == "O" else True for label in labels]
    }
    raw_df = pd.DataFrame(first_doc_data)

    # Loop till the end of the data
    for document in data[1: len(data) - 1]:

        # Get rid of common words
        tokens, white_spaces, labels = common_word_drop(first_doc['tokens'], first_doc['trailing_whitespace'], first_doc['labels'])

        # Collect the data in the same way
        doc_data = {
            "tokens": tokens,
            "trailing_whitespaces": white_spaces,
            "capitalized first char": [True if label[0].isupper() else False for label in tokens],
            "token length": [len(token) for token in tokens],
            "is_numeric": [True if token.isnumeric() else False for token in tokens],
            "PII label": [False if label == "O" else True for label in labels]
        }
        df = pd.DataFrame(doc_data)

        # Concatenate all the data into one single dataframe
        raw_df = pd.concat([raw_df, df], ignore_index=True, sort=False)

    # Return the concatenated dataframe
    return raw_df


In [6]:
# Get the training data
training_data = engineer_data_for_model(raw_data[0:100])
training_data

Unnamed: 0,tokens,trailing_whitespaces,capitalized first char,token length,is_numeric,PII label
0,Design,True,True,6,False,False
1,Thinking,True,True,8,False,False
2,innovation,True,False,10,False,False
3,reflexion,False,False,9,False,False
4,-,False,False,1,False,False
...,...,...,...,...,...,...
52069,Mind,True,True,4,False,False
52070,Map,True,True,3,False,False
52071,Shared,True,True,6,False,False
52072,facilities,True,False,10,False,False


**One-hot Encoding**

In [None]:
x_train = []
y_train = []
x_test = []
y_test = []

**Model**

In [None]:
# import needed classes
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import Text

# instantiate the model (using the default parameters)
logreg = LogisticRegression(random_state=16)

# fit the model with data
logreg.fit(x_train, y_train)

y_pred = logreg.predict(x_test)

In [None]:

cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

In [None]:
class_names = [0,1]

fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

Text(0.5,257.44,'Predicted label')

In [None]:
target_names = ['Non-PII-Data', 'PII-Data']

print(classification_report(y_test, y_pred, target_names=target_names))

In [None]:
y_pred_proba = logreg.predict_proba(x_test)[::,1]

fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()