In [1]:

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
import json

In [3]:
dataList = []
sentences = []
labels = []
# Stopwords should be removed or excluded from the given text so that more
# focus can be given to those words which define the meaning of the text.
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

def loadDataset(filename):
  d=[]
  with open(filename, 'r') as f:
      datastore = json.load(f)
  for item in datastore:
    sentence = item['data']
    label = item['is_sensitive']
    for word in stopwords: #Remove stop words in sentence
      token = " " + word + " "
      sentence = sentence.replace(token, " ")
    d.append([sentence,label])
  return d

In [4]:
# Load sensitive and non-sensitive data from JSON files
sen = loadDataset('SensitiveDataset.json')
nonsen = loadDataset('NonSensitiveDatasetnew.json')

In [5]:
import pandas as pd
sen=pd.DataFrame(data=sen,columns=['sentences','labels'])
nonsen=pd.DataFrame(data=nonsen,columns=['sentences','labels'])
df= pd.concat([sen, nonsen], axis=0)
df=df.sample(frac=1).reset_index(drop=True)
df_sen=df[df['labels']==1]
df_non=df[df['labels']==0]
df_non_downsampled = df_non.sample(15000)
df_sen_downsampled=df_sen.sample(10000)
df_balanced = pd.concat([df_non_downsampled, df_sen_downsampled])

# Splitting into features (X) and labels (y)
X = df_balanced['sentences'].values
y = df_balanced['labels'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [7]:
# Create a Random Forest classifier
classifier = RandomForestClassifier(n_estimators=1)
# Train the classifier
classifier.fit(X_train, y_train)
# Predict the labels for the test data
y_pred = classifier.predict(X_test)
# Print the classification report
print(classification_report(y_test, y_pred))
print("Accuracy: ",accuracy_score(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4502
           1       1.00      1.00      1.00      2998

    accuracy                           1.00      7500
   macro avg       1.00      1.00      1.00      7500
weighted avg       1.00      1.00      1.00      7500

Accuracy:  0.9990666666666667


In [8]:
from sklearn.tree import DecisionTreeClassifier
# Create a Decision Tree classifier
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0, max_depth=3)
# Train the classifier
classifier.fit(X_train, y_train)
# Predict the labels for the test data
y_pred = classifier.predict(X_test)
# Print the classification report
print(classification_report(y_test, y_pred))
print("Accuracy: ",accuracy_score(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4502
           1       0.98      0.98      0.98      2998

    accuracy                           0.98      7500
   macro avg       0.98      0.98      0.98      7500
weighted avg       0.98      0.98      0.98      7500

Accuracy:  0.984


In [9]:
# prompt: Now make user enter his attributes in a list and get seperate outputs for both above models and respective label for each attribute

# Get user input for attributes
attributes = input("Enter your attributes separated by commas: ").split(",")

# Transform user input into a list of strings
attributes_str = [str(attribute) for attribute in attributes]

# Convert user input into a vectorized format
attributes_vec = vectorizer.transform(attributes_str)

# Predict the labels for the user input using Random Forest classifier
rf_predictions = classifier.predict(attributes_vec)

# Predict the labels for the user input using Decision Tree classifier
dt_predictions = classifier.predict(attributes_vec)

# Print the predictions for each model
print("Random Forest Predictions:", rf_predictions)
print("Decision Tree Predictions:", dt_predictions)

# Print the label for each prediction
for i in range(len(rf_predictions)):
  if rf_predictions[i] == 1:
    print("Attribute", attributes[i], "is sensitive (Random Forest)")
  else:
    print("Attribute", attributes[i], "is not sensitive (Random Forest)")

for i in range(len(dt_predictions)):
  if dt_predictions[i] == 1:
    print("Attribute", attributes[i], "is sensitive (Decision Tree)")
  else:
    print("Attribute", attributes[i], "is not sensitive (Decision Tree)")


Random Forest Predictions: [0 1 0 0 0]
Decision Tree Predictions: [0 1 0 0 0]
Attribute Name is not sensitive (Random Forest)
Attribute phone number is sensitive (Random Forest)
Attribute email is not sensitive (Random Forest)
Attribute address is not sensitive (Random Forest)
Attribute occupation is not sensitive (Random Forest)
Attribute Name is not sensitive (Decision Tree)
Attribute phone number is sensitive (Decision Tree)
Attribute email is not sensitive (Decision Tree)
Attribute address is not sensitive (Decision Tree)
Attribute occupation is not sensitive (Decision Tree)
