In [1]:
## Anonymizing Data for Machine Learning Models

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Sample dataset with sensitive information
data = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'SSN': ['123-45-6789', '987-65-4321', '456-78-9012', '321-54-9876', '654-32-1098'],
    'Age': [25, 45, 35, 50, 29],
    'Income': [50000, 80000, 60000, 100000, 55000],
    'Defaulted': [0, 1, 0, 1, 0]  # Target variable: 1 = Defaulted, 0 = Not Defaulted
})

print("Original Data:")
print(data)

Original Data:
      Name          SSN  Age  Income  Defaulted
0    Alice  123-45-6789   25   50000          0
1      Bob  987-65-4321   45   80000          1
2  Charlie  456-78-9012   35   60000          0
3    David  321-54-9876   50  100000          1
4      Eva  654-32-1098   29   55000          0


In [4]:
# Anonymize sensitive data
data['Anonymized_ID'] = range(1, len(data) + 1)  # Assign unique IDs
anonymized_data = data.drop(columns=['Name', 'SSN'])  # Remove sensitive columns

print("\nAnonymized Data:")
print(anonymized_data)


Anonymized Data:
   Age  Income  Defaulted  Anonymized_ID
0   25   50000          0              1
1   45   80000          1              2
2   35   60000          0              3
3   50  100000          1              4
4   29   55000          0              5


In [5]:
# Splitting features and target
X = anonymized_data[['Age', 'Income']]
y = anonymized_data['Defaulted']

# Splitting into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [6]:
# Training the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)
# Evaluating the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.5

Classification Report:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
# Verify that sensitive data was removed
if 'Name' not in anonymized_data.columns and 'SSN' not in anonymized_data.columns:
    print("\nData Privacy Compliance: Sensitive information successfully removed!")
else:
    print("\nWarning: Sensitive information is still present!")


Data Privacy Compliance: Sensitive information successfully removed!
