## Anonymize Data

In [13]:
import pandas as pd
import hashlib

# Load your data
df = pd.read_excel('mobile_customers.xlsx')

### Hash Data

In [14]:
def anonymize_data(data):
    if isinstance(data, str):  # Check if data is a string
        return hashlib.sha256(data.encode()).hexdigest()
    else:
        return data

sensitive_columns = ['customer_id', 'username', 'email']

for column in sensitive_columns:
    df[column] = df[column].apply(anonymize_data)
df.head()

Unnamed: 0.1,Unnamed: 0,customer_id,date_registered,username,name,gender,address,email,birthdate,current_location,residence,employer,job,age,salary,credit_card_provider,credit_card_number,credit_card_security_code,credit_card_expire
0,0,cf96b84be95602a01d470fef6c5775ae9b187421e24906...,2021-09-29,e2a28c51bbb5fb36092eac5ca20da5e95d8345c96139eb...,Jonathan Snyder,M,"24675 Susan Valley\nNorth Dianabury, MO 02475",0993c9ad2c26ea3acbeec4efff4d3cda8ea4eeb14339cf...,1978-03-11,"['78.937112', '71.260464']","195 Brandi Junctions\nNew Julieberg, NE 63410","Byrd, Welch and Holt",Chief Technology Officer,49,53979,VISA 19 digit,38985874269846,994,2023-10-27 00:00:00
1,1,389c7185edc98f251ac1235788b66a03b1e394dddffd24...,2019-08-17,35ad94d094dd85a018a4370706aa9a48c9a9a0b1c4af7d...,Susan Dominguez,F,"4212 Cheryl Inlet\nPort Davidmouth, NC 54884",55dcb55982c87177f1eb7faa8e78b2d7417e7828785360...,1970-11-29,"['-24.1692185', '100.746122']","58272 Brown Isle Apt. 698\nPort Michael, HI 04693",Hurst PLC,Data scientist,43,81510,Discover,6525743622515979,163,2023-07-30 00:00:00
2,2,7a9394657a1fcbe2a525e5f8f3add3329bb0022e39eb19...,2019-11-01,922109f67599e77aa3e8177c5dcab310ddba3b6c775cca...,Corey Hebert,M,"07388 Coleman Prairie\nLake Amy, IA 78695",f7cb16dd4efde776b69477f61bc8f171fdad967e2dd389...,2009-04-23,"['8.019908', '-19.603269']","36848 Jones Lane Suite 282\nMarquezbury, ID 26822","Mora, Caldwell and Guerrero",Chief Operating Officer,47,205345,VISA 16 digit,4010729915028682247,634,2023-04-26 00:00:00
3,3,511d9da9211917b4c3f23ed14854d350edcbfe3a4b48f7...,2021-12-31,99a147d621d2ac9afcbf975d7f142a248d08c4f064ee22...,Latasha Griffin,F,"PSC 6217, Box 2610\nAPO AA 53585",7210be59c4a06de2cd101248493b9843bf378a0933c2aa...,1992-07-27,"['62.497506', '2.717198']","317 Lamb Cape Apt. 884\nLake Amy, DC 79074",Patel PLC,Counselling psychologist,34,116095,VISA 16 digit,4854862659569207844,7957,2023-10-31 00:00:00
4,4,9535dcb56772ba94bea4e23b547c522ae4c667022c07e9...,2020-08-09,8958bc33f4d51668dccb0ce7568da30e7176dc9f52b4b1...,Colleen Wheeler,F,"0325 Potter Roads\nLake Lisashire, NM 77502",6cbda70247614623788c6839ffd898b52faebaa27da31e...,1989-09-16,"['73.7924695', '-80.314720']","21936 Mary Islands\nMendozafort, TN 37124",Smith-Mejia,Mining engineer,57,107529,JCB 16 digit,213152724828217,72,2023-05-28 00:00:00


### Mask Data

In [15]:
def mask_data(data):
    if pd.isnull(data):
        return data
    else:
        return '*' * len(str(data))

mask_columns = ['address', 'residence', 'credit_card_security_code', 'credit_card_number', 'credit_card_expire']

for column in mask_columns:
    df[column] = df[column].apply(mask_data)

df.head()

Unnamed: 0.1,Unnamed: 0,customer_id,date_registered,username,name,gender,address,email,birthdate,current_location,residence,employer,job,age,salary,credit_card_provider,credit_card_number,credit_card_security_code,credit_card_expire
0,0,cf96b84be95602a01d470fef6c5775ae9b187421e24906...,2021-09-29,e2a28c51bbb5fb36092eac5ca20da5e95d8345c96139eb...,Jonathan Snyder,M,********************************************,0993c9ad2c26ea3acbeec4efff4d3cda8ea4eeb14339cf...,1978-03-11,"['78.937112', '71.260464']",********************************************,"Byrd, Welch and Holt",Chief Technology Officer,49,53979,VISA 19 digit,**************,***,*******************
1,1,389c7185edc98f251ac1235788b66a03b1e394dddffd24...,2019-08-17,35ad94d094dd85a018a4370706aa9a48c9a9a0b1c4af7d...,Susan Dominguez,F,*******************************************,55dcb55982c87177f1eb7faa8e78b2d7417e7828785360...,1970-11-29,"['-24.1692185', '100.746122']",************************************************,Hurst PLC,Data scientist,43,81510,Discover,****************,***,*******************
2,2,7a9394657a1fcbe2a525e5f8f3add3329bb0022e39eb19...,2019-11-01,922109f67599e77aa3e8177c5dcab310ddba3b6c775cca...,Corey Hebert,M,****************************************,f7cb16dd4efde776b69477f61bc8f171fdad967e2dd389...,2009-04-23,"['8.019908', '-19.603269']",************************************************,"Mora, Caldwell and Guerrero",Chief Operating Officer,47,205345,VISA 16 digit,*******************,***,*******************
3,3,511d9da9211917b4c3f23ed14854d350edcbfe3a4b48f7...,2021-12-31,99a147d621d2ac9afcbf975d7f142a248d08c4f064ee22...,Latasha Griffin,F,*******************************,7210be59c4a06de2cd101248493b9843bf378a0933c2aa...,1992-07-27,"['62.497506', '2.717198']",*****************************************,Patel PLC,Counselling psychologist,34,116095,VISA 16 digit,*******************,****,*******************
4,4,9535dcb56772ba94bea4e23b547c522ae4c667022c07e9...,2020-08-09,8958bc33f4d51668dccb0ce7568da30e7176dc9f52b4b1...,Colleen Wheeler,F,******************************************,6cbda70247614623788c6839ffd898b52faebaa27da31e...,1989-09-16,"['73.7924695', '-80.314720']",****************************************,Smith-Mejia,Mining engineer,57,107529,JCB 16 digit,***************,**,*******************


### Generalize Data

In [16]:
def generalize_birthdate(birthdate):
    if pd.isnull(birthdate):
        return birthdate
    else:
        return pd.to_datetime(birthdate).year

def generalize_age(age):
    if pd.isnull(age):
        return age
    elif age < 20:
        return 'Under 20'
    elif age < 30:
        return '20-29'
    elif age < 40:
        return '30-39'
    elif age < 50:
        return '40-49'
    else:
        return '50 and above'

def generalize_salary(salary):
    if pd.isnull(salary):
        return salary
    elif salary < 50000:
        return 'Under 50K'
    elif salary < 100000:
        return '50K-100K'
    elif salary < 150000:
        return '100K-150K'
    else:
        return '150K and above'

df['birthdate'] = df['birthdate'].apply(generalize_birthdate)
df['age'] = df['age'].apply(generalize_age)
df['salary'] = df['salary'].apply(generalize_salary)

df.head()

Unnamed: 0.1,Unnamed: 0,customer_id,date_registered,username,name,gender,address,email,birthdate,current_location,residence,employer,job,age,salary,credit_card_provider,credit_card_number,credit_card_security_code,credit_card_expire
0,0,cf96b84be95602a01d470fef6c5775ae9b187421e24906...,2021-09-29,e2a28c51bbb5fb36092eac5ca20da5e95d8345c96139eb...,Jonathan Snyder,M,********************************************,0993c9ad2c26ea3acbeec4efff4d3cda8ea4eeb14339cf...,1978,"['78.937112', '71.260464']",********************************************,"Byrd, Welch and Holt",Chief Technology Officer,40-49,50K-100K,VISA 19 digit,**************,***,*******************
1,1,389c7185edc98f251ac1235788b66a03b1e394dddffd24...,2019-08-17,35ad94d094dd85a018a4370706aa9a48c9a9a0b1c4af7d...,Susan Dominguez,F,*******************************************,55dcb55982c87177f1eb7faa8e78b2d7417e7828785360...,1970,"['-24.1692185', '100.746122']",************************************************,Hurst PLC,Data scientist,40-49,50K-100K,Discover,****************,***,*******************
2,2,7a9394657a1fcbe2a525e5f8f3add3329bb0022e39eb19...,2019-11-01,922109f67599e77aa3e8177c5dcab310ddba3b6c775cca...,Corey Hebert,M,****************************************,f7cb16dd4efde776b69477f61bc8f171fdad967e2dd389...,2009,"['8.019908', '-19.603269']",************************************************,"Mora, Caldwell and Guerrero",Chief Operating Officer,40-49,150K and above,VISA 16 digit,*******************,***,*******************
3,3,511d9da9211917b4c3f23ed14854d350edcbfe3a4b48f7...,2021-12-31,99a147d621d2ac9afcbf975d7f142a248d08c4f064ee22...,Latasha Griffin,F,*******************************,7210be59c4a06de2cd101248493b9843bf378a0933c2aa...,1992,"['62.497506', '2.717198']",*****************************************,Patel PLC,Counselling psychologist,30-39,100K-150K,VISA 16 digit,*******************,****,*******************
4,4,9535dcb56772ba94bea4e23b547c522ae4c667022c07e9...,2020-08-09,8958bc33f4d51668dccb0ce7568da30e7176dc9f52b4b1...,Colleen Wheeler,F,******************************************,6cbda70247614623788c6839ffd898b52faebaa27da31e...,1989,"['73.7924695', '-80.314720']",****************************************,Smith-Mejia,Mining engineer,50 and above,100K-150K,JCB 16 digit,***************,**,*******************


### Remove Unneccessary Columns

In [17]:
# Specify the columns to be removed
remove_columns = ['name', 'employer', 'job']

# Remove the columns
df = df.drop(columns=remove_columns)

df.head()

Unnamed: 0.1,Unnamed: 0,customer_id,date_registered,username,gender,address,email,birthdate,current_location,residence,age,salary,credit_card_provider,credit_card_number,credit_card_security_code,credit_card_expire
0,0,cf96b84be95602a01d470fef6c5775ae9b187421e24906...,2021-09-29,e2a28c51bbb5fb36092eac5ca20da5e95d8345c96139eb...,M,********************************************,0993c9ad2c26ea3acbeec4efff4d3cda8ea4eeb14339cf...,1978,"['78.937112', '71.260464']",********************************************,40-49,50K-100K,VISA 19 digit,**************,***,*******************
1,1,389c7185edc98f251ac1235788b66a03b1e394dddffd24...,2019-08-17,35ad94d094dd85a018a4370706aa9a48c9a9a0b1c4af7d...,F,*******************************************,55dcb55982c87177f1eb7faa8e78b2d7417e7828785360...,1970,"['-24.1692185', '100.746122']",************************************************,40-49,50K-100K,Discover,****************,***,*******************
2,2,7a9394657a1fcbe2a525e5f8f3add3329bb0022e39eb19...,2019-11-01,922109f67599e77aa3e8177c5dcab310ddba3b6c775cca...,M,****************************************,f7cb16dd4efde776b69477f61bc8f171fdad967e2dd389...,2009,"['8.019908', '-19.603269']",************************************************,40-49,150K and above,VISA 16 digit,*******************,***,*******************
3,3,511d9da9211917b4c3f23ed14854d350edcbfe3a4b48f7...,2021-12-31,99a147d621d2ac9afcbf975d7f142a248d08c4f064ee22...,F,*******************************,7210be59c4a06de2cd101248493b9843bf378a0933c2aa...,1992,"['62.497506', '2.717198']",*****************************************,30-39,100K-150K,VISA 16 digit,*******************,****,*******************
4,4,9535dcb56772ba94bea4e23b547c522ae4c667022c07e9...,2020-08-09,8958bc33f4d51668dccb0ce7568da30e7176dc9f52b4b1...,F,******************************************,6cbda70247614623788c6839ffd898b52faebaa27da31e...,1989,"['73.7924695', '-80.314720']",****************************************,50 and above,100K-150K,JCB 16 digit,***************,**,*******************


### Save Dataframe

In [18]:
# Save the new DataFrame
df.to_csv('anonymize_data.csv', index=False)