In [1]:
import pandas as pd
import numpy as np

# User Credit Card


In [2]:
df_user_cc = pd.read_pickle(
    'Customer Management Department/user_credit_card.pickle')


print(df_user_cc)


print(df_user_cc.columns.ravel())

        user_id              name  credit_card_number   issuing_bank
0     USER40678        Zion Feest          4294956114            bpi
1     USER08728  Kattie Bergstrom          2742902159            bdo
2     USER29759      Aiden Corwin          1917471950            bdo
3     USER16806    Vince Gislason          3290792253      chinabank
4     USER27644     Adele Okuneva          2313395832      chinabank
...         ...               ...                 ...            ...
4995  USER49969   Kendall Waelchi          1870581879      chinabank
4996  USER57209  Lucious Kshlerin          1139377065       mayabank
4997  USER36424    Cordie Jenkins          2187584552  robinsonsbank
4998  USER45166        Tyrel Feil          2542420860            bdo
4999  USER54154       Betty Zieme          4104712921   securitybank

[5000 rows x 4 columns]
Index(['user_id', 'name', 'credit_card_number', 'issuing_bank'], dtype='object')


## Fix the naming of name and issuing_bank


In [3]:
unique_issuing_banks = df_user_cc['issuing_bank'].unique()
unique_issuing_banks

array(['bpi', 'bdo', 'chinabank', 'metrobank', 'mayabank',
       'robinsonsbank', 'securitybank', 'eastwest'], dtype=object)

#### Fix the name of the banks


In [4]:
df_user_cc['issuing_bank'] = df_user_cc['issuing_bank'].replace({'chinabank': 'China Bank', 'bpi': 'BPI', 'bdo': 'BDO', 'metrobank': 'Metrobank',
                                                                'mayabank': 'Maya Bank', 'robinsonsbank': 'Robinsons Bank', 'securitybank': 'Security Bank', 'eastwest': 'EastWest'})

unique_issuing_banks = df_user_cc['issuing_bank'].unique()
print(unique_issuing_banks)

['BPI' 'BDO' 'China Bank' 'Metrobank' 'Maya Bank' 'Robinsons Bank'
 'Security Bank' 'EastWest']


#### Convert name to title case


In [5]:
df_user_cc['name'] = df_user_cc['name'].str.title()
unique_issuing_names = df_user_cc['name'].unique()
unique_issuing_names

array(['Zion Feest', 'Kattie Bergstrom', 'Aiden Corwin', ...,
       'Cordie Jenkins', 'Tyrel Feil', 'Betty Zieme'], dtype=object)

#### Check for special characters


In [6]:
special_chars_check = df_user_cc['name'].str.contains(
    r'[^A-Za-z0-9\s]', regex=True)
print(df_user_cc[special_chars_check])

Empty DataFrame
Columns: [user_id, name, credit_card_number, issuing_bank]
Index: []


In [7]:
df_user_cc

Unnamed: 0,user_id,name,credit_card_number,issuing_bank
0,USER40678,Zion Feest,4294956114,BPI
1,USER08728,Kattie Bergstrom,2742902159,BDO
2,USER29759,Aiden Corwin,1917471950,BDO
3,USER16806,Vince Gislason,3290792253,China Bank
4,USER27644,Adele Okuneva,2313395832,China Bank
...,...,...,...,...
4995,USER49969,Kendall Waelchi,1870581879,China Bank
4996,USER57209,Lucious Kshlerin,1139377065,Maya Bank
4997,USER36424,Cordie Jenkins,2187584552,Robinsons Bank
4998,USER45166,Tyrel Feil,2542420860,BDO


## Remove invalid credit card number


Check if there are credit card numbers that is not 10 digits


In [8]:
df_user_cc['credit_card_number'] = df_user_cc['credit_card_number'].astype(str)

not_ten_digits_check = ~df_user_cc['credit_card_number'].str.match(r'^\d{10}$')

print(df_user_cc[not_ten_digits_check])

        user_id              name credit_card_number   issuing_bank
10    USER02300   Aubrey Mitchell          779912290      Metrobank
12    USER15171    Junius Watsica          489754926      Maya Bank
15    USER29028       Cali Walker          786901098  Security Bank
23    USER23143       Dora Jacobs          660162130       EastWest
38    USER24767      Pink Schmitt          276041500  Security Bank
...         ...               ...                ...            ...
4976  USER04133       Kenyon King          629941793  Security Bank
4977  USER24235  Calista Prohaska          633907739  Security Bank
4978  USER05332   Benton Kassulke          195020692       EastWest
4985  USER44983       Rahul Olson          106565871      Metrobank
4986  USER47658      Jerrell Veum          597295254            BPI

[1115 rows x 4 columns]


Remove rows that are not 10 digits in credit card number


In [9]:
df_user_cc = df_user_cc[df_user_cc['credit_card_number'].astype(
    str).apply(len) == 10]

df_user_cc['credit_card_number'] = df_user_cc['credit_card_number'].astype(str)
not_ten_digits_check = ~df_user_cc['credit_card_number'].str.match(r'^\d{10}$')
print(df_user_cc[not_ten_digits_check])

Empty DataFrame
Columns: [user_id, name, credit_card_number, issuing_bank]
Index: []


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_cc['credit_card_number'] = df_user_cc['credit_card_number'].astype(str)


## Check for rows with same credit card number but different names


In [10]:

duplicate_credit_cards = df_user_cc[df_user_cc.duplicated(
    'credit_card_number', keep=False)]

print(duplicate_credit_cards)

Empty DataFrame
Columns: [user_id, name, credit_card_number, issuing_bank]
Index: []


## Check for different user id but same name, credit card number, and issuing bank


In [11]:
duplicate_rows_diff_id = df_user_cc[df_user_cc.duplicated(
    ['name', 'credit_card_number', 'issuing_bank'], keep=False)]

print(duplicate_rows_diff_id)

Empty DataFrame
Columns: [user_id, name, credit_card_number, issuing_bank]
Index: []


## Check for duplicate user id


In [12]:
duplicate_user_ids = df_user_cc[df_user_cc.duplicated(
    subset='user_id', keep=False)]

duplicate_user_ids.sort_values(by='user_id', inplace=True)
duplicate_user_ids

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicate_user_ids.sort_values(by='user_id', inplace=True)


Unnamed: 0,user_id,name,credit_card_number,issuing_bank
366,USER00304,Tom Welch,1356102080,BDO
554,USER00304,Mariam Kemmer,2209453432,EastWest
440,USER00393,Liliane Smith,2345218200,BDO
1230,USER00393,Shaina Bailey,2603941907,BDO
2551,USER00473,Landen Mayer,1873877802,BDO
...,...,...,...,...
3803,USER63701,Johan Hirthe,2728919776,BDO
1124,USER63839,Carole Jewess,1030639809,Robinsons Bank
1446,USER63839,Jairo Greenholt,3194193845,China Bank
738,USER64913,Madisen Bahringer,2256946049,China Bank


In [13]:
df_user_cc['user_id'] = df_user_cc['user_id'].str.replace(
    'USER', '').astype(int)
df_user_cc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_cc['user_id'] = df_user_cc['user_id'].str.replace(


Unnamed: 0,user_id,name,credit_card_number,issuing_bank
0,40678,Zion Feest,4294956114,BPI
1,8728,Kattie Bergstrom,2742902159,BDO
2,29759,Aiden Corwin,1917471950,BDO
3,16806,Vince Gislason,3290792253,China Bank
4,27644,Adele Okuneva,2313395832,China Bank
...,...,...,...,...
4995,49969,Kendall Waelchi,1870581879,China Bank
4996,57209,Lucious Kshlerin,1139377065,Maya Bank
4997,36424,Cordie Jenkins,2187584552,Robinsons Bank
4998,45166,Tyrel Feil,2542420860,BDO


In [14]:
df_user_cc['user_id'] = df_user_cc.groupby('user_id').cumcount().add(
    1).astype(str).radd('USER') + df_user_cc['user_id'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_cc['user_id'] = df_user_cc.groupby('user_id').cumcount().add(


In [15]:
duplicate_user_ids = df_user_cc[df_user_cc.duplicated(
    subset='user_id', keep=False)]

duplicate_user_ids.sort_values(by='user_id', inplace=True)
duplicate_user_ids

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicate_user_ids.sort_values(by='user_id', inplace=True)


Unnamed: 0,user_id,name,credit_card_number,issuing_bank


In [16]:
duplicate_check = df_user_cc.duplicated('user_id', keep=False)

if duplicate_check.any():
    print("There are still duplicate user IDs.")
else:
    print("There are no duplicate user IDs.")

There are no duplicate user IDs.


## Checking for Nulls


In [17]:
df_user_cc.isnull().sum()

user_id               0
name                  0
credit_card_number    0
issuing_bank          0
dtype: int64

In [18]:
df_user_cc

Unnamed: 0,user_id,name,credit_card_number,issuing_bank
0,USER140678,Zion Feest,4294956114,BPI
1,USER18728,Kattie Bergstrom,2742902159,BDO
2,USER129759,Aiden Corwin,1917471950,BDO
3,USER116806,Vince Gislason,3290792253,China Bank
4,USER127644,Adele Okuneva,2313395832,China Bank
...,...,...,...,...
4995,USER149969,Kendall Waelchi,1870581879,China Bank
4996,USER157209,Lucious Kshlerin,1139377065,Maya Bank
4997,USER236424,Cordie Jenkins,2187584552,Robinsons Bank
4998,USER145166,Tyrel Feil,2542420860,BDO


## Convert to parquet


In [19]:
df_user_cc.to_parquet(
    'Customer Management Department/user_credit_card.parquet', index=False)

# User Data


In [20]:
df_user_data = pd.read_json('Customer Management Department/user_data.json')
df_user_data.head()

Unnamed: 0,user_id,creation_date,name,street,state,city,country,birthdate,gender,device_address,user_type
0,USER40678,2021-03-17 22:56:13,Zion Feest,14938 West Trace side,New Jersey,Birmingham,Hong Kong,1998-04-06 05:29:37,male,17:fb:f2:60:94:4b,basic
1,USER08728,2022-10-10 12:53:20,Kattie Bergstrom,4476 West Haven fort,Alabama,Irvine,Mayotte,2003-05-22 11:16:19,male,b0:17:a7:0b:d6:67,premium
2,USER29759,2020-05-20 04:34:44,Aiden Corwin,59980 North Crest chester,North Carolina,Tampa,Iraq,2008-08-29 16:42:05,female,24:f2:0b:88:2f:bd,basic
3,USER16806,2021-05-28 07:36:30,Vince Gislason,541 Radial mouth,Illinois,Orlando,New Zealand,2012-02-09 14:12:37,male,a4:f5:fd:fe:07:f9,basic
4,USER27644,2023-03-16 19:25:35,Adele Okuneva,896 Glen bury,Arizona,Reno,Mexico,1976-10-13 00:53:54,male,ac:80:b3:bc:8d:5f,premium


## Fix the naming of name, street, state, city, country, gender, and user type


#### Convert them to title case


In [55]:
df_user_data['name'] = df_user_data['name'].str.title()
df_user_data['street'] = df_user_data['street'].str.title()
df_user_data['state'] = df_user_data['state'].str.title()
df_user_data['city'] = df_user_data['city'].str.title()
df_user_data['country'] = df_user_data['country'].str.title()
df_user_data['gender'] = df_user_data['gender'].str.title()
df_user_data['user_type'] = df_user_data['user_type'].str.title()

df_user_data

Unnamed: 0,user_id,creation_date,name,street,state,city,country,birthdate,gender,device_address,user_type
0,USER140678,2021-03-17 22:56:13,Zion Feest,14938 West Trace Side,New Jersey,Birmingham,Hong Kong,1998-04-06 05:29:37,Male,17:fb:f2:60:94:4b,Basic
1,USER18728,2022-10-10 12:53:20,Kattie Bergstrom,4476 West Haven Fort,Alabama,Irvine,Mayotte,2003-05-22 11:16:19,Male,b0:17:a7:0b:d6:67,Premium
2,USER129759,2020-05-20 04:34:44,Aiden Corwin,59980 North Crest Chester,North Carolina,Tampa,Iraq,2008-08-29 16:42:05,Female,24:f2:0b:88:2f:bd,Basic
3,USER116806,2021-05-28 07:36:30,Vince Gislason,541 Radial Mouth,Illinois,Orlando,New Zealand,2012-02-09 14:12:37,Male,a4:f5:fd:fe:07:f9,Basic
4,USER127644,2023-03-16 19:25:35,Adele Okuneva,896 Glen Bury,Arizona,Reno,Mexico,1976-10-13 00:53:54,Male,ac:80:b3:bc:8d:5f,Premium
...,...,...,...,...,...,...,...,...,...,...,...
4995,USER149969,2021-11-18 19:04:30,Kendall Waelchi,792 North Manors Haven,Florida,San Diego,Jersey,1996-03-11 09:05:32,Female,8e:57:7e:62:90:b4,Basic
4996,USER157209,2022-03-27 09:17:43,Lucious Kshlerin,48282 Burgs Chester,Kentucky,Garland,Ethiopia,2013-07-07 08:53:27,Female,00:35:b5:e4:e1:aa,Basic
4997,USER236424,2022-10-05 21:36:56,Cordie Jenkins,8928 North Mountains Haven,North Dakota,Scottsdale,Tonga,1991-05-18 05:07:36,Male,0b:0d:57:d6:c1:ab,Basic
4998,USER245166,2020-07-20 03:51:41,Tyrel Feil,920 New Manor Port,Rhode Island,Arlington,Virgin Islands (U.S.),1975-12-19 02:27:16,Male,83:c9:1e:4e:52:f4,Premium


#### Check for special characters


In [22]:
special_chars_regex = r'[^A-Za-z0-9\s]'

special_chars_check = df_user_data['name'].str.contains(
    special_chars_regex, regex=True)
rows_with_special_chars = df_user_data[special_chars_check]
rows_with_special_chars

Unnamed: 0,user_id,creation_date,name,street,state,city,country,birthdate,gender,device_address,user_type


## Check for duplicate rows with different user id


In [23]:
duplicate_rows = df_user_data[df_user_data.duplicated(
    subset=df_user_data.columns.difference(['user_id']), keep=False)]

print(duplicate_rows)

Empty DataFrame
Columns: [user_id, creation_date, name, street, state, city, country, birthdate, gender, device_address, user_type]
Index: []


## Check if creation date and birthdate is valid


In [24]:
date_columns = ['creation_date', 'birthdate']

for column in date_columns:
    try:
        df_user_data[column] = pd.to_datetime(df_user_data[column])
        print(f"{column} is a valid datetime.")
    except ValueError:
        print(f"{column} contains invalid datetime values.")

creation_date is a valid datetime.
birthdate is a valid datetime.


#### Check if there are creation date before birthdate


In [25]:
invalid_dates = df_user_data[df_user_data['creation_date']
                             <= df_user_data['birthdate']]

print(invalid_dates)

Empty DataFrame
Columns: [user_id, creation_date, name, street, state, city, country, birthdate, gender, device_address, user_type]
Index: []


Remove future dates


In [56]:
df_user_data = df_user_data[(df_user_data['creation_date'] <= pd.to_datetime(
    'now')) & (df_user_data['birthdate'] <= pd.to_datetime('now'))]

df_user_data

Unnamed: 0,user_id,creation_date,name,street,state,city,country,birthdate,gender,device_address,user_type
0,USER140678,2021-03-17 22:56:13,Zion Feest,14938 West Trace Side,New Jersey,Birmingham,Hong Kong,1998-04-06 05:29:37,Male,17:fb:f2:60:94:4b,Basic
1,USER18728,2022-10-10 12:53:20,Kattie Bergstrom,4476 West Haven Fort,Alabama,Irvine,Mayotte,2003-05-22 11:16:19,Male,b0:17:a7:0b:d6:67,Premium
2,USER129759,2020-05-20 04:34:44,Aiden Corwin,59980 North Crest Chester,North Carolina,Tampa,Iraq,2008-08-29 16:42:05,Female,24:f2:0b:88:2f:bd,Basic
3,USER116806,2021-05-28 07:36:30,Vince Gislason,541 Radial Mouth,Illinois,Orlando,New Zealand,2012-02-09 14:12:37,Male,a4:f5:fd:fe:07:f9,Basic
4,USER127644,2023-03-16 19:25:35,Adele Okuneva,896 Glen Bury,Arizona,Reno,Mexico,1976-10-13 00:53:54,Male,ac:80:b3:bc:8d:5f,Premium
...,...,...,...,...,...,...,...,...,...,...,...
4995,USER149969,2021-11-18 19:04:30,Kendall Waelchi,792 North Manors Haven,Florida,San Diego,Jersey,1996-03-11 09:05:32,Female,8e:57:7e:62:90:b4,Basic
4996,USER157209,2022-03-27 09:17:43,Lucious Kshlerin,48282 Burgs Chester,Kentucky,Garland,Ethiopia,2013-07-07 08:53:27,Female,00:35:b5:e4:e1:aa,Basic
4997,USER236424,2022-10-05 21:36:56,Cordie Jenkins,8928 North Mountains Haven,North Dakota,Scottsdale,Tonga,1991-05-18 05:07:36,Male,0b:0d:57:d6:c1:ab,Basic
4998,USER245166,2020-07-20 03:51:41,Tyrel Feil,920 New Manor Port,Rhode Island,Arlington,Virgin Islands (U.S.),1975-12-19 02:27:16,Male,83:c9:1e:4e:52:f4,Premium


## Check for invalid device address


In [26]:
device_address_pattern = r'^([0-9A-Fa-f]{2}[:-]){5}([0-9A-Fa-f]{2})$'

invalid_device_address = df_user_data[~df_user_data['device_address'].str.match(
    device_address_pattern)]

print(invalid_device_address)

Empty DataFrame
Columns: [user_id, creation_date, name, street, state, city, country, birthdate, gender, device_address, user_type]
Index: []


## Fix for duplicate user id


In [27]:
duplicate_user_ids = df_user_data[df_user_data.duplicated(
    subset=['user_id'], keep=False)]

sorted_duplicate_user_ids = duplicate_user_ids.sort_values(by=['user_id'])

sorted_duplicate_user_ids.head()

Unnamed: 0,user_id,creation_date,name,street,state,city,country,birthdate,gender,device_address,user_type
366,USER00304,2022-02-26 23:45:04,Tom Welch,6161 Lake Plains Haven,New Mexico,Charlotte,Seychelles,1990-05-31 21:51:53,Male,3f:ac:2b:19:0e:a5,Basic
554,USER00304,2020-09-24 17:46:58,Mariam Kemmer,8376 Hill Ton,Colorado,Indianapolis,Mali,1973-06-25 19:24:02,Male,ab:80:d2:7c:98:76,Basic
440,USER00393,2022-10-16 17:26:09,Liliane Smith,4315 West Lodge Town,Florida,Cincinnati,Iraq,1973-06-08 08:48:19,Male,31:c7:99:cf:1f:cb,Verified
1230,USER00393,2022-08-06 02:47:01,Shaina Bailey,27558 Shores Furt,Alabama,Mesa,Brazil,1995-01-15 16:28:17,Female,a5:25:61:fd:a3:9f,Basic
775,USER00473,2023-03-10 07:52:20,Camron Stehr,9987 Port Neck Side,California,Anaheim,Brazil,1988-12-12 01:34:45,Male,86:ee:ae:92:d0:53,Basic


In [28]:
df_user_data['user_id'] = df_user_data['user_id'].str.replace(
    'USER', '').astype(int)
df_user_data.head()

Unnamed: 0,user_id,creation_date,name,street,state,city,country,birthdate,gender,device_address,user_type
0,40678,2021-03-17 22:56:13,Zion Feest,14938 West Trace Side,New Jersey,Birmingham,Hong Kong,1998-04-06 05:29:37,Male,17:fb:f2:60:94:4b,Basic
1,8728,2022-10-10 12:53:20,Kattie Bergstrom,4476 West Haven Fort,Alabama,Irvine,Mayotte,2003-05-22 11:16:19,Male,b0:17:a7:0b:d6:67,Premium
2,29759,2020-05-20 04:34:44,Aiden Corwin,59980 North Crest Chester,North Carolina,Tampa,Iraq,2008-08-29 16:42:05,Female,24:f2:0b:88:2f:bd,Basic
3,16806,2021-05-28 07:36:30,Vince Gislason,541 Radial Mouth,Illinois,Orlando,New Zealand,2012-02-09 14:12:37,Male,a4:f5:fd:fe:07:f9,Basic
4,27644,2023-03-16 19:25:35,Adele Okuneva,896 Glen Bury,Arizona,Reno,Mexico,1976-10-13 00:53:54,Male,ac:80:b3:bc:8d:5f,Premium


In [29]:
df_user_data['user_id'] = df_user_data.groupby('user_id').cumcount().add(
    1).astype(str).radd('USER') + df_user_data['user_id'].astype(str)

In [30]:
duplicate_check = df_user_data.duplicated('user_id', keep=False)

if duplicate_check.any():
    print("There are still duplicate user IDs.")
else:
    print("There are no duplicate user IDs.")

There are no duplicate user IDs.


In [31]:
df_user_data.head()

Unnamed: 0,user_id,creation_date,name,street,state,city,country,birthdate,gender,device_address,user_type
0,USER140678,2021-03-17 22:56:13,Zion Feest,14938 West Trace Side,New Jersey,Birmingham,Hong Kong,1998-04-06 05:29:37,Male,17:fb:f2:60:94:4b,Basic
1,USER18728,2022-10-10 12:53:20,Kattie Bergstrom,4476 West Haven Fort,Alabama,Irvine,Mayotte,2003-05-22 11:16:19,Male,b0:17:a7:0b:d6:67,Premium
2,USER129759,2020-05-20 04:34:44,Aiden Corwin,59980 North Crest Chester,North Carolina,Tampa,Iraq,2008-08-29 16:42:05,Female,24:f2:0b:88:2f:bd,Basic
3,USER116806,2021-05-28 07:36:30,Vince Gislason,541 Radial Mouth,Illinois,Orlando,New Zealand,2012-02-09 14:12:37,Male,a4:f5:fd:fe:07:f9,Basic
4,USER127644,2023-03-16 19:25:35,Adele Okuneva,896 Glen Bury,Arizona,Reno,Mexico,1976-10-13 00:53:54,Male,ac:80:b3:bc:8d:5f,Premium


## Checking For Nulls


In [32]:
df_user_data.isnull().sum()

user_id           0
creation_date     0
name              0
street            0
state             0
city              0
country           0
birthdate         0
gender            0
device_address    0
user_type         0
dtype: int64

In [33]:
print(df_user_data.duplicated().sum())

0


## Convert to parquet


In [34]:
df_user_data.to_parquet(
    'Customer Management Department/user_data.parquet', index=False)

# User Job


In [35]:
df_user_job = pd.read_csv('Customer Management Department/user_job.csv')
print(df_user_job)
print(df_user_job.columns.ravel())

      Unnamed: 0    user_id              name       job_title   job_level
0              0  USER40678        Zion Feest      Technician    Accounts
1              1  USER08728  Kattie Bergstrom      Technician   Solutions
2              2  USER29759      Aiden Corwin         Student         NaN
3              3  USER16806    Vince Gislason         Student         NaN
4              4  USER27644     Adele Okuneva       Associate   Usability
...          ...        ...               ...             ...         ...
4995        4995  USER49969   Kendall Waelchi     Facilitator  Directives
4996        4996  USER57209  Lucious Kshlerin         Student         NaN
4997        4997  USER36424    Cordie Jenkins  Representative  Directives
4998        4998  USER45166        Tyrel Feil      Supervisor     Metrics
4999        4999  USER54154       Betty Zieme         Student         NaN

[5000 rows x 5 columns]
Index(['Unnamed: 0', 'user_id', 'name', 'job_title', 'job_level'], dtype='object')


In [36]:
df_user_job

Unnamed: 0.1,Unnamed: 0,user_id,name,job_title,job_level
0,0,USER40678,Zion Feest,Technician,Accounts
1,1,USER08728,Kattie Bergstrom,Technician,Solutions
2,2,USER29759,Aiden Corwin,Student,
3,3,USER16806,Vince Gislason,Student,
4,4,USER27644,Adele Okuneva,Associate,Usability
...,...,...,...,...,...
4995,4995,USER49969,Kendall Waelchi,Facilitator,Directives
4996,4996,USER57209,Lucious Kshlerin,Student,
4997,4997,USER36424,Cordie Jenkins,Representative,Directives
4998,4998,USER45166,Tyrel Feil,Supervisor,Metrics


## Drop Unnamed column


In [37]:
df_user_job = df_user_job.drop(columns=['Unnamed: 0'])
df_user_job

Unnamed: 0,user_id,name,job_title,job_level
0,USER40678,Zion Feest,Technician,Accounts
1,USER08728,Kattie Bergstrom,Technician,Solutions
2,USER29759,Aiden Corwin,Student,
3,USER16806,Vince Gislason,Student,
4,USER27644,Adele Okuneva,Associate,Usability
...,...,...,...,...
4995,USER49969,Kendall Waelchi,Facilitator,Directives
4996,USER57209,Lucious Kshlerin,Student,
4997,USER36424,Cordie Jenkins,Representative,Directives
4998,USER45166,Tyrel Feil,Supervisor,Metrics


## Check unique values of job title and job level


In [38]:
unique_job_titles = df_user_job['job_title'].unique()
unique_job_levels = df_user_job['job_level'].unique()

print("Unique Job Titles:")
print(unique_job_titles)

print("\nUnique Job Levels:")
print(unique_job_levels)

Unique Job Titles:
['Technician' 'Student' 'Associate' 'Liaison' 'Director' 'Producer'
 'Executive' 'Strategist' 'Planner' 'Facilitator' 'Designer'
 'Orchestrator' 'Developer' 'Manager' 'Consultant' 'Engineer' 'Assistant'
 'Supervisor' 'Representative' 'Specialist' 'Analyst' 'Coordinator'
 'Architect' 'Agent' 'Officer' 'Administrator']

Unique Job Levels:
['Accounts' 'Solutions' nan 'Usability' 'Mobility' 'Operations' 'Security'
 'Assurance' 'Markets' 'Implementation' 'Paradigm' 'Creative' 'Response'
 'Accountability' 'Interactions' 'Directives' 'Intranet' 'Configuration'
 'Integration' 'Program' 'Identity' 'Factors' 'Division' 'Research'
 'Brand' 'Tactics' 'Quality' 'Data' 'Optimization' 'Metrics'
 'Infrastructure' 'Applications' 'Branding' 'Web' 'Communications'
 'Marketing' 'Functionality' 'Group']


#### Check unique combinations of job title and job level


In [39]:
unique_combinations = df_user_job[['job_title', 'job_level']].drop_duplicates()

print("Unique Combinations of Job Title and Job Level:")
print(unique_combinations)

Unique Combinations of Job Title and Job Level:
        job_title   job_level
0      Technician    Accounts
1      Technician   Solutions
2         Student         NaN
4       Associate   Usability
6      Technician    Mobility
...           ...         ...
4559  Facilitator  Operations
4673      Manager    Security
4821    Associate    Division
4857      Manager    Branding
4998   Supervisor     Metrics

[897 rows x 2 columns]


In [40]:
student_job_combinations = df_user_job[df_user_job['job_title'] == 'Student'][[
    'job_title', 'job_level']].drop_duplicates()

print("Unique Combinations of Student and Job Level:")
print(student_job_combinations)

Unique Combinations of Student and Job Level:
  job_title job_level
2   Student       NaN


## Check for invalid names


In [41]:
df_user_job['name'] = df_user_job['name'].str.title()
special_chars_check = df_user_job['name'].str.contains(
    r'[^A-Za-z0-9\s]', regex=True)

rows_with_special_chars = df_user_job[special_chars_check]
print("Rows with Special Characters in the Name Column:")
print(rows_with_special_chars)

Rows with Special Characters in the Name Column:
Empty DataFrame
Columns: [user_id, name, job_title, job_level]
Index: []


## Check for duplicate rows with different user id


In [42]:
duplicate_rows = df_user_job[df_user_job.duplicated(
    subset=df_user_job.columns.difference(['user_id']), keep=False)]

print(duplicate_rows)

        user_id           name job_title job_level
2036  USER45050  Nya Abernathy   Student       NaN
2293  USER44994  Nya Abernathy   Student       NaN


#### Remove duplicate row with different user id


In [43]:
df_user_job = df_user_job.drop_duplicates(
    subset=['name', 'job_title', 'job_level'], keep='first')

duplicate_rows = df_user_job[df_user_job.duplicated(
    subset=df_user_job.columns.difference(['user_id']), keep=False)]

print(duplicate_rows)

Empty DataFrame
Columns: [user_id, name, job_title, job_level]
Index: []


## Check for duplicate user id


In [44]:
duplicate_user_jobs = df_user_job[df_user_job.duplicated(
    subset=['user_id'], keep=False)]

sorted_duplicate_user_jobs = duplicate_user_jobs.sort_values(by=['user_id'])

sorted_duplicate_user_jobs

Unnamed: 0,user_id,name,job_title,job_level
366,USER00304,Tom Welch,Liaison,Integration
554,USER00304,Mariam Kemmer,Associate,Usability
440,USER00393,Liliane Smith,Orchestrator,Applications
1230,USER00393,Shaina Bailey,Specialist,Research
775,USER00473,Camron Stehr,Facilitator,Configuration
...,...,...,...,...
3764,USER64103,Ed Conroy,Student,
3671,USER64103,Cristopher Padberg,Student,
4104,USER64913,Kaia Heaney,Executive,Branding
4214,USER64913,Petra Hansen,Orchestrator,Mobility


In [45]:
df_user_job['user_id'] = df_user_job['user_id'].str.replace(
    'USER', '').astype(int)
df_user_job

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_job['user_id'] = df_user_job['user_id'].str.replace(


Unnamed: 0,user_id,name,job_title,job_level
0,40678,Zion Feest,Technician,Accounts
1,8728,Kattie Bergstrom,Technician,Solutions
2,29759,Aiden Corwin,Student,
3,16806,Vince Gislason,Student,
4,27644,Adele Okuneva,Associate,Usability
...,...,...,...,...
4995,49969,Kendall Waelchi,Facilitator,Directives
4996,57209,Lucious Kshlerin,Student,
4997,36424,Cordie Jenkins,Representative,Directives
4998,45166,Tyrel Feil,Supervisor,Metrics


In [46]:
df_user_job['user_id'] = df_user_job.groupby('user_id').cumcount().add(
    1).astype(str).radd('USER') + df_user_job['user_id'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_job['user_id'] = df_user_job.groupby('user_id').cumcount().add(


In [47]:
duplicate_check = df_user_job.duplicated('user_id', keep=False)

if duplicate_check.any():
    print("There are still duplicate user IDs.")
else:
    print("There are no duplicate user IDs.")

There are no duplicate user IDs.


## Checking for Nulls


In [48]:
df_user_job.isnull().sum()

user_id         0
name            0
job_title       0
job_level    1628
dtype: int64

In [49]:
df_user_job

Unnamed: 0,user_id,name,job_title,job_level
0,USER140678,Zion Feest,Technician,Accounts
1,USER18728,Kattie Bergstrom,Technician,Solutions
2,USER129759,Aiden Corwin,Student,
3,USER116806,Vince Gislason,Student,
4,USER127644,Adele Okuneva,Associate,Usability
...,...,...,...,...
4995,USER149969,Kendall Waelchi,Facilitator,Directives
4996,USER157209,Lucious Kshlerin,Student,
4997,USER236424,Cordie Jenkins,Representative,Directives
4998,USER245166,Tyrel Feil,Supervisor,Metrics


## Convert to parquet


In [50]:
df_user_job.to_parquet(
    'Customer Management Department/user_job.parquet', index=False)