# Import Libraries

In [45]:
import pandas as pd
import numpy as np

# Load Dataset

In [46]:
patients = pd.read_csv('patients.csv')
treatments = pd.read_csv('treatments.csv')
adverse_reactions = pd.read_csv('adverse_reactions.csv')

# Step 1: Cleaning Missing Data

## Cleaning missing data in the `treatments` table

In [47]:
treatments_clean = treatments.copy()
treatments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280 entries, 0 to 279
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   given_name    280 non-null    object 
 1   surname       280 non-null    object 
 2   auralin       280 non-null    object 
 3   novodra       280 non-null    object 
 4   hba1c_start   280 non-null    float64
 5   hba1c_end     280 non-null    float64
 6   hba1c_change  171 non-null    float64
dtypes: float64(3), object(4)
memory usage: 15.4+ KB


In [48]:
treatments_cut = pd .read_csv('treatments_cut.csv')
treatments_cut.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70 entries, 0 to 69
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   given_name    70 non-null     object 
 1   surname       70 non-null     object 
 2   auralin       70 non-null     object 
 3   novodra       70 non-null     object 
 4   hba1c_start   70 non-null     float64
 5   hba1c_end     70 non-null     float64
 6   hba1c_change  42 non-null     float64
dtypes: float64(3), object(4)
memory usage: 4.0+ KB


### Concatentating `treatments` and `treatments_cut`

The `treatments` and the `treatments_cut` tables can be merged through concatentation.

In [49]:
treatments_clean = pd.concat([treatments_clean,treatments_cut],ignore_index=True)

In [50]:
treatments_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   given_name    350 non-null    object 
 1   surname       350 non-null    object 
 2   auralin       350 non-null    object 
 3   novodra       350 non-null    object 
 4   hba1c_start   350 non-null    float64
 5   hba1c_end     350 non-null    float64
 6   hba1c_change  213 non-null    float64
dtypes: float64(3), object(4)
memory usage: 19.3+ KB


### Handling missing data in the `hba1c_change` column

Instead of removing the `hba1c_change` column, it's values can be calculated since the orginal source for this column can be found on the `hba1c_start` and `hba1c_end` columns.

In [51]:
treatments_clean['hba1c_change'].head()

0     NaN
1    0.97
2     NaN
3    0.35
4    0.32
Name: hba1c_change, dtype: float64

In [52]:
treatments_clean['hba1c_change'] = treatments_clean['hba1c_start'] - treatments_clean['hba1c_end']

In [53]:
treatments_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   given_name    350 non-null    object 
 1   surname       350 non-null    object 
 2   auralin       350 non-null    object 
 3   novodra       350 non-null    object 
 4   hba1c_start   350 non-null    float64
 5   hba1c_end     350 non-null    float64
 6   hba1c_change  350 non-null    float64
dtypes: float64(3), object(4)
memory usage: 19.3+ KB


In [54]:
treatments_clean['hba1c_change'].sample()

164    0.35
Name: hba1c_change, dtype: float64

# Step 2: Cleaning Tidiness Issues

## Issue 1: `contact` column in the `patients` table

As mentioned during the assessment lessons, the `contanct` column contains both the phone number and the email of the patient and not in an organized manner. They can be split up using a regex and the `extract()` function. The `expand=True` parameter can be used to set extract all addtional information to a seperate row)

In [55]:
patients.head()

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
0,1,female,Zoe,Wellish,576 Brown Bear Drive,Rancho California,California,92390.0,United States,951-719-9170ZoeWellish@superrito.com,7/10/1976,121.7,66,19.6
1,2,female,Pamela,Hill,2370 University Hill Road,Armstrong,Illinois,61812.0,United States,PamelaSHill@cuvox.de+1 (217) 569-3204,4/3/1967,118.8,66,19.2
2,3,male,Jae,Debord,1493 Poling Farm Road,York,Nebraska,68467.0,United States,402-363-6804JaeMDebord@gustr.com,2/19/1980,177.8,71,24.8
3,4,male,Liêm,Phan,2335 Webster Street,Woodbridge,NJ,7095.0,United States,PhanBaLiem@jourrapide.com+1 (732) 636-8246,7/26/1951,220.9,70,31.7
4,5,male,Tim,Neudorf,1428 Turkey Pen Lane,Dothan,AL,36303.0,United States,334-515-7487TimNeudorf@cuvox.de,2/18/1928,192.3,27,26.1


In [56]:
patients_clean = patients.copy()

### Extracting patients' phone number

In [57]:
patients_clean['phone_number'] = patients_clean['contact'].str.extract(r'((?:\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4})',expand=True)

### Extracting patients' email

In [58]:
patients_clean['email'] = patients_clean['contact'].str.extract(r'([a-zA-Z][a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+[a-zA-Z])',expand=True)

### Dropping the `contact` column

The `contact` column is no longer needed now that the phone numbers and emails of patients of been extracted to their own columns

In [59]:
patients_clean = patients_clean.drop('contact',axis=1)

In [60]:
patients_clean['phone_number'].sample(25)

240    +1 (256) 615-5522
4           334-515-7487
32          562-985-4582
202         503-820-7877
307         228-237-2271
246         714-431-2746
371    +1 (908) 287-7099
2           402-363-6804
354         201-586-2848
403         401-535-2675
126         508-739-5632
95          325-282-4087
63          619-299-1495
394         903-939-1025
98     +1 (907) 328-4125
370         508-821-2421
103    +1 (407) 838-0201
292         815-457-5970
358         203 235 1076
442         516-512-4875
309         256-872-9211
288         831-427-4114
436         703-547-0551
296                  NaN
379         580-622-5674
Name: phone_number, dtype: object

In [61]:
patients_clean['email'].sort_values().head()

404               AaliyahRice@dayrep.com
11          Abdul-NurMummarIsa@rhyta.com
332                AbelEfrem@fleckens.hu
258              AbelYonatan@teleworm.us
305    AddolorataLombardi@jourrapide.com
Name: email, dtype: object

None of the emails start with a number

## Issue 2: Three variables in two columns in treatments table

The `auralin` and `novodra` columns contain 3 variables: the start dose, end dose and the type of medicine used. These each should be placed into their own columns. This can be done by melting the `auralin` and `novodra` columns into a treatment and dose column. Then the dose column can be split into two different columns: start dose and end dose.

In [62]:
treatments_clean.head()

Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change
0,veronika,jindrová,41u - 48u,-,7.63,7.2,0.43
1,elliot,richardson,-,40u - 45u,7.56,7.09,0.47
2,yukitaka,takenaka,-,39u - 36u,7.68,7.25,0.43
3,skye,gormanston,33u - 36u,-,7.97,7.62,0.35
4,alissa,montez,-,33u - 29u,7.78,7.46,0.32


### Melting the `auralin` and `novodra` columns

In [63]:
treatments_clean = pd.melt(treatments_clean, id_vars= ['given_name', 'surname', 'hba1c_start','hba1c_end','hba1c_change']
                           ,var_name = 'treatment', value_name='dose')
treatments_clean.head()

Unnamed: 0,given_name,surname,hba1c_start,hba1c_end,hba1c_change,treatment,dose
0,veronika,jindrová,7.63,7.2,0.43,auralin,41u - 48u
1,elliot,richardson,7.56,7.09,0.47,auralin,-
2,yukitaka,takenaka,7.68,7.25,0.43,auralin,-
3,skye,gormanston,7.97,7.62,0.35,auralin,33u - 36u
4,alissa,montez,7.78,7.46,0.32,auralin,-


### Removing the rows that have the dose as '-'

In [64]:
treatments_clean = treatments_clean[treatments_clean['dose'] != "-"]
treatments_clean

Unnamed: 0,given_name,surname,hba1c_start,hba1c_end,hba1c_change,treatment,dose
0,veronika,jindrová,7.63,7.20,0.43,auralin,41u - 48u
3,skye,gormanston,7.97,7.62,0.35,auralin,33u - 36u
6,sophia,haugen,7.65,7.27,0.38,auralin,37u - 42u
7,eddie,archer,7.89,7.55,0.34,auralin,31u - 38u
9,asia,woźniak,7.76,7.37,0.39,auralin,30u - 36u
...,...,...,...,...,...,...,...
688,christopher,woodward,7.51,7.06,0.45,novodra,55u - 51u
690,maret,sultygov,7.67,7.30,0.37,novodra,26u - 23u
694,lixue,hsueh,9.21,8.80,0.41,novodra,22u - 23u
696,jakob,jakobsen,7.96,7.51,0.45,novodra,28u - 26u


### Splitting the dose column into `dose_start` and `dose_end` columns

In [65]:
treatments_clean[['dose_start','dose_end']] = treatments_clean['dose'].str.split('-',n=1,expand=True) #n=1 represents the limit of the split where n is the maximum index

### Dropping the `dose` column

In [66]:
treatments_clean = treatments_clean.drop('dose',axis=1)

In [67]:
treatments_clean.head()

Unnamed: 0,given_name,surname,hba1c_start,hba1c_end,hba1c_change,treatment,dose_start,dose_end
0,veronika,jindrová,7.63,7.2,0.43,auralin,41u,48u
3,skye,gormanston,7.97,7.62,0.35,auralin,33u,36u
6,sophia,haugen,7.65,7.27,0.38,auralin,37u,42u
7,eddie,archer,7.89,7.55,0.34,auralin,31u,38u
9,asia,woźniak,7.76,7.37,0.39,auralin,30u,36u


## Issue 3: Adverse reactions should be a part of the treatments table

The `adverse reaction` table can be merged onto the `treatments` table to add the `adverse_reaction` column to it. It can be merged based on the `given_name` and `surname` columns, using left merge. Left merge is used as opposed to a different kind of merge because the focus is to add onto the treatments table rather than maintain every entry within the `adverse reaction` table.

In [68]:
adverse_reactions_clean = adverse_reactions.copy()
treatments_clean = pd.merge(treatments_clean, adverse_reactions_clean, on=['given_name','surname'],how="left")

In [69]:
treatments_clean

Unnamed: 0,given_name,surname,hba1c_start,hba1c_end,hba1c_change,treatment,dose_start,dose_end,adverse_reaction
0,veronika,jindrová,7.63,7.20,0.43,auralin,41u,48u,
1,skye,gormanston,7.97,7.62,0.35,auralin,33u,36u,
2,sophia,haugen,7.65,7.27,0.38,auralin,37u,42u,
3,eddie,archer,7.89,7.55,0.34,auralin,31u,38u,
4,asia,woźniak,7.76,7.37,0.39,auralin,30u,36u,
...,...,...,...,...,...,...,...,...,...
345,christopher,woodward,7.51,7.06,0.45,novodra,55u,51u,nausea
346,maret,sultygov,7.67,7.30,0.37,novodra,26u,23u,
347,lixue,hsueh,9.21,8.80,0.41,novodra,22u,23u,injection site discomfort
348,jakob,jakobsen,7.96,7.51,0.45,novodra,28u,26u,hypoglycemia


Any rows that contain a NaN in their adverse_reacion column can be considereed to have no adverse_reaction.

## Issue 4: Duplicates in `given_name` and `surname` in different tables

Some names in the `patients` table can be found in the `treatments` table as duplicates but not as it is written, it may appear as a nickname of the same person or be in different character case.

Working with the patients' IDs and their names in lowercase should help discover the duplicated rows.

In [70]:
patients_clean.head()

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,birthdate,weight,height,bmi,phone_number,email
0,1,female,Zoe,Wellish,576 Brown Bear Drive,Rancho California,California,92390.0,United States,7/10/1976,121.7,66,19.6,951-719-9170,ZoeWellish@superrito.com
1,2,female,Pamela,Hill,2370 University Hill Road,Armstrong,Illinois,61812.0,United States,4/3/1967,118.8,66,19.2,+1 (217) 569-3204,PamelaSHill@cuvox.de
2,3,male,Jae,Debord,1493 Poling Farm Road,York,Nebraska,68467.0,United States,2/19/1980,177.8,71,24.8,402-363-6804,JaeMDebord@gustr.com
3,4,male,Liêm,Phan,2335 Webster Street,Woodbridge,NJ,7095.0,United States,7/26/1951,220.9,70,31.7,+1 (732) 636-8246,PhanBaLiem@jourrapide.com
4,5,male,Tim,Neudorf,1428 Turkey Pen Lane,Dothan,AL,36303.0,United States,2/18/1928,192.3,27,26.1,334-515-7487,TimNeudorf@cuvox.de


In [71]:
treatments_clean.head()

Unnamed: 0,given_name,surname,hba1c_start,hba1c_end,hba1c_change,treatment,dose_start,dose_end,adverse_reaction
0,veronika,jindrová,7.63,7.2,0.43,auralin,41u,48u,
1,skye,gormanston,7.97,7.62,0.35,auralin,33u,36u,
2,sophia,haugen,7.65,7.27,0.38,auralin,37u,42u,
3,eddie,archer,7.89,7.55,0.34,auralin,31u,38u,
4,asia,woźniak,7.76,7.37,0.39,auralin,30u,36u,


### Getting the names of the patients and converting them to lowercase

In [72]:
id_names = patients_clean[['patient_id', 'given_name', 'surname']].copy()
id_names['given_name'] = id_names['given_name'].str.lower()
id_names['surname'] = id_names['surname'].str.lower()

### Merging the patients names in the `treatments` table based on the names in the `patients` table

In [73]:
treatments_clean = pd.merge(treatments_clean,id_names, on=['given_name','surname'])
treatments_clean = treatments_clean.drop(['given_name', 'surname'], axis=1)

### Confirming the merge was successful

In [74]:
treatments_clean.head()

Unnamed: 0,hba1c_start,hba1c_end,hba1c_change,treatment,dose_start,dose_end,adverse_reaction,patient_id
0,7.63,7.2,0.43,auralin,41u,48u,,225
1,7.97,7.62,0.35,auralin,33u,36u,,242
2,7.65,7.27,0.38,auralin,37u,42u,,345
3,7.89,7.55,0.34,auralin,31u,38u,,276
4,7.76,7.37,0.39,auralin,30u,36u,,15


In [75]:
patients_clean.head()

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,birthdate,weight,height,bmi,phone_number,email
0,1,female,Zoe,Wellish,576 Brown Bear Drive,Rancho California,California,92390.0,United States,7/10/1976,121.7,66,19.6,951-719-9170,ZoeWellish@superrito.com
1,2,female,Pamela,Hill,2370 University Hill Road,Armstrong,Illinois,61812.0,United States,4/3/1967,118.8,66,19.2,+1 (217) 569-3204,PamelaSHill@cuvox.de
2,3,male,Jae,Debord,1493 Poling Farm Road,York,Nebraska,68467.0,United States,2/19/1980,177.8,71,24.8,402-363-6804,JaeMDebord@gustr.com
3,4,male,Liêm,Phan,2335 Webster Street,Woodbridge,NJ,7095.0,United States,7/26/1951,220.9,70,31.7,+1 (732) 636-8246,PhanBaLiem@jourrapide.com
4,5,male,Tim,Neudorf,1428 Turkey Pen Lane,Dothan,AL,36303.0,United States,2/18/1928,192.3,27,26.1,334-515-7487,TimNeudorf@cuvox.de


### Confirming that `patient_id` is the only duplicated column between the two tables

In [76]:
all_columns = pd.Series(list(patients_clean) + list(treatments_clean))
all_columns[all_columns.duplicated()]

22    patient_id
dtype: object

# Step 3: Cleaning Quality Issues

## Issue 1 Validity: Incorrect data types.

### 1.1 In the `treatments` table, the `dose_start` and `dose_end` columns are stored as a string due to the 'u' in their values. The 'u' should be removed and the column should be converted to an integer.

In [77]:
treatments_clean[['dose_start','dose_end']].head()

Unnamed: 0,dose_start,dose_end
0,41u,48u
1,33u,36u
2,37u,42u
3,31u,38u
4,30u,36u


#### Stripping you and converting the columns to integers

In [78]:
treatments_clean['dose_start'] = treatments_clean['dose_start'].str.strip('u ').astype(int)
treatments_clean['dose_end'] = treatments_clean['dose_end'].str.strip('u').astype(int)

#### Confirming the data types

In [79]:
treatments_clean[['dose_start','dose_end']].dtypes

dose_start    int32
dose_end      int32
dtype: object

In [83]:
assert treatments_clean[['dose_start', 'dose_end']].dtypes.iloc[0] == 'int32'
assert treatments_clean[['dose_start', 'dose_end']].dtypes.iloc[1] == 'int32'

### 1.2 Zip codes should not be represented as a float, but as string.

Zip codes can sometimes be less than 5 digits long and should have an initial zero to show that. This can be achieved by converting the `zip_code` column, in the `patients` table, to a string and padding any zip codes less than 5 digits long with zeros.

In [87]:
patients_clean['zip_code'].head()

0    92390.0
1    61812.0
2    68467.0
3     7095.0
4    36303.0
Name: zip_code, dtype: float64

#### Performing the convertion and padding operation

In [89]:
patients_clean['zip_code'] = patients_clean['zip_code'].astype(str).str[:-2].str.pad(5,fillchar='0')

Applying this code will convert NaN values with '0000n', these should be converted back to NaN

In [92]:
patients_clean['zip_code'] = patients_clean['zip_code'].replace('0000n',np.nan)

In [93]:
patients_clean['zip_code'].head()

0    92390
1    61812
2    68467
3    07095
4    36303
Name: zip_code, dtype: object

### 1.3: In the `patients` table, the `assigned sex`, `state` and `birthdate` should be converted to their proper data type

`assigned sex` and `state` should be conerted to a category data type. `birthdate` should be converted to a datetime object.

In [94]:
patients_clean.dtypes

patient_id        int64
assigned_sex     object
given_name       object
surname          object
address          object
city             object
state            object
zip_code         object
country          object
birthdate        object
weight          float64
height            int64
bmi             float64
phone_number     object
email            object
dtype: object

#### Converting `assigned_sex` and `state` columns to a category data type

In [95]:
patients_clean['assigned_sex'] = patients_clean['assigned_sex'].astype('category')
patients_clean['state'] = patients_clean['state'].astype('category')

#### Converting `birthdate` to a datetime date type

In [96]:
patients_clean['birthdate'] = pd.to_datetime(patients_clean.birthdate)

In [97]:
patients_clean.dtypes

patient_id               int64
assigned_sex          category
given_name              object
surname                 object
address                 object
city                    object
state                 category
zip_code                object
country                 object
birthdate       datetime64[ns]
weight                 float64
height                   int64
bmi                    float64
phone_number            object
email                   object
dtype: object

## Issue 3: Consistency

As mentioned during the assessment phase (different notebook), there was a weight and a height that needed correction

### Getting the patient with the incorrect weight using a mask

In [100]:
mask = patients_clean['surname'] == 'Zaitseva'
weight_kg= patients_clean[mask]['weight']
weight_kg

210    48.8
Name: weight, dtype: float64

### Fixing the weight issue

In [101]:
patients_clean.loc[mask,'weight'] = weight_kg*2.20462

In [102]:
patients_clean['weight'].sort_values()

459    102.1
335    102.7
74     103.2
317    106.0
171    106.5
       ...  
144    244.9
61     244.9
283    245.5
118    254.5
485    255.9
Name: weight, Length: 503, dtype: float64

In [103]:
assert patients_clean['weight'].min() >= 100

## Issue 4: Uniqueness and Validity

### 4.1: There are lots of default "John Doe"s. The non-recoverable ones should be removed from the patients table

In [104]:
john_doe =  patients_clean[(patients_clean['surname'] == 'Doe') & (patients_clean['given_name'] == 'John')]
john_doe

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,birthdate,weight,height,bmi,phone_number,email
215,216,male,John,Doe,123 Main Street,New York,NY,12345,United States,1975-01-01,180.0,72,24.4,1234567890,johndoe@email.com
229,230,male,John,Doe,123 Main Street,New York,NY,12345,United States,1975-01-01,180.0,72,24.4,1234567890,johndoe@email.com
237,238,male,John,Doe,123 Main Street,New York,NY,12345,United States,1975-01-01,180.0,72,24.4,1234567890,johndoe@email.com
244,245,male,John,Doe,123 Main Street,New York,NY,12345,United States,1975-01-01,180.0,72,24.4,1234567890,johndoe@email.com
251,252,male,John,Doe,123 Main Street,New York,NY,12345,United States,1975-01-01,180.0,72,24.4,1234567890,johndoe@email.com
277,278,male,John,Doe,123 Main Street,New York,NY,12345,United States,1975-01-01,180.0,72,24.4,1234567890,johndoe@email.com


#### Removing the 'John Doe' records

In [106]:
patients_clean = patients_clean[~((patients_clean['surname'] == 'Doe') & (patients_clean['given_name'] == 'John'))]

#### Confirming the records have been removed

In [108]:
patients_clean['surname'].value_counts()

surname
Jakobsen       3
Taylor         3
Aranda         2
Tucker         2
Souza          2
              ..
Casárez        1
Mata           1
Pospíšil       1
Rukavina       1
Onyekaozulu    1
Name: count, Length: 465, dtype: int64

In [111]:
patients_clean['address'].value_counts()

address
2778 North Avenue           2
2476 Fulton Street          2
648 Old Dear Lane           2
576 Brown Bear Drive        1
2272 Williams Avenue        1
                           ..
1066 Goosetown Drive        1
4291 Patton Lane            1
4643 Reeves Street          1
174 Lost Creek Road         1
3652 Boone Crockett Lane    1
Name: count, Length: 482, dtype: int64

### 4.2: Removing the duplicates "Jake Jakobsen", "Pat Gersten", and "Sandy Taylor" rows from the `patients` table.

These rows are duplicates of other rows. They are just the nick names of the orignals. "Jake" is originally "Jacob". "Pat" is "Patrick". "Sandy" is "Sandra". These nicknames are not in the `treatments` table.

In [112]:
patients_clean[patients_clean['surname'] == 'Jakobsen']

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,birthdate,weight,height,bmi,phone_number,email
24,25,male,Jakob,Jakobsen,648 Old Dear Lane,Port Jervis,New York,12771,United States,1985-08-01,155.8,67,24.4,+1 (845) 858-7707,JakobCJakobsen@einrot.com
29,30,male,Jake,Jakobsen,648 Old Dear Lane,Port Jervis,New York,12771,United States,1985-08-01,155.8,67,24.4,+1 (845) 858-7707,JakobCJakobsen@einrot.com
432,433,female,Karen,Jakobsen,1690 Fannie Street,Houston,TX,77020,United States,1962-11-25,185.2,67,29.0,979 203 0438,KarenJakobsen@jourrapide.com


In [113]:
patients_clean[patients_clean['surname'] == 'Gersten']

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,birthdate,weight,height,bmi,phone_number,email
97,98,male,Patrick,Gersten,2778 North Avenue,Burr,NE,68324,United States,1954-05-03,138.2,71,19.3,402-848-4923,PatrickGersten@rhyta.com
502,503,male,Pat,Gersten,2778 North Avenue,Burr,Nebraska,68324,United States,1954-05-03,138.2,71,19.3,402-848-4923,PatrickGersten@rhyta.com


In [114]:
patients_clean[patients_clean['surname'] == 'Taylor']

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,birthdate,weight,height,bmi,phone_number,email
131,132,female,Sandra,Taylor,2476 Fulton Street,Rainelle,WV,25962,United States,1960-10-23,206.1,64,35.4,304-438-2648,SandraCTaylor@dayrep.com
282,283,female,Sandy,Taylor,2476 Fulton Street,Rainelle,WV,25962,United States,1960-10-23,206.1,64,35.4,304-438-2648,SandraCTaylor@dayrep.com
426,427,male,Rogelio,Taylor,4064 Marigold Lane,Miami,FL,33179,United States,1992-09-02,186.6,69,27.6,305-434-6299,RogelioJTaylor@teleworm.us


#### They can also be found by checking if the address is duplicated (and not empty)

In [116]:
patients_clean[(patients_clean['address'].duplicated()) & (patients['address'].notnull()) ]

  patients_clean[(patients_clean['address'].duplicated()) & (patients['address'].notnull()) ]


Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,birthdate,weight,height,bmi,phone_number,email
29,30,male,Jake,Jakobsen,648 Old Dear Lane,Port Jervis,New York,12771,United States,1985-08-01,155.8,67,24.4,+1 (845) 858-7707,JakobCJakobsen@einrot.com
282,283,female,Sandy,Taylor,2476 Fulton Street,Rainelle,WV,25962,United States,1960-10-23,206.1,64,35.4,304-438-2648,SandraCTaylor@dayrep.com
502,503,male,Pat,Gersten,2778 North Avenue,Burr,Nebraska,68324,United States,1954-05-03,138.2,71,19.3,402-848-4923,PatrickGersten@rhyta.com


#### Removing these duplicated entries

In [117]:
patients_clean = patients_clean[~((patients_clean['address'].duplicated()) & (patients['address'].notnull()))]

  patients_clean = patients_clean[~((patients_clean['address'].duplicated()) & (patients['address'].notnull()))]


#### Confirming teh duplicates are gone

In [118]:
patients_clean[patients_clean['surname'] == 'Jakobsen']

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,birthdate,weight,height,bmi,phone_number,email
24,25,male,Jakob,Jakobsen,648 Old Dear Lane,Port Jervis,New York,12771,United States,1985-08-01,155.8,67,24.4,+1 (845) 858-7707,JakobCJakobsen@einrot.com
432,433,female,Karen,Jakobsen,1690 Fannie Street,Houston,TX,77020,United States,1962-11-25,185.2,67,29.0,979 203 0438,KarenJakobsen@jourrapide.com


In [119]:
patients_clean[patients_clean['surname'] == 'Gersten']

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,birthdate,weight,height,bmi,phone_number,email
97,98,male,Patrick,Gersten,2778 North Avenue,Burr,NE,68324,United States,1954-05-03,138.2,71,19.3,402-848-4923,PatrickGersten@rhyta.com


In [120]:
patients_clean[patients_clean['surname'] == 'Taylor']

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,birthdate,weight,height,bmi,phone_number,email
131,132,female,Sandra,Taylor,2476 Fulton Street,Rainelle,WV,25962,United States,1960-10-23,206.1,64,35.4,304-438-2648,SandraCTaylor@dayrep.com
426,427,male,Rogelio,Taylor,4064 Marigold Lane,Miami,FL,33179,United States,1992-09-02,186.6,69,27.6,305-434-6299,RogelioJTaylor@teleworm.us
