# Data cleaning in pandas

In [1]:
import pandas as pd

In [2]:
df = pd.read_excel('Customer Call List.xlsx')
df.head()

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes,False
2,1003,Walter,/White,7066950392,298 Drugs Driveway,N,,True
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y,True
4,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No,True


In [3]:
#Drop duplicates
df = df.drop_duplicates()
df.tail()

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
15,1016,Ron,Weasley,123-545-5421,2395 Hogwarts Avenue,No,N,False
16,1017,Michael,Scott,123/643/9775,"121 Paper Avenue, Pennsylvania",Yes,No,False
17,1018,Clark,Kent,7066950392,3498 Super Lane,Y,,True
18,1019,Creed,Braton,N/a,N/a,N/a,Yes,True
19,1020,Anakin,Skywalker,876|678|3469,"910 Tatooine Road, Tatooine",Yes,N,True


In [4]:
#Remove the columns that we do not need
df = df.drop(columns = "Not_Useful_Column")
df.head()

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes
2,1003,Walter,/White,7066950392,298 Drugs Driveway,N,
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y
4,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No


In [5]:
#Clean up the Last_name Column 
df['Last_Name'] = df['Last_Name'].str.lstrip("...")

In [6]:
df['Last_Name'] = df['Last_Name'].str.lstrip("/")

In [7]:
df['Last_Name'] = df['Last_Name'].str.rstrip("_")

In [8]:
df['Last_Name']

0        Baggins
1          Nadir
2          White
3        Schrute
4           Snow
5        Swanson
6         Winger
7         Holmes
8            NaN
9         Parker
10        Gamgee
11        Potter
12        Draper
13         Knope
14    Flenderson
15       Weasley
16         Scott
17          Kent
18        Braton
19     Skywalker
Name: Last_Name, dtype: object

In [9]:
#Remove all of them together
df['Last_Name'] = df['Last_Name'].str.strip("123,._/")

In [13]:
#Cleaning phone number to look nornmal    FORMAT # 123-456-7890
df["Phone_Number"] = df["Phone_Number"].str.replace('[^a-zA-Z0-9]',' ')
df.head()

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact
0,1001,Frodo,Baggins,123 545 5421,"123 Shire Lane, Shire",Yes,No
1,1002,Abed,Nadir,123 643 9775,93 West Main Street,No,Yes
2,1003,Walter,White,7066950392,298 Drugs Driveway,N,
3,1004,Dwight,Schrute,123 543 2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y
4,1005,Jon,Snow,876 678 3469,123 Dragons Road,Y,No


In [14]:
df['Phone_Number'] = df['Phone_Number'].apply(lambda x: str(x))

In [15]:
df['Phone_Number'].apply(lambda x: x[0 : 3] +'-' + x[3: 6] +'-' + x[6 : 10])

0     123- 54-5 54
1     123- 64-3 97
2     706-695-0392
3     123- 54-3 23
4     876- 67-8 34
5     304- 76-2 24
6            nan--
7     876- 67-8 34
8            N a--
9     123- 54-5 54
10           nan--
11    706-695-0392
12    123- 54-3 23
13    876- 67-8 34
14    304- 76-2 24
15    123- 54-5 54
16    123- 64-3 97
17    706-695-0392
18           N a--
19    876- 67-8 34
Name: Phone_Number, dtype: object