# REGEX
Regex (Regular Expression) is a sequence of characters that specifies a search pattern in text. It can be
used to find and replace irregular text in data. Here, we will learn how to apply regex using pandas.Series.str.contains() and
pandas.Series.str.replace().

Check the documentation here : 
- https://pandas.pydata.org/docs/reference/api/pandas.Series.str.contains.html
- https://pandas.pydata.org/docs/reference/api/pandas.Series.str.replace.html

# REGEX on Pandas

In [6]:
# pandas.Series.str.contains() tests every value of a column against a regex pattern
import pandas as pd
data = pd.read_csv("https://storage.googleapis.com/dqlab-dataset/dqlabregex.tsv", sep='\t')

# Flag the 'kota' values that start with j or s (case-insensitive).
# The group is non-capturing, (?:...), because str.contains() only reports whether
# the pattern matches; a capturing group makes pandas emit a "has match groups" UserWarning.
data["kota_with_prefix_j_s"] = data['kota'].str.contains(r'^(?:j|s)', case=False)
data[['kota', 'kota_with_prefix_j_s']]

  data["kota_with_prefix_j_s"] = data['kota'].str.contains('^(j|s)',case=False)


Unnamed: 0,kota,kota_with_prefix_j_s
0,Jakarta,True
1,Jakarta,True
2,Bandung,False
3,Bandung,False
4,Semarang,True
5,Semarang,True


In [8]:
# Flag the staff whose name is written as 'Senja', 'Sendja', 'Sen_ja' or a similar variant:
# the pattern is 'Sen' and 'ja' with at most one arbitrary character in between
senja_pattern = 'Sen.?ja'
staff_names = data['staf_pencatat']
data['check_senja'] = staff_names.str.contains(senja_pattern)
data

Unnamed: 0,no_pencatatan,tanggal_catat,kota,jumlah_member,staf_pencatat,kota_with_prefix_j_s,check_senja
0,1,01-05-2020,Jakarta,311,Andra,True,False
1,2,30-06-2020,Jakarta,1I2,Andra,True,False
2,3,05/02/2020,Bandung,5S0,Antara,False,False
3,4,06/28/2020,Bandung,670,Antara,False,False
4,5,05/10/2020,Semarang,81O,Senja,True,True
5,6,06/28/2020,Semarang,1O2,Sendja,True,True


# REPLACE

In [8]:
# Now that we know how to locate a text pattern with regex, we will learn how to replace it.
# Here, we will use pandas.Series.str.replace()
import pandas as pd
data = pd.read_csv("https://storage.googleapis.com/dqlab-dataset/dqlabregex.tsv", sep='\t')
# Locate the 'staf_pencatat' values written as Sen_ja, Sendja, or any similar variant
# and replace them with 'Senja'.
# regex=True is passed explicitly: since pandas 2.0 str.replace() defaults to regex=False,
# which would treat "Sen.?ja" as a literal string and replace nothing.
data['staf_pencatat'] = data['staf_pencatat'].str.replace(r"Sen.?ja", "Senja", regex=True)
data["staf_pencatat"]

  data['staf_pencatat'] = data['staf_pencatat'].str.replace("Sen.?ja","Senja")


0     Andra
1     Andra
2    Antara
3    Antara
4     Senja
5     Senja
Name: staf_pencatat, dtype: object

In [4]:
# Now, we will repair the non-numerical characters in the 'jumlah_member' column
# by changing O to 0, I to 1, and S to 5 (in upper or lower case)
mapchanges = {"O": "0", "I": "1", "S": "5"}
data['jumlah_member_clean'] = data['jumlah_member']
for pattern, replacement in mapchanges.items():
    # case=False is only accepted together with regex=True; with the pandas >= 2.0
    # default (regex=False) this call raises a ValueError.
    data["jumlah_member_clean"] = data["jumlah_member_clean"].str.replace(
        pattern, replacement, case=False, regex=True
    )
print(data[['jumlah_member','jumlah_member_clean']])

  jumlah_member jumlah_member_clean
0           311                 311
1           1I2                 112
2           5S0                 550
3           670                 670
4           81O                 810
5           1O2                 102


In [5]:
# What if we just want to delete the non-numerical characters in 'jumlah_member'?
# regex=True is required so that "[^0-9]" is read as "any character that is not a digit";
# with the pandas >= 2.0 default (regex=False) it would be searched for as literal text.
data['jumlah_member'] = data['jumlah_member'].str.replace(r"[^0-9]", "", regex=True)
print(data[['jumlah_member']])

  jumlah_member
0           311
1            12
2            50
3           670
4            81
5            12


  data['jumlah_member'] = data['jumlah_member'].str.replace("[^0-9]","")


In [9]:
# If we look at 'tanggal_catat', we will find two date formats: DD-MM-YYYY and MM/DD/YYYY
# Our purpose is to change DD-MM-YYYY to MM/DD/YYYY
# The three groups capture day, month and year; \2/\1/\3 writes them back as month/day/year.
# regex=True is required for the groups and back-references to be interpreted
# (pandas >= 2.0 defaults to a literal, non-regex replacement).
data['tanggal_catat'] = data['tanggal_catat'].str.replace(
    r"([0-9]{2})-([0-9]{2})-([0-9]{4})", r"\2/\1/\3", regex=True
)
print(data[['tanggal_catat']])

  tanggal_catat
0    05/01/2020
1    06/30/2020
2    05/02/2020
3    06/28/2020
4    05/10/2020
5    06/28/2020


  data['tanggal_catat'] = data['tanggal_catat'].str.replace("([0-9]{2})-([0-9]{2})-([0-9]{4})","\\2/\\1/\\3")


# Time to Take a Mini Project !

In [17]:
# Mini Project
'''
Given data, there are our purposes :
- change the datetime format of 'tanggal_catat' to YYYY-MM-DD and then change its type to pandas datetime
- delete the non-numerical text in 'jumlah_member' and then change its type to int
- change the Sendja, Sen-ja, Sen_ja, or any other equivalent patterns in 'staf_pencatat' to Senja
'''
import pandas as pd
data = pd.read_csv("https://storage.googleapis.com/dqlab-dataset/dqlabregex.tsv",sep="\t")
print("Uncleaned Data :\n",data)

# Every str.replace() below passes regex=True explicitly: since pandas 2.0 the default is
# regex=False, which would treat the patterns as literal text and leave the data unchanged.

#1 Rewrite both source formats as YYYY/MM/DD, then parse that single format.
date_rewrites = {
    r"([0-9]{2})-([0-9]{2})-([0-9]{4})": r"\3/\2/\1",  # DD-MM-YYYY -> YYYY/MM/DD
    r"([0-9]{2})/([0-9]{2})/([0-9]{4})": r"\3/\1/\2",  # MM/DD/YYYY -> YYYY/MM/DD
}
for pattern, replacement in date_rewrites.items():
    data["tanggal_catat"] = data["tanggal_catat"].str.replace(pattern, replacement, regex=True)
# The format is stated explicitly so that parsing does not depend on format inference.
data["tanggal_catat"] = pd.to_datetime(data["tanggal_catat"], format="%Y/%m/%d")

#2 Drop every character that is not a digit, then convert the column to int.
data["jumlah_member"] = data["jumlah_member"].str.replace(r"[^0-9]", "", regex=True)
data["jumlah_member"] = data["jumlah_member"].astype(int)

#3 Normalise 'Sen' + at most one arbitrary character + 'ja' to 'Senja'.
data["staf_pencatat"] = data["staf_pencatat"].str.replace(r"Sen.?ja", "Senja", regex=True)

print("\n")
print("Cleaned Data :\n",data)

Uncleaned Data :
    no_pencatatan tanggal_catat      kota jumlah_member staf_pencatat
0              1    01-05-2020   Jakarta           311         Andra
1              2    30-06-2020   Jakarta           1I2         Andra
2              3    05/02/2020   Bandung           5S0        Antara
3              4    06/28/2020   Bandung           670        Antara
4              5    05/10/2020  Semarang           81O         Senja
5              6    06/28/2020  Semarang           1O2        Sendja


Cleaned Data :
    no_pencatatan tanggal_catat      kota  jumlah_member staf_pencatat
0              1    2020-05-01   Jakarta            311         Andra
1              2    2020-06-30   Jakarta             12         Andra
2              3    2020-05-02   Bandung             50        Antara
3              4    2020-06-28   Bandung            670        Antara
4              5    2020-05-10  Semarang             81         Senja
5              6    2020-06-28  Semarang             12         Senja

  data["tanggal_catat"] = data["tanggal_catat"].str.replace(modify, modifier)
  data["jumlah_member"] = data["jumlah_member"].str.replace("[^0-9]","")
  data["staf_pencatat"] = data["staf_pencatat"].str.replace("Sen.?ja","Senja")


Learning Source : www.dqlab.id