# Regular Expressions in Pandas

In [30]:
import os

import pandas as pd
import regex as re

pandas provides several string methods that support pattern matching with regular expressions (regex).

Below are three major functions:

Series.str.contains(pattern) - This function checks each value in the column for the pattern and returns a boolean Series (a mask) that is True where the pattern matches and False otherwise. The mask can then be used to index the data frame so that only the matching (True) rows are returned.

Series.str.extract(pattern,flags,expand) - To use this function we must define capture groups using parentheses inside the pattern. The function extracts the first match in each value and returns the groups as columns of a dataframe. When you have only one group in the pattern, use expand=False to return a series instead of a dataframe object.

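Series.str.replace(pattern,repl,flags) - Similar to re.sub(). This function replaces the matches with the repl string (or with the result of calling repl on each match object, if repl is a callable). Pass regex=True so that the pattern is treated as a regular expression rather than a literal string.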

Series.str - the accessor used to treat the values of the series as strings and apply string methods to them.

Series.str.contains(pat,case=True,flags=0,na=nan,regex=True)

In [31]:
# Toy events table used to demonstrate Series.str.contains.
# "new Delhi" deliberately starts with a lowercase "n" so that the `case`
# flag of str.contains has a visible effect.
df = pd.DataFrame(
    data={
        "city": ["New York", "Parague", "new Delhi", "Venice", "New Orleans"],
        "Event": ["Music", "Poetry", "Theatre", "Comedy", "Tech Summit"],
        "cost": [10000, 5000, 20000, 15000, 2000],
    }
)
df

Unnamed: 0,city,Event,cost
0,New York,Music,10000
1,Parague,Poetry,5000
2,new Delhi,Theatre,20000
3,Venice,Comedy,15000
4,New Orleans,Tech Summit,2000


In [32]:
# Case-insensitive: flag every city whose name starts with "N" or "n".
df["city"].str.contains(pat="^N.*", case=False)

0     True
1    False
2     True
3    False
4     True
Name: city, dtype: bool

In [33]:
# Same pattern, case-sensitive this time: only a capital "N" at the start matches.
df["city"].str.contains(pat="^N.*", case=True)

0     True
1    False
2    False
3    False
4     True
Name: city, dtype: bool

In [34]:
# Build the case-insensitive boolean mask first, then use it to keep only
# the rows whose city starts with "N"/"n".
city_starts_with_n = df["city"].str.contains("^N.*", case=False)
df[city_starts_with_n]

Unnamed: 0,city,Event,cost
0,New York,Music,10000
2,new Delhi,Theatre,20000
4,New Orleans,Tech Summit,2000


In [35]:
# Load the Titanic training set. From here on `df` refers to this data set
# (it replaces the small demo frame built earlier).
#
# The file location can be overridden with the TITANIC_TRAIN_CSV environment
# variable so the notebook is not tied to a single machine; the original
# absolute path is kept as the fallback for backward compatibility.
titanic_csv = os.environ.get(
    "TITANIC_TRAIN_CSV",
    r"C:\Users\DELL\OneDrive\Documents\PG Program in Data Science Machine Learning and Neural Network\Internship\Datasets\titanic_train.csv",
)
df = pd.read_csv(titanic_csv)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [36]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(891, 12)

In [37]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [38]:
# Count the missing values in each column (sum the null flags down the rows).
df.isnull().sum(axis=0)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [39]:
# "C", an optional ".", "A", an optional "." -- covers spellings such as
# "CA", "CA." and "C.A.". The pattern is unanchored, so it may match
# anywhere inside the ticket string.
pattern = r"C\.?A\.?"

# Boolean Series: True for every ticket that contains the pattern.
mask = df["Ticket"].str.contains(pat=pattern)
mask

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Ticket, Length: 891, dtype: bool

In [40]:
# Boolean indexing: keep only the rows for which `mask` is True.
df[mask]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
33,34,0,2,"Wheadon, Mr. Edward H",male,66.0,0,0,C.A. 24579,10.5,,S
56,57,1,2,"Rugg, Miss. Emily",female,21.0,0,0,C.A. 31026,10.5,,S
58,59,1,2,"West, Miss. Constance Mirium",female,5.0,1,2,C.A. 34651,27.75,,S
59,60,0,3,"Goodwin, Master. William Frederick",male,11.0,5,2,CA 2144,46.9,,S
66,67,1,2,"Nye, Mrs. (Elizabeth Ramell)",female,29.0,0,0,C.A. 29395,10.5,F33,S
70,71,0,2,"Jenkin, Mr. Stephen Curnow",male,32.0,0,0,C.A. 33111,10.5,,S
71,72,0,3,"Goodwin, Miss. Lillian Amy",female,16.0,5,2,CA 2144,46.9,,S
93,94,0,3,"Dean, Mr. Bertram Frank",male,26.0,1,2,C.A. 2315,20.575,,S
134,135,0,2,"Sobey, Mr. Samuel James Hayden",male,25.0,0,0,C.A. 29178,13.0,,S
145,146,0,2,"Nicholls, Mr. Joseph Charles",male,19.0,1,1,C.A. 33112,36.75,,S


In [41]:
# Peek at the first five raw passenger names before extracting anything from them.
df["Name"].head(n=5)

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object

In [42]:
# Raw string: "\s", "\w" and "\." are regex escapes, not Python string
# escapes. Without the r-prefix Python emits an invalid-escape-sequence
# warning for this literal.
# Pattern: one whitespace character, a captured run of word characters, then
# a literal dot -- i.e. the "Mr" in "Braund, Mr. Owen Harris".
pattern = r"\s(\w+)\."

# The pattern has a single capture group, so expand=False yields a Series.
# NOTE: despite its name, `mask` holds the extracted titles here, not booleans.
mask = df["Name"].str.extract(pattern, expand=False)

# Frequency of each extracted title, most common first.
cnt = mask.value_counts()
cnt

Name
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Countess      1
Capt          1
Ms            1
Sir           1
Lady          1
Mme           1
Don           1
Jonkheer      1
Name: count, dtype: int64

In [43]:
# Raw string so that "\s", "\w" and "\." reach the regex engine unchanged and
# Python does not warn about invalid escape sequences.
pattern = r"\s(\w+)\."

# Replace each match with its upper-cased form. m.group() is the whole match
# (leading whitespace, title and trailing dot), not only the captured group.
# regex=True is required for the pattern to be interpreted as a regex.
df["Name"].str.replace(pattern, lambda m: m.group().upper(), regex=True)

0                                Braund, MR. Owen Harris
1      Cumings, MRS. John Bradley (Florence Briggs Th...
2                                 Heikkinen, MISS. Laina
3           Futrelle, MRS. Jacques Heath (Lily May Peel)
4                               Allen, MR. William Henry
                             ...                        
886                                Montvila, REV. Juozas
887                         Graham, MISS. Margaret Edith
888             Johnston, MISS. Catherine Helen "Carrie"
889                                Behr, MR. Karl Howell
890                                  Dooley, MR. Patrick
Name: Name, Length: 891, dtype: object