# String Operations on Columns 
- used to fix dirty string columns
- clean, parse, extract, split, and manipulate text
- needed for feature engineering before an ML pipeline

In [1]:
import pandas as pd

# Sample records with deliberately untidy text: padded and mixed-case names,
# and mixed-case cities and comments.
name_values = [' Alice ', 'bob', 'CHARLIE', 'DaViD']
email_values = ['alice@example.com', 'bob@gmail.com', 'charlie@outlook.com', 'david@yahoo.com']
city_values = ['Pune', 'delhi', 'MUMBAI', 'Delhi']
comment_values = ['Great!', 'needs improvement', 'EXCELLENT', 'ok']

# Column order follows the keyword order below.
data = dict(
    Name=name_values,
    Email=email_values,
    City=city_values,
    Comments=comment_values,
)

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Email,City,Comments
0,Alice,alice@example.com,Pune,Great!
1,bob,bob@gmail.com,delhi,needs improvement
2,CHARLIE,charlie@outlook.com,MUMBAI,EXCELLENT
3,DaViD,david@yahoo.com,Delhi,ok


# .str
- works on text columns (`object` or `string` dtype)
- avoids writing slow explicit Python loops
- efficient, vectorized string operations

In [2]:
# Upper-case every name through the .str accessor; the result is a new Series
# that is only displayed here, df itself is not modified.
name_column = df['Name']
name_column.str.upper()

0     ALICE 
1        BOB
2    CHARLIE
3      DAVID
Name: Name, dtype: object

## Operations

In [6]:
# change case

# Grab the .str accessor once and add one new column per casing style.
name_text = df['Name'].str
df['Name_upper'] = name_text.upper()
df['Name_lower'] = name_text.lower()
df['Name_title'] = name_text.title()
df

Unnamed: 0,Name,Email,City,Comments,Name_upper,Name_lower,Name_title
0,Alice,alice@example.com,Pune,Great!,ALICE,alice,Alice
1,bob,bob@gmail.com,delhi,needs improvement,BOB,bob,Bob
2,CHARLIE,charlie@outlook.com,MUMBAI,EXCELLENT,CHARLIE,charlie,Charlie
3,DaViD,david@yahoo.com,Delhi,ok,DAVID,david,David


In [None]:
# remove whitespace

# .str.strip() trims both ends of each value.
# Use .str.lstrip() to trim only leading whitespace,
# or .str.rstrip() to trim only trailing whitespace.
raw_names = df['Name']
df['Name_clean'] = raw_names.str.strip()
df

Unnamed: 0,Name,Email,City,Comments,Name_upper,Name_lower,Name_title,Name_clean
0,Alice,alice@example.com,Pune,Great!,ALICE,alice,Alice,Alice
1,bob,bob@gmail.com,delhi,needs improvement,BOB,bob,Bob,bob
2,CHARLIE,charlie@outlook.com,MUMBAI,EXCELLENT,CHARLIE,charlie,Charlie,CHARLIE
3,DaViD,david@yahoo.com,Delhi,ok,DAVID,david,David,DaViD


In [9]:
# length of string

# Number of characters in each already-stripped name.
cleaned_names = df['Name_clean']
df['Name_length'] = cleaned_names.str.len()
df

Unnamed: 0,Name,Email,City,Comments,Name_upper,Name_lower,Name_title,Name_clean,Name_length
0,Alice,alice@example.com,Pune,Great!,ALICE,alice,Alice,Alice,5
1,bob,bob@gmail.com,delhi,needs improvement,BOB,bob,Bob,bob,3
2,CHARLIE,charlie@outlook.com,MUMBAI,EXCELLENT,CHARLIE,charlie,Charlie,CHARLIE,7
3,DaViD,david@yahoo.com,Delhi,ok,DAVID,david,David,DaViD,5


In [13]:
# replace substring

# Case-insensitive replacement: any casing of 'delhi' becomes 'Delhi'.
# regex=True is spelled out because the default of `regex` changed from True
# to False in pandas 2.0; being explicit keeps the behaviour the same on
# either side of that change.
df['City_clean'] = df['City'].str.replace('delhi', 'Delhi', case=False, regex=True)

# regex=False treats the pattern as a plain literal string, so '!' is matched
# exactly as written and no regex interpretation takes place.
df['Comments_clean'] = df['Comments'].str.replace('!', '.', regex=False)

# NOTE: replacing '.' with '!' on Comments would change nothing here simply
# because none of the comments contains a '.'. Pass regex=False whenever a
# pattern such as '.' must be matched literally.
df

Unnamed: 0,Name,Email,City,Comments,Name_upper,Name_lower,Name_title,Name_clean,Name_length,City_clean,Comments_clean
0,Alice,alice@example.com,Pune,Great!,ALICE,alice,Alice,Alice,5,Pune,Great.
1,bob,bob@gmail.com,delhi,needs improvement,BOB,bob,Bob,bob,3,Delhi,needs improvement
2,CHARLIE,charlie@outlook.com,MUMBAI,EXCELLENT,CHARLIE,charlie,Charlie,CHARLIE,7,MUMBAI,EXCELLENT
3,DaViD,david@yahoo.com,Delhi,ok,DAVID,david,David,DaViD,5,Delhi,ok


In [None]:
# check substr exists

# Boolean flag per row: True when the e-mail contains 'gmail'.
# case=False makes the search case-insensitive.
email_col = df['Email']
df['has_gmail'] = email_col.str.contains('gmail', case=False)
df

Unnamed: 0,Name,Email,City,Comments,Name_upper,Name_lower,Name_title,Name_clean,Name_length,City_clean,Comments_clean,has_gmail
0,Alice,alice@example.com,Pune,Great!,ALICE,alice,Alice,Alice,5,Pune,Great.,False
1,bob,bob@gmail.com,delhi,needs improvement,BOB,bob,Bob,bob,3,Delhi,needs improvement,True
2,CHARLIE,charlie@outlook.com,MUMBAI,EXCELLENT,CHARLIE,charlie,Charlie,CHARLIE,7,MUMBAI,EXCELLENT,False
3,DaViD,david@yahoo.com,Delhi,ok,DAVID,david,David,DaViD,5,Delhi,ok,False


In [19]:
# extract substr with regex and extract()

# The single capture group keeps everything that follows the '@'.
domain_pattern = '@(.*)'
df['Email_domain'] = df['Email'].str.extract(domain_pattern)
df

Unnamed: 0,Name,Email,City,Comments,Name_upper,Name_lower,Name_title,Name_clean,Name_length,City_clean,Comments_clean,has_gmail,Email_domain
0,Alice,alice@example.com,Pune,Great!,ALICE,alice,Alice,Alice,5,Pune,Great.,False,example.com
1,bob,bob@gmail.com,delhi,needs improvement,BOB,bob,Bob,bob,3,Delhi,needs improvement,True,gmail.com
2,CHARLIE,charlie@outlook.com,MUMBAI,EXCELLENT,CHARLIE,charlie,Charlie,CHARLIE,7,MUMBAI,EXCELLENT,False,outlook.com
3,DaViD,david@yahoo.com,Delhi,ok,DAVID,david,David,DaViD,5,Delhi,ok,False,yahoo.com


In [21]:
# splitting

# Split each address on '@' once and reuse the result: the list of pieces
# (before and after the '@') goes into Email_split, and the first piece
# (the user part) goes into Email_user.
email_parts = df['Email'].str.split('@')
df['Email_split'] = email_parts
df['Email_user'] = email_parts.str[0]

df

Unnamed: 0,Name,Email,City,Comments,Name_upper,Name_lower,Name_title,Name_clean,Name_length,City_clean,Comments_clean,has_gmail,Email_domain,Email_split,Email_user
0,Alice,alice@example.com,Pune,Great!,ALICE,alice,Alice,Alice,5,Pune,Great.,False,example.com,"[alice, example.com]",alice
1,bob,bob@gmail.com,delhi,needs improvement,BOB,bob,Bob,bob,3,Delhi,needs improvement,True,gmail.com,"[bob, gmail.com]",bob
2,CHARLIE,charlie@outlook.com,MUMBAI,EXCELLENT,CHARLIE,charlie,Charlie,CHARLIE,7,MUMBAI,EXCELLENT,False,outlook.com,"[charlie, outlook.com]",charlie
3,DaViD,david@yahoo.com,Delhi,ok,DAVID,david,David,DaViD,5,Delhi,ok,False,yahoo.com,"[david, yahoo.com]",david


In [22]:
# replace multiple patterns (ADVANCED CLEANING)

# Apply the cleaners one step at a time: trim whitespace, lower-case, then
# title-case. The result overwrites the City_clean column created earlier.
city_text = df['City'].str.strip()
city_text = city_text.str.lower()
df['City_clean'] = city_text.str.title()
df

Unnamed: 0,Name,Email,City,Comments,Name_upper,Name_lower,Name_title,Name_clean,Name_length,City_clean,Comments_clean,has_gmail,Email_domain,Email_split,Email_user
0,Alice,alice@example.com,Pune,Great!,ALICE,alice,Alice,Alice,5,Pune,Great.,False,example.com,"[alice, example.com]",alice
1,bob,bob@gmail.com,delhi,needs improvement,BOB,bob,Bob,bob,3,Delhi,needs improvement,True,gmail.com,"[bob, gmail.com]",bob
2,CHARLIE,charlie@outlook.com,MUMBAI,EXCELLENT,CHARLIE,charlie,Charlie,CHARLIE,7,Mumbai,EXCELLENT,False,outlook.com,"[charlie, outlook.com]",charlie
3,DaViD,david@yahoo.com,Delhi,ok,DAVID,david,David,DaViD,5,Delhi,ok,False,yahoo.com,"[david, yahoo.com]",david


In [23]:
# handle missing val

# .str methods skip NaN entries: a missing comment stays NaN rather than raising.
comment_col = df['Comments']
df['Comments_upper'] = comment_col.str.upper()

# Afterwards, replace any missing comments with an empty string.
df['Comments'] = comment_col.fillna('')
df

Unnamed: 0,Name,Email,City,Comments,Name_upper,Name_lower,Name_title,Name_clean,Name_length,City_clean,Comments_clean,has_gmail,Email_domain,Email_split,Email_user,Comments_upper
0,Alice,alice@example.com,Pune,Great!,ALICE,alice,Alice,Alice,5,Pune,Great.,False,example.com,"[alice, example.com]",alice,GREAT!
1,bob,bob@gmail.com,delhi,needs improvement,BOB,bob,Bob,bob,3,Delhi,needs improvement,True,gmail.com,"[bob, gmail.com]",bob,NEEDS IMPROVEMENT
2,CHARLIE,charlie@outlook.com,MUMBAI,EXCELLENT,CHARLIE,charlie,Charlie,CHARLIE,7,Mumbai,EXCELLENT,False,outlook.com,"[charlie, outlook.com]",charlie,EXCELLENT
3,DaViD,david@yahoo.com,Delhi,ok,DAVID,david,David,DaViD,5,Delhi,ok,False,yahoo.com,"[david, yahoo.com]",david,OK


| Task                | Method                                           |
| ------------------- | ------------------------------------------------ |
| Change case         | `.str.upper()`, `.str.lower()`, `.str.title()`   |
| Remove whitespace   | `.str.strip()`, `.str.lstrip()`, `.str.rstrip()` |
| Length of strings   | `.str.len()`                                     |
| Replace substrings  | `.str.replace()`                                 |
| Check substring     | `.str.contains()`                                |
| Extract using regex | `.str.extract()`                                 |
| Split strings       | `.str.split()`                                   |