# Chapter 21 - Working with text columns

### Revisiting Python strings

In [3]:
message = 'hello, python for accounting'

# upper() returns a new, upper-cased string; `message` itself is left unchanged
message.upper()

'HELLO, PYTHON FOR ACCOUNTING'

In [4]:
# Inspect the variable: it still holds the original lower-case text
message

'hello, python for accounting'

Calling a string method does not modify the original string in place. To keep the result, assign it back to the variable:

In [6]:
# Rebind the name to the upper-cased copy so the change is kept
message = message.upper()
message

'HELLO, PYTHON FOR ACCOUNTING'

In [7]:
# Slice out the first five characters
message[:5]

'HELLO'

In [8]:
# Substring test with `in`; the match is case-sensitive and message is upper-case here
'PYTHON' in message

True

In [9]:
# 'pandas' does not occur in message, so this evaluates to False
'pandas' in message

False

In [10]:
 'hello' + ', ' + 'python ' + 'for ' + 'accounting'

'hello, python for accounting'

In [11]:
 first_name = "Margaret"
last_name = "Hamilton"
age = 83

f'{first_name} {last_name} is {age} years old!'

'Margaret Hamilton is 83 years old!'

### String methods in pandas

In [13]:
import pandas as pd

In [14]:
# Load the Q1 sales ledger (Q1Sales.csv) from the pythonforaccounting GitHub repository
url = "https://raw.githubusercontent.com/pythonforaccounting/workspace/refs/heads/main/P2%20-%20Working%20with%20tables/Q1Sales.csv"
ledger_df = pd.read_csv(url)

# Preview the first five rows
ledger_df.head()

Unnamed: 0,InvoiceNo,Channel,Product Name,ProductID,Account,AccountNo,Date,Deadline,Currency,Unit Price,Quantity,Total
0,1532,Shoppe.com,Cannon Water Bomb Balloons 100 Pack,T&G/CAN-97509,Sales,5004,2020-01-01,11/23/19,USD,20.11,14,281.54
1,1533,Walcart,LEGO Ninja Turtles Stealth Shell in Pursuit 79102,T&G/LEG-37777,Sales,5004,2020-01-01,06/15/20,USD,6.7,1,6.7
2,1534,Bullseye,,T&G/PET-14209,Sales,5004,2020-01-01,05/07/20,USD,11.67,5,58.35
3,1535,Bullseye,Transformers Age of Extinction Generations Del...,T&G/TRA-20170,Sales,5004,2020-01-01,12/22/19,USD,13.46,6,80.76
4,1535,Bullseye,Transformers Age of Extinction Generations Del...,T&G/TRA-20170,Sales,5004,2020-01-01,12/22/19,USD,13.46,6,80.76


In [15]:
# Select the 'Channel' column as a Series
ledger_df['Channel']

0            Shoppe.com
1               Walcart
2              Bullseye
3              Bullseye
4              Bullseye
              ...      
37703          iBay.com
37704        Shoppe.com
37705        Shoppe.com
37706        Shoppe.com
37707    Understock.com
Name: Channel, Length: 37708, dtype: object

In [16]:
# `.str` is an accessor attribute of the Series (not a Python keyword); string
# methods called through it are applied to every value in the column.
ledger_df['Channel'].str.upper()

0            SHOPPE.COM
1               WALCART
2              BULLSEYE
3              BULLSEYE
4              BULLSEYE
              ...      
37703          IBAY.COM
37704        SHOPPE.COM
37705        SHOPPE.COM
37706        SHOPPE.COM
37707    UNDERSTOCK.COM
Name: Channel, Length: 37708, dtype: object

In [17]:
# Select the 'Product Name' column; rows with no product name show as NaN
ledger_df['Product Name']

0                      Cannon Water Bomb Balloons 100 Pack
1        LEGO Ninja Turtles Stealth Shell in Pursuit 79102
2                                                      NaN
3        Transformers Age of Extinction Generations Del...
4        Transformers Age of Extinction Generations Del...
                               ...                        
37703    Nature's Bounty Garlic, 2000mg, Odor-Free, 120...
37704                        Funko Wonder Woman POP Heroes
37705    MONO GS1 GS1-BTY-BLK-L Betty Long Guitar Strap...
37706                                                  NaN
37707             3 Collapsible Bowl Set 32oz | 16oz | 4oz
Name: Product Name, Length: 37708, dtype: object

In [18]:
# Upper-case every product name; missing values (NaN) stay NaN
ledger_df['Product Name'].str.upper()

0                      CANNON WATER BOMB BALLOONS 100 PACK
1        LEGO NINJA TURTLES STEALTH SHELL IN PURSUIT 79102
2                                                      NaN
3        TRANSFORMERS AGE OF EXTINCTION GENERATIONS DEL...
4        TRANSFORMERS AGE OF EXTINCTION GENERATIONS DEL...
                               ...                        
37703    NATURE'S BOUNTY GARLIC, 2000MG, ODOR-FREE, 120...
37704                        FUNKO WONDER WOMAN POP HEROES
37705    MONO GS1 GS1-BTY-BLK-L BETTY LONG GUITAR STRAP...
37706                                                  NaN
37707             3 COLLAPSIBLE BOWL SET 32OZ | 16OZ | 4OZ
Name: Product Name, Length: 37708, dtype: object

For instance, if you want to check whether values in the
'Product Name' column contain the term 'LEGO', you can run:

In [20]:
# Check, row by row, whether 'Product Name' contains the string 'LEGO'
# (rows with a missing product name yield NaN rather than True/False)
ledger_df['Product Name'].str.contains('LEGO')

0        False
1         True
2          NaN
3        False
4        False
         ...  
37703    False
37704    False
37705    False
37706      NaN
37707    False
Name: Product Name, Length: 37708, dtype: object

In [21]:
# Keep only the rows whose 'Product Name' contains 'LEGO'. na=False makes rows with a
# missing name count as False, so the result can be used directly as a boolean filter.
ledger_df[ledger_df['Product Name'].str.contains('LEGO', na=False)]

Unnamed: 0,InvoiceNo,Channel,Product Name,ProductID,Account,AccountNo,Date,Deadline,Currency,Unit Price,Quantity,Total
1,1533,Walcart,LEGO Ninja Turtles Stealth Shell in Pursuit 79102,T&G/LEG-37777,Sales,5004,2020-01-01,06/15/20,USD,6.70,1,6.70
43,1575,iBay.com,LEGO Star Wars Clone Troopers vs Droidekas 75000,T&G/LEG-16040,Sales,5004,2020-01-01,3-22-20,USD,14.68,2,29.36
105,1637,Bullseye,LEGO LOTR 79006 The Council of Elrond,T&G/LEG-76682,Sales,5004,2020-01-01,2-19-20,USD,7.67,6,46.02
176,1708,Shoppe.com,LEGO City Fire Chief Car 60001,T&G/LEG-89613,Sales,5004,2020-01-01,3-28-20,USD,24.95,1,24.95
228,1608,iBay.com,LEGO Star Wars Clone Troopers vs Droidekas 75000,T&G/LEG-16040,Sales,5004,2020-01-01,3-22-20,USD,14.68,2,29.36
...,...,...,...,...,...,...,...,...,...,...,...,...
37479,39011,Understock.com,LEGO Star Wars Clone Troopers vs Droidekas 75000,T&G/LEG-16040,Sales,5004,2020-03-30,5-15-20,USD,14.84,1,14.84
37487,38998,Understock.com,DUPLO LEGO Ville 10525 Big Farm,T&G/DUP-28439,Sales,5004,2020-03-30,5-22-20,USD,2.79,6,16.74
37526,39058,Shoppe.com,Belkin LEGO Case / Shield for iPhone 5 and 5S ...,CP&A/BEL-67900,Sales,5004,2020-03-31,Sat Mar 28 00:00:00 2020,USD,39.51,1,39.51
37534,39066,iBay.com,LEGO Disney Princess 41055 Cinderella's Romant...,T&G/LEG-92125,Sales,5004,2020-03-31,6-02-20,USD,9.02,29,261.58


### Replacing parts of text

In [23]:
# Look at the 'Channel' values again before replacing parts of them
ledger_df['Channel']

0            Shoppe.com
1               Walcart
2              Bullseye
3              Bullseye
4              Bullseye
              ...      
37703          iBay.com
37704        Shoppe.com
37705        Shoppe.com
37706        Shoppe.com
37707    Understock.com
Name: Channel, Length: 37708, dtype: object

In [24]:
# Strip the literal '.com' suffix from the channel names. regex=False makes the
# pattern a plain substring; interpreted as a regular expression, '.' would match
# any character (and regex matching was the default in pandas versions before 2.0).
ledger_df['Channel'].str.replace('.com', '', regex=False)

0            Shoppe
1           Walcart
2          Bullseye
3          Bullseye
4          Bullseye
            ...    
37703          iBay
37704        Shoppe
37705        Shoppe
37706        Shoppe
37707    Understock
Name: Channel, Length: 37708, dtype: object

In [25]:
# Series.replace (without .str) swaps whole values: only entries equal to 'iBay.com' change
ledger_df['Channel'].replace('iBay.com', 'Anazon.com')

0            Shoppe.com
1               Walcart
2              Bullseye
3              Bullseye
4              Bullseye
              ...      
37703        Anazon.com
37704        Shoppe.com
37705        Shoppe.com
37706        Shoppe.com
37707    Understock.com
Name: Channel, Length: 37708, dtype: object

### Splitting text values into multiple columns

In [27]:
 ledger_df[['Product Name', 'ProductID', 'Unit Price', 'Quantity', 'Total']]

Unnamed: 0,Product Name,ProductID,Unit Price,Quantity,Total
0,Cannon Water Bomb Balloons 100 Pack,T&G/CAN-97509,20.11,14,281.54
1,LEGO Ninja Turtles Stealth Shell in Pursuit 79102,T&G/LEG-37777,6.70,1,6.70
2,,T&G/PET-14209,11.67,5,58.35
3,Transformers Age of Extinction Generations Del...,T&G/TRA-20170,13.46,6,80.76
4,Transformers Age of Extinction Generations Del...,T&G/TRA-20170,13.46,6,80.76
...,...,...,...,...,...
37703,"Nature's Bounty Garlic, 2000mg, Odor-Free, 120...",H&PC/NAT-15470,5.55,2,11.10
37704,Funko Wonder Woman POP Heroes,T&G/FUN-03366,28.56,1,28.56
37705,MONO GS1 GS1-BTY-BLK-L Betty Long Guitar Strap...,MI/MON-86723,3.33,1,3.33
37706,,T&G/MAG-68412,34.76,10,347.60


In [28]:
 ledger_df['ProductID'].str.split('/')

0         [T&G, CAN-97509]
1         [T&G, LEG-37777]
2         [T&G, PET-14209]
3         [T&G, TRA-20170]
4         [T&G, TRA-20170]
               ...        
37703    [H&PC, NAT-15470]
37704     [T&G, FUN-03366]
37705      [MI, MON-86723]
37706     [T&G, MAG-68412]
37707     [K&D, 3 C-07383]
Name: ProductID, Length: 37708, dtype: object

In [29]:
# expand=True returns the split pieces as separate DataFrame columns instead of lists
ledger_df['ProductID'].str.split('/', expand=True)

Unnamed: 0,0,1
0,T&G,CAN-97509
1,T&G,LEG-37777
2,T&G,PET-14209
3,T&G,TRA-20170
4,T&G,TRA-20170
...,...,...
37703,H&PC,NAT-15470
37704,T&G,FUN-03366
37705,MI,MON-86723
37706,T&G,MAG-68412


In [30]:
# Split each ProductID at the first '/' into a category part and an item part and
# store them as two new columns. n=1 limits the split to a single cut, so the result
# never has more than two columns even if an item code itself contains a '/'
# (an unbounded split would then fail to fit into the two target columns).
ledger_df[['CategoryID', 'ItemID']] = ledger_df['ProductID'].str.split('/', n=1, expand=True)

ledger_df.head()

Unnamed: 0,InvoiceNo,Channel,Product Name,ProductID,Account,AccountNo,Date,Deadline,Currency,Unit Price,Quantity,Total,CategoryID,ItemID
0,1532,Shoppe.com,Cannon Water Bomb Balloons 100 Pack,T&G/CAN-97509,Sales,5004,2020-01-01,11/23/19,USD,20.11,14,281.54,T&G,CAN-97509
1,1533,Walcart,LEGO Ninja Turtles Stealth Shell in Pursuit 79102,T&G/LEG-37777,Sales,5004,2020-01-01,06/15/20,USD,6.7,1,6.7,T&G,LEG-37777
2,1534,Bullseye,,T&G/PET-14209,Sales,5004,2020-01-01,05/07/20,USD,11.67,5,58.35,T&G,PET-14209
3,1535,Bullseye,Transformers Age of Extinction Generations Del...,T&G/TRA-20170,Sales,5004,2020-01-01,12/22/19,USD,13.46,6,80.76,T&G,TRA-20170
4,1535,Bullseye,Transformers Age of Extinction Generations Del...,T&G/TRA-20170,Sales,5004,2020-01-01,12/22/19,USD,13.46,6,80.76,T&G,TRA-20170


### Concatenating text columns

In [32]:
# Adding a plain string to a Series prepends it to every value
'Category ID is: ' + ledger_df['CategoryID']

0         Category ID is: T&G
1         Category ID is: T&G
2         Category ID is: T&G
3         Category ID is: T&G
4         Category ID is: T&G
                 ...         
37703    Category ID is: H&PC
37704     Category ID is: T&G
37705      Category ID is: MI
37706     Category ID is: T&G
37707     Category ID is: K&D
Name: CategoryID, Length: 37708, dtype: object

In [33]:
# Join two text columns element-wise, with '/' in between
ledger_df['CategoryID'] + '/' + ledger_df['ItemID']

0         T&G/CAN-97509
1         T&G/LEG-37777
2         T&G/PET-14209
3         T&G/TRA-20170
4         T&G/TRA-20170
              ...      
37703    H&PC/NAT-15470
37704     T&G/FUN-03366
37705      MI/MON-86723
37706     T&G/MAG-68412
37707     K&D/3 C-07383
Length: 37708, dtype: object

In [34]:
 ledger_df['InvoiceNo'].astype('string') + '/' + ledger_df['AccountNo'].astype('string')

0         1532/5004
1         1533/5004
2         1534/5004
3         1535/5004
4         1535/5004
            ...    
37703    39235/5004
37704    39216/5004
37705    39219/5004
37706    39238/5004
37707    39239/5004
Length: 37708, dtype: string

### String data types in pandas

In [36]:
# Load the Excel version of the ledger and list each column's dtype.
# Without any conversion, the text columns are reported with the generic `object` dtype.
ledger_df2 = pd.read_excel('data/Q1Sales.xlsx')

ledger_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14054 entries, 0 to 14053
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   InvoiceNo     14054 non-null  int64         
 1   Channel       14054 non-null  object        
 2   Product Name  12362 non-null  object        
 3   ProductID     14054 non-null  object        
 4   Account       14054 non-null  object        
 5   AccountNo     14054 non-null  int64         
 6   Date          14054 non-null  datetime64[ns]
 7   Deadline      14054 non-null  object        
 8   Currency      14054 non-null  object        
 9   Unit Price    14054 non-null  float64       
 10  Quantity      14054 non-null  int64         
 11  Total         14054 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(3), object(6)
memory usage: 1.3+ MB


In [37]:
# Read the same file again and let convert_dtypes() choose pandas' extension dtypes;
# the text columns are now reported as `string`, the numeric ones as Int64 / Float64.
ledger_df2 = pd.read_excel('data/Q1Sales.xlsx').convert_dtypes()

ledger_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14054 entries, 0 to 14053
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   InvoiceNo     14054 non-null  Int64         
 1   Channel       14054 non-null  string        
 2   Product Name  12362 non-null  string        
 3   ProductID     14054 non-null  string        
 4   Account       14054 non-null  string        
 5   AccountNo     14054 non-null  Int64         
 6   Date          14054 non-null  datetime64[ns]
 7   Deadline      14054 non-null  string        
 8   Currency      14054 non-null  string        
 9   Unit Price    14054 non-null  Float64       
 10  Quantity      14054 non-null  Int64         
 11  Total         14054 non-null  Float64       
dtypes: Float64(2), Int64(3), datetime64[ns](1), string(6)
memory usage: 1.4 MB


In [38]:
# Convert a single column to the dedicated string dtype; missing values display as <NA>
ledger_df['Product Name'].astype('string')

0                      Cannon Water Bomb Balloons 100 Pack
1        LEGO Ninja Turtles Stealth Shell in Pursuit 79102
2                                                     <NA>
3        Transformers Age of Extinction Generations Del...
4        Transformers Age of Extinction Generations Del...
                               ...                        
37703    Nature's Bounty Garlic, 2000mg, Odor-Free, 120...
37704                        Funko Wonder Woman POP Heroes
37705    MONO GS1 GS1-BTY-BLK-L Betty Long Guitar Strap...
37706                                                 <NA>
37707             3 Collapsible Bowl Set 32oz | 16oz | 4oz
Name: Product Name, Length: 37708, dtype: string

In [39]:
# A Series mixing integers and strings gets the generic object dtype
pd.Series([1011, '$1320', "$980", 645, 'follow', 340])

0      1011
1     $1320
2      $980
3       645
4    follow
5       340
dtype: object

In [40]:
# On an object Series, .str methods return NaN for the non-string (integer) entries
pd.Series([1011, '$1320', "$980", 645, 'follow', 340]).str.strip('$')

0       NaN
1      1320
2       980
3       NaN
4    follow
5       NaN
dtype: object

In [41]:
# Casting to 'string' first turns every entry into text, so strip('$') applies to all of them
pd.Series([1011, '$1320', "$980", 645, 'follow', 340]).astype('string').str.strip('$')

0      1011
1      1320
2       980
3       645
4    follow
5       340
dtype: string

### Overthinking: Regular expressions

In [43]:
# Boolean mask: True where the product name mentions 'camera' in any letter case.
# Missing names are replaced with '' first, so they simply come out as False.
is_camera = (
    ledger_df['Product Name']
    .fillna('')
    .str.contains('camera', case=False)
)
is_camera

0        False
1        False
2        False
3        False
4        False
         ...  
37703    False
37704    False
37705    False
37706    False
37707    False
Name: Product Name, Length: 37708, dtype: bool

In [44]:
# Keep the rows flagged by is_camera and only the three columns of interest,
# selecting rows and columns in a single .loc step
cameras_df = ledger_df.loc[is_camera, ['ProductID', 'Product Name', 'Total']]
cameras_df

Unnamed: 0,ProductID,Product Name,Total
66,C&P/KID-94587,Kidz Digital Camera,275.64
117,C&P/KOD-01305,Kodak ZM1-NM 1 MP 1-Inch LCD CMOS Sensor Digit...,64.10
154,C&P/FOS-95687,Foscam FI8910W White Wireless IP Cameras 2-pack,669.20
265,C&P/Q-S-31839,Q-See QSC414D Outdoor Dome Color CCD Camera wi...,7.82
287,C&P/KOD-01305,Kodak ZM1-NM 1 MP 1-Inch LCD CMOS Sensor Digit...,64.10
...,...,...,...
37467,T&G/HUB-12150,Hubsan X4 H107C 2.4G 4CH RC Quadcopter With Ca...,5.46
37482,C&P/NIK-92147,Nikon D3100 14.2MP Digital SLR Camera with 18-...,16.38
37535,C&P/ISM-72190,iSmart New Wireless WiFi HD IR Pan Tilt IP Sma...,42.63
37652,C&P/KID-94587,Kidz Digital Camera,229.70


In [45]:
# Brand names joined with '|' (regex alternation). No parentheses are used, so the
# pattern contains no capture groups.
pattern = 'Nikon|Canon|Kodak'

# Narrow the camera rows down to those brands, ignoring letter case
cameras_df = cameras_df.loc[
    cameras_df['Product Name'].str.contains(pattern, case=False)
]

cameras_df

Unnamed: 0,ProductID,Product Name,Total
117,C&P/KOD-01305,Kodak ZM1-NM 1 MP 1-Inch LCD CMOS Sensor Digit...,64.10
287,C&P/KOD-01305,Kodak ZM1-NM 1 MP 1-Inch LCD CMOS Sensor Digit...,64.10
616,C&P/KOD-32137,Kodak EasyShare Z990 12 MP Digital Camera with...,166.30
2151,C&P/CAN-50721,Canon PowerShot SX50 HS 12MP Digital Camera wi...,6.45
2280,C&P/CAN-50721,Canon PowerShot SX50 HS 12MP Digital Camera wi...,6.45
...,...,...,...
36760,C&P/CAN-92695,Canon PowerShot A810 16.0 MP Digital Camera wi...,6.93
36858,C&P/CAN-10320,Canon PowerShot SD1400IS 14.1 MP Digital Camer...,37.62
37251,C&P/NIK-92147,Nikon D3100 14.2MP Digital SLR Camera with 18-...,43.28
37394,C&P/NIK-92147,Nikon D3100 14.2MP Digital SLR Camera with 18-...,16.38


In [46]:
# Another way:

cameras_df[(cameras_df['Product Name'].str.contains('Nikon'))
            | (cameras_df['Product Name'].str.contains('Canon'))
            | (cameras_df['Product Name'].str.contains('Kodak'))
            ]

Unnamed: 0,ProductID,Product Name,Total
117,C&P/KOD-01305,Kodak ZM1-NM 1 MP 1-Inch LCD CMOS Sensor Digit...,64.10
287,C&P/KOD-01305,Kodak ZM1-NM 1 MP 1-Inch LCD CMOS Sensor Digit...,64.10
616,C&P/KOD-32137,Kodak EasyShare Z990 12 MP Digital Camera with...,166.30
2151,C&P/CAN-50721,Canon PowerShot SX50 HS 12MP Digital Camera wi...,6.45
2280,C&P/CAN-50721,Canon PowerShot SX50 HS 12MP Digital Camera wi...,6.45
...,...,...,...
36760,C&P/CAN-92695,Canon PowerShot A810 16.0 MP Digital Camera wi...,6.93
36858,C&P/CAN-10320,Canon PowerShot SD1400IS 14.1 MP Digital Camer...,37.62
37251,C&P/NIK-92147,Nikon D3100 14.2MP Digital SLR Camera with 18-...,43.28
37394,C&P/NIK-92147,Nikon D3100 14.2MP Digital SLR Camera with 18-...,16.38


The `str.extract` method:

In [48]:
# Regular expression with two capture groups: the brand name (Nikon, Canon or
# Kodak) and, after one or more whitespace characters, the word that follows it.
# The raw string (r'...') passes \s and \w through to the regex engine unchanged
# instead of having Python treat them as (invalid) string escape sequences.
pattern = r'(Nikon|Canon|Kodak)\s+(\w*)'

# Each capture group becomes one column of the resulting DataFrame
cameras_df['Product Name'].str.extract(pattern)

Unnamed: 0,0,1
117,Kodak,ZM1
287,Kodak,ZM1
616,Kodak,EasyShare
2151,Canon,PowerShot
2280,Canon,PowerShot
...,...,...
36760,Canon,PowerShot
36858,Canon,PowerShot
37251,Nikon,D3100
37394,Nikon,D3100
