# Vectorized (String) Operations

## 0. Illustratief voorbeeld

<img src="vectorisation.png" alt="drawing" width="750"/>

In [2]:
import pandas as pd
import numpy as np
from time import time

from sqlalchemy.dialects.mssql.information_schema import columns

In [3]:
# Sample data: Product prices
prices = np.random.randint(100,1000,9**9)
len(prices)

387420489

In [5]:
# Creating a DataFrame
price_df = pd.DataFrame(prices, columns=['Original_Price'])
price_df.head()

Unnamed: 0,Original_Price
0,875
1,168
2,108
3,979
4,836


In [9]:
# Vectorization Method
start_time = time()
price_df['Discounted Price_Vector'] = price_df['Original_Price'] * .9
print(f'Vectorisation took {time() - start_time}')

Vectorisation took 1.6496260166168213


In [None]:
# Looping Method
new_prices = []
for price in price_df['Original_Price']:
    new_prices.append(price*0.9)

price_df['Discounted Price_Loop'] = new_prices


In [10]:
price_df['Discounted Price_Loop'] = [ price *9 for price in price_df['Original_Price']]

In [11]:
# Displaying the results
print(price_df)

           Original_Price  Discounted Price  Discounted Price_Vector  \
0                     875             787.5                    787.5   
1                     168             151.2                    151.2   
2                     108              97.2                     97.2   
3                     979             881.1                    881.1   
4                     836             752.4                    752.4   
...                   ...               ...                      ...   
387420484             272             244.8                    244.8   
387420485             254             228.6                    228.6   
387420486             536             482.4                    482.4   
387420487             755             679.5                    679.5   
387420488             140             126.0                    126.0   

           Discounted Price_Loop  
0                           7875  
1                           1512  
2                            9

## 1. Toy dataset

In [12]:
toy = pd.DataFrame({
    "name": ["  Alice", "BOB  ", 'Sam', "cHaRlIe", "  dora  ", "Émile", "FRANK", " gina", "Hank ", "  ian"],
    "city": ["Brussels", "antwerp", "gent", 'gent', "Bruges", "brussels", "GENT", "antwerp ", "  bruges", 'bruges'],
    "raw_code": ["A-001", "B_002", "C003", "A-004", 'A-004', "B_006", "C-007", "A_008", "B009", "C_010"],
    "text": [
        "Order #123 shipped",
        "order #124 delayed",
        "REF: 125 shipped",
        "REF: 126 shipped",
        "Order #127 SHIPPED",
        "order 128 cancelled",
        "Ref:129 shipped",
        "Order #130 returned",
        "order #131 shipped",
        "Ref: 132 delayed",
    ],
    "age": [23, 31, 40, 28, 35, 19, 52, 27, 41, 33],
})

### Quick frame

In [13]:
toy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      10 non-null     object
 1   city      10 non-null     object
 2   raw_code  10 non-null     object
 3   text      10 non-null     object
 4   age       10 non-null     int64 
dtypes: int64(1), object(4)
memory usage: 532.0+ bytes


In [14]:
toy.head()

Unnamed: 0,name,city,raw_code,text,age
0,Alice,Brussels,A-001,Order #123 shipped,23
1,BOB,antwerp,B_002,order #124 delayed,31
2,Sam,gent,C003,REF: 125 shipped,40
3,cHaRlIe,gent,A-004,REF: 126 shipped,28
4,dora,Bruges,A-004,Order #127 SHIPPED,35


## 2. Vectorised string operations (`.str`)

In [15]:
toy['name'].str.lower()

0       alice
1       bob  
2         sam
3     charlie
4      dora  
5       émile
6       frank
7        gina
8       hank 
9         ian
Name: name, dtype: object

In [16]:
toy['name'].str.strip()


0      Alice
1        BOB
2        Sam
3    cHaRlIe
4       dora
5      Émile
6      FRANK
7       gina
8       Hank
9        ian
Name: name, dtype: object

In [17]:
toy['text'].str.contains('shipped')

0     True
1    False
2     True
3     True
4    False
5    False
6     True
7    False
8     True
9    False
Name: text, dtype: bool

In [18]:
toy[toy['text'].str.contains('shipped')]

Unnamed: 0,name,city,raw_code,text,age
0,Alice,Brussels,A-001,Order #123 shipped,23
2,Sam,gent,C003,REF: 125 shipped,40
3,cHaRlIe,gent,A-004,REF: 126 shipped,28
6,FRANK,GENT,C-007,Ref:129 shipped,52
8,Hank,bruges,B009,order #131 shipped,41


In [19]:
toy[toy['age'] > 20]


Unnamed: 0,name,city,raw_code,text,age
0,Alice,Brussels,A-001,Order #123 shipped,23
1,BOB,antwerp,B_002,order #124 delayed,31
2,Sam,gent,C003,REF: 125 shipped,40
3,cHaRlIe,gent,A-004,REF: 126 shipped,28
4,dora,Bruges,A-004,Order #127 SHIPPED,35
6,FRANK,GENT,C-007,Ref:129 shipped,52
7,gina,antwerp,A_008,Order #130 returned,27
8,Hank,bruges,B009,order #131 shipped,41
9,ian,bruges,C_010,Ref: 132 delayed,33


In [20]:
toy['raw_code'].str.replace('_', '-')

0    A-001
1    B-002
2     C003
3    A-004
4    A-004
5    B-006
6    C-007
7    A-008
8     B009
9    C-010
Name: raw_code, dtype: object

In [21]:
toy['raw_code'].str.split('-')

0    [A, 001]
1     [B_002]
2      [C003]
3    [A, 004]
4    [A, 004]
5     [B_006]
6    [C, 007]
7     [A_008]
8      [B009]
9     [C_010]
Name: raw_code, dtype: object

In [23]:
toy['raw_code'].str.replace('_', '-').str.split('-')

0    [A, 001]
1    [B, 002]
2      [C003]
3    [A, 004]
4    [A, 004]
5    [B, 006]
6    [C, 007]
7    [A, 008]
8      [B009]
9    [C, 010]
Name: raw_code, dtype: object

## 3. `Apply`

In [24]:
toy.head()

Unnamed: 0,name,city,raw_code,text,age
0,Alice,Brussels,A-001,Order #123 shipped,23
1,BOB,antwerp,B_002,order #124 delayed,31
2,Sam,gent,C003,REF: 125 shipped,40
3,cHaRlIe,gent,A-004,REF: 126 shipped,28
4,dora,Bruges,A-004,Order #127 SHIPPED,35


In [25]:
def normalize_code(code):
    if pd.isna(code):
        return code

    code = code.upper()

    if '-' in code:
        left, right = code.split('-')
    elif '_' in code:
        left, right = code.split('_')
    else:
        left, right = code[0], code[1:]

    return f'{left}-{right.zfill(3)}'

toy['raw_code'].apply(normalize_code)

0    A-001
1    B-002
2    C-003
3    A-004
4    A-004
5    B-006
6    C-007
7    A-008
8    B-009
9    C-010
Name: raw_code, dtype: object

In [28]:
def classify_person(row):
    if row['age'] > 30 and 'delayed' in row['text'].lower():
        return 'Will complain'
    elif row['city'].lower() == 'gent':
        return 'Will never complain'
    else:
        return 'Might complain'

toy['complain factor'] = toy.apply(classify_person , axis=1)
toy.head()

Unnamed: 0,name,city,raw_code,text,age,complain factor
0,Alice,Brussels,A-001,Order #123 shipped,23,Might complain
1,BOB,antwerp,B_002,order #124 delayed,31,Will complain
2,Sam,gent,C003,REF: 125 shipped,40,Will never complain
3,cHaRlIe,gent,A-004,REF: 126 shipped,28,Will never complain
4,dora,Bruges,A-004,Order #127 SHIPPED,35,Might complain


## 4. `transform`

In [40]:
toy_grouped = toy.groupby('city', as_index=False)['age'].mean()
toy_grouped.rename(columns={'age':'mean_age_groupby'}, inplace=True)
full_toy = toy.merge(toy_grouped, on='city')
full_toy

Unnamed: 0,name,city,raw_code,text,age,complain factor,mean_age_groupby
0,Alice,Brussels,A-001,Order #123 shipped,23,Might complain,23.0
1,BOB,antwerp,B_002,order #124 delayed,31,Will complain,31.0
2,Sam,gent,C003,REF: 125 shipped,40,Will never complain,34.0
3,cHaRlIe,gent,A-004,REF: 126 shipped,28,Will never complain,34.0
4,dora,Bruges,A-004,Order #127 SHIPPED,35,Might complain,35.0
5,Émile,brussels,B_006,order 128 cancelled,19,Might complain,19.0
6,FRANK,GENT,C-007,Ref:129 shipped,52,Will never complain,52.0
7,gina,antwerp,A_008,Order #130 returned,27,Might complain,27.0
8,Hank,bruges,B009,order #131 shipped,41,Might complain,41.0
9,ian,bruges,C_010,Ref: 132 delayed,33,Will complain,33.0


In [43]:
toy['mean_age_transform'] = toy.groupby('city')['age'].transform('mean')
toy

Unnamed: 0,name,city,raw_code,text,age,complain factor,mean_age_transform
0,Alice,Brussels,A-001,Order #123 shipped,23,Might complain,23.0
1,BOB,antwerp,B_002,order #124 delayed,31,Will complain,31.0
2,Sam,gent,C003,REF: 125 shipped,40,Will never complain,34.0
3,cHaRlIe,gent,A-004,REF: 126 shipped,28,Will never complain,34.0
4,dora,Bruges,A-004,Order #127 SHIPPED,35,Might complain,35.0
5,Émile,brussels,B_006,order 128 cancelled,19,Might complain,19.0
6,FRANK,GENT,C-007,Ref:129 shipped,52,Will never complain,52.0
7,gina,antwerp,A_008,Order #130 returned,27,Might complain,27.0
8,Hank,bruges,B009,order #131 shipped,41,Might complain,41.0
9,ian,bruges,C_010,Ref: 132 delayed,33,Will complain,33.0
