In [1]:
import pandas as pd

url = 'https://github.com/mattharrison/datasets/raw/master/data/''vehicles.csv.zip'
df = pd.read_csv(url, dtype_backend='pyarrow', engine='pyarrow')
pd.set_option('display.max_columns', None)

def gt20(val):
    return val > 20

city_mpg = df.city08
highway_mpg = df.highway08
make = df.make

In [2]:
make

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: string[pyarrow]

In [3]:
make.astype(str)

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object

In [4]:
import pyarrow as pa
string_pa = pd.ArrowDtype(pa.string())
make.astype(str).astype(string_pa)

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: string[pyarrow]

In [5]:
'Ford'.lower()
'Hello'.lower()

'hello'

In [6]:
make.str.lower()

0        alfa romeo
1           ferrari
2             dodge
3             dodge
4            subaru
            ...    
41139        subaru
41140        subaru
41141        subaru
41142        subaru
41143        subaru
Name: make, Length: 41144, dtype: string[pyarrow]

In [7]:
'Alpha Romeo'.find('A')

0

In [8]:
make.str.find('A')

0         0
1        -1
2        -1
3        -1
4        -1
         ..
41139    -1
41140    -1
41141    -1
41142    -1
41143    -1
Name: make, Length: 41144, dtype: int32[pyarrow]

In [9]:
print(make.str.extract(r'([^a-z A-Z])'))

ValueError: pat='([^a-z A-Z])' must contain a symbolic group name.

In [12]:
print(make.str.extract(r'(?P<non_alpha>[^a-z A-Z])'))

      non_alpha
0          <NA>
1          <NA>
2          <NA>
3          <NA>
4          <NA>
...         ...
41139      <NA>
41140      <NA>
41141      <NA>
41142      <NA>
41143      <NA>

[41144 rows x 1 columns]


In [14]:
(make
 .str.extract(r'(?P<non_alpha>[^a-z A-Z])',
              expand=False)
 .value_counts()
)

non_alpha
-    1727
.      46
,       9
Name: count, dtype: int64[pyarrow]

In [16]:
# If a column in a CSV file contains non-numeric characters, use the following code to find them:

col = make

(col
 .str.extract(r'(?P<non_num>[^0-9.])', expand=False)
 .value_counts()
)

non_num
C    5336
M    4833
F    3686
B    2796
G    2691
D    2679
P    2589
S    2234
T    2159
V    2001
H    1803
A    1610
N    1471
J    1435
L    1241
I     860
K     618
O     462
R     392
E     167
s      38
W      32
Y       8
Q       3
Name: count, dtype: int64[pyarrow]

In [18]:
import pyarrow as pa
string_pa = pd.ArrowDtype(pa.string())
age = pd.Series(['0-10', '11-15', '11-15', '61-65', '46-50'],
                 dtype=string_pa)
age

0     0-10
1    11-15
2    11-15
3    61-65
4    46-50
dtype: string[pyarrow]

In [20]:
age.str.split('-')

0     ['0' '10']
1    ['11' '15']
2    ['11' '15']
3    ['61' '65']
4    ['46' '50']
dtype: list<item: string>[pyarrow]

In [22]:
(age
 .str.split('-', expand=True)
 .iloc[:,0]
 .astype('int8[pyarrow]')
)

0     0
1    11
2    11
3    61
4    46
Name: 0, dtype: int8[pyarrow]

In [24]:
(age
 .str.slice(-2)
 .astype('int8[pyarrow]')
)

0    10
1    15
2    15
3    65
4    50
dtype: int8[pyarrow]

In [28]:
(age
 .str[-2:]
 .astype('int8[pyarrow]')
)

0    10
1    15
2    15
3    65
4    50
dtype: int8[pyarrow]

In [38]:
(age
 .str.split('-', expand=True)
 .astype('int8[pyarrow]')
 .mean(axis='columns')
)

0     5.0
1    13.0
2    13.0
3    63.0
4    48.0
dtype: double[pyarrow]

In [34]:
import random
def between(row):
    return random.randint(*row.values)

In [40]:
(age
 .str.split('-', expand=True)
 .astype(int)
 .apply(between, axis='columns')
)

0     3
1    11
2    12
3    63
4    48
dtype: int64

In [42]:
import numpy as np
print(age
      .str.split('-', expand=True)
      .rename(columns={0:'lower',1:'upper'})
      .astype('int8[pyarrow]')
      .assign(rand=np.random.rand(len(age)),
              age=lambda df_: (df_.lower + (df_.rand *
                                            (df_.upper - df_.lower)))
              .astype('int8[pyarrow]', errors='ignore')
             )
     )


   lower  upper      rand        age
0      0     10  0.934849   9.348491
1     11     15  0.710798   13.84319
2     11     15  0.312591  12.250366
3     61     65  0.849451  64.397805
4     46     50  0.228212  46.912848


In [48]:
import numpy as np # imports numpy
(age # uses the age bins shown here: age = pd.Series(['0-10', '11-15', '11-15', '61-65', '46-50'],
 .str.split('-', expand=True) # splits the bins by '-' into two columns
 .astype(int) # turns the output of the two columns into integers
 .pipe(lambda df_: pd.Series(np.random.randint(df_.iloc[:,0], # not yet sure what pipe does. # takes a random intiger and does something with iloc idk what the comma in iloc[:,0] does
                                               df_.iloc[:,1]), index = df_.index) # idk what [:,1] the comma does here, and does something with an index
      )
)

0     6
1    11
2    13
3    61
4    49
dtype: int64

In [90]:
age_100k = (age # takes age series
            .sample(100_000, replace=True, random_state=42) # creates a randomly sampled figures from age, 100,000 of them
            .reset_index(drop=True) # creates a natural index, as the original index values are kept from the source Series
           )
age_100k

0        61-65
1        46-50
2        11-15
3        46-50
4        46-50
         ...  
99995    11-15
99996     0-10
99997    46-50
99998    61-65
99999     0-10
Length: 100000, dtype: string[pyarrow]

In [64]:
%%timeit
(age_100k
 .str.split('-', expand=True)
 .astype('int8[pyarrow]')
 .pipe(lambda df_: pd.Series(np.random.randint(df_.iloc[:,0], 
                                               df_.iloc[:,1]), index =df_.index)
                            )
      )

11.9 ms ± 181 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [74]:
%%timeit
(age_100k
 .str.split('-', expand=True)
 .astype('int8[pyarrow]')
 .apply(between, axis='columns')
)

2.1 s ± 7.52 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [84]:
%%timeit
(age_100k
 .str.split('-', expand=True)
 .rename(columns={0:'lower', 1:'upper'})
 .astype('int8[pyarrow]')
 .assign(rand=np.random.rand(len(age_100k)),
         age=lambda df_: (df_.lower + (df_.rand * 
                                       (df_.upper - df_.lower)))
         .astype('int8[pyarrow]', errors='ignore')
        )
)

11.5 ms ± 110 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [94]:
(age_100k
 .str.split('-', expand=True) # splits into columns
 .rename(columns={0:'lower', 1:'upper'}) # renames the first column as lower, and second as upper
 .astype('int8[pyarrow]') # set type of intiger
 .assign(rand=np.random.rand(len(age_100k)), # creates rand column as decimals for 100 rows contained in age_100k 
         age=lambda df_: (df_.lower + (df_.rand *  ( # the lower age is added to the multiplication of the range differences with the random intiger found in in rand colum
                                       (df_.upper - df_.lower))) # finds the difference between lower and upper
         .astype('int8[pyarrow]', errors='ignore') # converts to int8 using PyArrow and ignores any values that can't be computed
        )
)
)

Unnamed: 0,lower,upper,rand,age
0,61,65,0.943068,64.772274
1,46,50,0.879028,49.516113
2,11,15,0.801416,14.205666
3,46,50,0.274917,47.099669
4,46,50,0.694535,48.778139
...,...,...,...,...
99995,11,15,0.777213,14.108854
99996,0,10,0.668945,6.689449
99997,46,50,0.018032,46.072127
99998,61,65,0.688045,63.75218


In [100]:
%load_ext Cython

In [102]:
%%cython
import random
def between_cy(row):
    return random.randint(*row.values)

Content of stderr:

In [104]:
%%cython
import random
import numpy as np
def between_cy3(x: np.int64, y: np.int64) -> np.int64:
    return random.randint(x,y)

Content of stderr:

In [139]:
(age
 .str.split('-', expand=True)
 .astype(int)
 .apply(lambda row: between_cy3(row[0], row[1]), axis=0)
)

0     3
1    13
dtype: int64