## Algorithms and How to Apply Them

In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa

### Basic pd.Series arithmetic

In [2]:
ser = pd.Series(range(3), dtype=pd.Int64Dtype())
ser

0    0
1    1
2    2
dtype: Int64

In [4]:
# Combine the Series with the scalar 42 using each basic operator and print
# the results in order: addition, subtraction, multiplication, true division.
for result in (ser + 42, ser - 42, ser * 42, ser / 42):
    print(result)

0    42
1    43
2    44
dtype: Int64
0    -42
1    -41
2    -40
dtype: Int64
0     0
1    42
2    84
dtype: Int64
0         0.0
1     0.02381
2    0.047619
dtype: Float64


In [5]:
ser2 = pd.Series(range(10, 13), dtype=pd.Int64Dtype())
ser + ser2

0    10
1    12
2    14
dtype: Int64

In [6]:
ser1 = pd.Series([1., 2., 3.], dtype=pd.Float64Dtype())
ser2 = pd.Series([4., pd.NA, 6.], dtype=pd.Float64Dtype())
ser1.add(ser2)

0     5.0
1    <NA>
2     9.0
dtype: Float64

In [7]:
ser1.add(ser2, fill_value=0.)

0    5.0
1    2.0
2    9.0
dtype: Float64

### Basic pd.DataFrame arithmetic

In [8]:
# Seed NumPy's global generator so the random draw below is repeatable.
np.random.seed(42)
df = pd.DataFrame(
    data=np.random.randn(3, 3),
    index=["row1", "row2", "row3"],
    columns=["col1", "col2", "col3"],
)
# Convert the columns to pandas' nullable extension dtypes.
df = df.convert_dtypes(dtype_backend="numpy_nullable")
df

Unnamed: 0,col1,col2,col3
row1,0.496714,-0.138264,0.647689
row2,1.52303,-0.234153,-0.234137
row3,1.579213,0.767435,-0.469474


In [10]:
# Scalar arithmetic is applied to every cell of the frame; print the result
# of adding 1 first, then the result of multiplying by 2.
for result in (df + 1, df * 2):
    print(result)

          col1      col2      col3
row1  1.496714  0.861736  1.647689
row2   2.52303  0.765847  0.765863
row3  2.579213  1.767435  0.530526
          col1      col2      col3
row1  0.993428 -0.276529  1.295377
row2   3.04606 -0.468307 -0.468274
row3  3.158426  1.534869 -0.938949


In [11]:
# The dict keys become the index labels; they match the column names of `df`.
ser = pd.Series({"col1": 20, "col2": 10, "col3": 0}, dtype=pd.Int64Dtype())
ser

col1    20
col2    10
col3     0
dtype: Int64

In [12]:
df + ser

Unnamed: 0,col1,col2,col3
row1,20.496714,9.861736,0.647689
row2,21.52303,9.765847,-0.234137
row3,21.579213,10.767435,-0.469474


### Aggregations

In [13]:
# Reseed NumPy's global generator so the 10,000 random draws are repeatable.
np.random.seed(42)
ser = pd.Series(
    data=np.random.rand(10_000),
    dtype=pd.Float64Dtype(),
)

In [14]:
# Build one report line per aggregation method, then print them in order.
summary_lines = (
    f"Count is: {ser.count()}",
    f"Mean value is: {ser.mean()}",
    f"Standard deviation is: {ser.std()}",
    f"Minimum value is: {ser.min()}",
    f"Maximum value is: {ser.max()}",
    f"Summation is: {ser.sum()}",
)
for line in summary_lines:
    print(line)

Count is: 10000
Mean value is: 0.49415955768429964
Standard deviation is: 0.28763012652699277
Minimum value is: 1.1634755366141114e-05
Maximum value is: 0.9997176732861306
Summation is: 4941.595576842997


In [15]:
# Same report as the dedicated methods, but each statistic is requested by
# name through Series.agg.
agg_lines = (
    f"Count is: {ser.agg('count')}",
    f"Mean value is: {ser.agg('mean')}",
    f"Standard deviation is: {ser.agg('std')}",
    f"Minimum value is: {ser.agg('min')}",
    f"Maximum value is: {ser.agg('max')}",
    f"Summation is: {ser.agg('sum')}",
)
for line in agg_lines:
    print(line)

Count is: 10000
Mean value is: 0.49415955768429964
Standard deviation is: 0.28763012652699277
Minimum value is: 1.1634755366141114e-05
Maximum value is: 0.9997176732861306
Summation is: 4941.595576842997


In [16]:
ser.agg(["min", "max"])

min    0.000012
max    0.999718
dtype: float64

In [17]:
# Reseed so the 10,000 x 6 random frame is repeatable.
np.random.seed(42)
df = pd.DataFrame(data=np.random.randn(10_000, 6), columns=list("abcdef"))
# Convert the columns to pandas' nullable extension dtypes.
df = df.convert_dtypes(dtype_backend="numpy_nullable")
df

Unnamed: 0,a,b,c,d,e,f
0,0.496714,-0.138264,0.647689,1.52303,-0.234153,-0.234137
1,1.579213,0.767435,-0.469474,0.54256,-0.463418,-0.46573
2,0.241962,-1.91328,-1.724918,-0.562288,-1.012831,0.314247
3,-0.908024,-1.412304,1.465649,-0.225776,0.067528,-1.424748
4,-0.544383,0.110923,-1.150994,0.375698,-0.600639,-0.291694
...,...,...,...,...,...,...
9995,1.951254,0.324704,1.937021,-0.125083,0.589664,0.869128
9996,0.624062,-0.31734,-1.636983,2.390878,-0.597118,2.670553
9997,-0.470192,1.511932,0.718306,0.764051,-0.495094,-0.273401
9998,-0.259206,0.274769,-0.084735,-0.406717,-0.815527,-0.716988


In [18]:
df.sum()

a    -21.365908
b     -7.963987
c    152.032992
d   -180.727498
e     29.399311
f     25.042078
dtype: Float64

In [19]:
df.sum(axis=1)

0       2.060878
1       1.490586
2      -4.657107
3      -2.437675
4      -2.101088
          ...   
9995     5.54669
9996    3.134053
9997    1.755601
9998   -2.008404
9999   -3.314518
Length: 10000, dtype: Float64

In [20]:
df.agg(["min", "max"])

Unnamed: 0,a,b,c,d,e,f
min,-4.295391,-3.436062,-3.9224,-4.465604,-3.836656,-4.157734
max,3.602415,3.745379,3.727833,4.479084,3.691625,3.942331


### Transformations

In [22]:
ser = pd.Series([-1, 0, 1], dtype=pd.Int64Dtype())
def adds_one(ser: pd.Series) -> pd.Series:
    """Return ``ser`` incremented by one.

    Args:
        ser: The object to increment. Inside this function the parameter
            shadows the notebook-level ``ser`` variable.

    Returns:
        The result of ``ser + 1``.
    """
    return ser + 1

ser.transform(["abs", adds_one])

Unnamed: 0,abs,adds_one
0,1,0
1,0,1
2,1,2


In [23]:
# Nine consecutive integers, -5 through 3, laid out as a 3 x 3 grid.
df = pd.DataFrame(np.arange(-5, 4).reshape(3, 3))
df = df.convert_dtypes(dtype_backend="numpy_nullable")
df

Unnamed: 0,0,1,2
0,-5,-4,-3
1,-2,-1,0
2,1,2,3


In [24]:
df.transform("abs")

Unnamed: 0,0,1,2
0,5,4,3
1,2,1,0
2,1,2,3


In [25]:
def add_42(ser: pd.Series) -> pd.Series:
    """Return ``ser`` with 42 added.

    Args:
        ser: The object to shift by 42.

    Returns:
        The result of ``ser + 42``.
    """
    return ser + 42

df.transform(["abs", add_42])

Unnamed: 0_level_0,0,0,1,1,2,2
Unnamed: 0_level_1,abs,add_42,abs,add_42,abs,add_42
0,5,37,4,38,3,39
1,2,40,1,41,0,42
2,1,43,2,44,3,45


### Map

In [26]:
ser = pd.Series([123.45, [100, 113], 142.0, [110, 113, 119]])
ser

0             123.45
1         [100, 113]
2              142.0
3    [110, 113, 119]
dtype: object

In [27]:
def custom_average(value):
    """Return the arithmetic mean of a list, or the value itself otherwise.

    Args:
        value: Either a ``list`` of numbers or any other object. Only exact
            ``list`` instances (and subclasses) are averaged; every other
            input is passed through untouched.

    Returns:
        ``sum(value) / len(value)`` for a non-empty list, ``nan`` for an
        empty list, and ``value`` unchanged for anything that is not a list.
    """
    if not isinstance(value, list):
        return value

    if not value:
        # An empty list has no mean. Returning NaN instead of letting
        # ``len(value) == 0`` raise ZeroDivisionError keeps one empty cell
        # from aborting an entire ``.map`` / ``.transform`` call.
        return float("nan")

    return sum(value) / len(value)

In [28]:
ser.map(custom_average)

0    123.45
1    106.50
2    142.00
3    114.00
dtype: float64

In [29]:
df = pd.DataFrame([
    [2., [1, 2], 3.],
    [[4, 5], 5, 7.],
    [1, 4, [1, 1, 5.5]],
])
df

Unnamed: 0,0,1,2
0,2.0,"[1, 2]",3.0
1,"[4, 5]",5,7.0
2,1,4,"[1, 1, 5.5]"


In [30]:
df.map(custom_average)

Unnamed: 0,0,1,2
0,2.0,1.5,3.0
1,4.5,5.0,7.0
2,1.0,4.0,2.5


In [31]:
ser.transform(custom_average)

0    123.45
1    106.50
2    142.00
3    114.00
dtype: float64

In [32]:
df.transform(custom_average)

Unnamed: 0,0,1,2
0,2.0,"[1, 2]",3.0
1,"[4, 5]",5,7.0
2,1,4,"[1, 1, 5.5]"


### Apply

In [33]:
def debug_apply(value):
    """Print ``value`` under a fixed banner line.

    Nothing is returned explicitly, so the call evaluates to ``None``.
    """
    message = f"Apply was called with value:\n{value}"
    print(message)

In [34]:
ser = pd.Series(range(3), dtype=pd.Int64Dtype())
ser.apply(debug_apply)

Apply was called with value:
0
Apply was called with value:
1
Apply was called with value:
2


0    None
1    None
2    None
dtype: object

In [35]:
ser.map(debug_apply)

Apply was called with value:
0
Apply was called with value:
1
Apply was called with value:
2


0    None
1    None
2    None
dtype: object

In [36]:
# The integers 0 through 5 arranged as three rows of two columns.
df = pd.DataFrame(data=np.arange(6).reshape(3, 2), columns=list("ab"))
df = df.convert_dtypes(dtype_backend="numpy_nullable")
df

Unnamed: 0,a,b
0,0,1
1,2,3
2,4,5


In [37]:
df.apply(debug_apply)

Apply was called with value:
0    0
1    2
2    4
Name: a, dtype: Int64
Apply was called with value:
0    1
1    3
2    5
Name: b, dtype: Int64


a    None
b    None
dtype: object

In [38]:
def debug_apply_and_return(value):
    """Print ``value`` and return the same object unchanged."""
    print(value)
    return value

In [39]:
df.apply(debug_apply_and_return)

0    0
1    2
2    4
Name: a, dtype: Int64
0    1
1    3
2    5
Name: b, dtype: Int64


Unnamed: 0,a,b
0,0,1
1,2,3
2,4,5


### Summary statistics

In [40]:
ser = pd.Series(["a", "b", "c", "a", "c", "a"], dtype=pd.StringDtype())
ser.value_counts()

a    3
c    2
b    1
Name: count, dtype: Int64

In [41]:
ser = pd.Series([0, 42, 84], dtype=pd.Int64Dtype())
ser.describe()

count     3.0
mean     42.0
std      42.0
min       0.0
25%      21.0
50%      42.0
75%      63.0
max      84.0
dtype: Float64

In [42]:
ser.describe(percentiles=[.10, .44, .67])

count      3.0
mean      42.0
std       42.0
min        0.0
10%        8.4
44%      36.96
50%       42.0
67%      56.28
max       84.0
dtype: Float64

### Binning algorithms

In [43]:
# One (name, age) record per person, converted to nullable dtypes.
df = pd.DataFrame(
    [
        ("Jane", 34),
        ("John", 18),
        ("Jamie", 22),
        ("Jessica", 36),
        ("Jackie", 33),
        ("Steve", 40),
        ("Sam", 30),
        ("Stephanie", 66),
        ("Sarah", 55),
        ("Aaron", 22),
        ("Erin", 28),
        ("Elsa", 37),
    ],
    columns=["name", "age"],
).convert_dtypes(dtype_backend="numpy_nullable")

df.head()

Unnamed: 0,name,age
0,Jane,34
1,John,18
2,Jamie,22
3,Jessica,36
4,Jackie,33


In [44]:
pd.cut(df["age"], 4)

0       (30.0, 42.0]
1     (17.952, 30.0]
2     (17.952, 30.0]
3       (30.0, 42.0]
4       (30.0, 42.0]
5       (30.0, 42.0]
6     (17.952, 30.0]
7       (54.0, 66.0]
8       (54.0, 66.0]
9     (17.952, 30.0]
10    (17.952, 30.0]
11      (30.0, 42.0]
Name: age, dtype: category
Categories (4, interval[float64, right]): [(17.952, 30.0] < (30.0, 42.0] < (42.0, 54.0] < (54.0, 66.0]]

In [45]:
pd.cut(df["age"], 4, precision=0)

0     (30.0, 42.0]
1     (18.0, 30.0]
2     (18.0, 30.0]
3     (30.0, 42.0]
4     (30.0, 42.0]
5     (30.0, 42.0]
6     (18.0, 30.0]
7     (54.0, 66.0]
8     (54.0, 66.0]
9     (18.0, 30.0]
10    (18.0, 30.0]
11    (30.0, 42.0]
Name: age, dtype: category
Categories (4, interval[float64, right]): [(18.0, 30.0] < (30.0, 42.0] < (42.0, 54.0] < (54.0, 66.0]]

In [46]:
pd.cut(df["age"], [10, 20, 30, 40, 50, 60, 70])

0     (30, 40]
1     (10, 20]
2     (20, 30]
3     (30, 40]
4     (30, 40]
5     (30, 40]
6     (20, 30]
7     (60, 70]
8     (50, 60]
9     (20, 30]
10    (20, 30]
11    (30, 40]
Name: age, dtype: category
Categories (6, interval[int64, right]): [(10, 20] < (20, 30] < (30, 40] < (40, 50] < (50, 60] < (60, 70]]

In [47]:
pd.cut(df["age"], [10, 20, 30, 40, 50, 60, 999])

0      (30, 40]
1      (10, 20]
2      (20, 30]
3      (30, 40]
4      (30, 40]
5      (30, 40]
6      (20, 30]
7     (60, 999]
8      (50, 60]
9      (20, 30]
10     (20, 30]
11     (30, 40]
Name: age, dtype: category
Categories (6, interval[int64, right]): [(10, 20] < (20, 30] < (30, 40] < (40, 50] < (50, 60] < (60, 999]]

In [48]:
pd.cut(
    df["age"],
    [10, 20, 30, 40, 50, 60, 999],
    labels=["10-20", "20-30", "30-40", "40-50", "50-60", "60+"],
)

0     30-40
1     10-20
2     20-30
3     30-40
4     30-40
5     30-40
6     20-30
7       60+
8     50-60
9     20-30
10    20-30
11    30-40
Name: age, dtype: category
Categories (6, object): ['10-20' < '20-30' < '30-40' < '40-50' < '50-60' < '60+']

In [49]:
df.assign(age_bin=lambda x: pd.cut(x["age"], [10, 20, 30, 40, 50, 60, 999]))

Unnamed: 0,name,age,age_bin
0,Jane,34,"(30, 40]"
1,John,18,"(10, 20]"
2,Jamie,22,"(20, 30]"
3,Jessica,36,"(30, 40]"
4,Jackie,33,"(30, 40]"
5,Steve,40,"(30, 40]"
6,Sam,30,"(20, 30]"
7,Stephanie,66,"(60, 999]"
8,Sarah,55,"(50, 60]"
9,Aaron,22,"(20, 30]"


In [50]:
df.assign(
    age_bin=lambda x: pd.cut(x["age"], [10, 20, 30, 40, 50, 60, 999], right=False)
)

Unnamed: 0,name,age,age_bin
0,Jane,34,"[30, 40)"
1,John,18,"[10, 20)"
2,Jamie,22,"[20, 30)"
3,Jessica,36,"[30, 40)"
4,Jackie,33,"[30, 40)"
5,Steve,40,"[40, 50)"
6,Sam,30,"[30, 40)"
7,Stephanie,66,"[60, 999)"
8,Sarah,55,"[50, 60)"
9,Aaron,22,"[20, 30)"


### One-hot encoding with pd.get_dummies

In [51]:
# Nine observations over five distinct colours; repeats are intentional.
ser = pd.Series(
    data=[
        "green", "brown", "blue",
        "amber", "hazel", "amber",
        "green", "blue", "green",
    ],
    dtype=pd.StringDtype(),
    name="eye_colors",
)
ser

0    green
1    brown
2     blue
3    amber
4    hazel
5    amber
6    green
7     blue
8    green
Name: eye_colors, dtype: string

In [52]:
pd.get_dummies(ser)

Unnamed: 0,amber,blue,brown,green,hazel
0,False,False,False,True,False
1,False,False,True,False,False
2,False,True,False,False,False
3,True,False,False,False,False
4,False,False,False,False,True
5,True,False,False,False,False
6,False,False,False,True,False
7,False,True,False,False,False
8,False,False,False,True,False


In [53]:
pd.get_dummies(ser, prefix="is")

Unnamed: 0,is_amber,is_blue,is_brown,is_green,is_hazel
0,False,False,False,True,False
1,False,False,True,False,False
2,False,True,False,False,False
3,True,False,False,False,False
4,False,False,False,False,True
5,True,False,False,False,False
6,False,False,False,True,False
7,False,True,False,False,False
8,False,False,False,True,False


### Chaining with .pipe

In [54]:
# Build the frame from plain lists, then cast each column to its nullable
# extension dtype in one step.
df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]}).astype(
    {"col1": pd.Int64Dtype(), "col2": pd.StringDtype()}
)
df

Unnamed: 0,col1,col2
0,1,a
1,2,b
2,3,c


In [56]:
def change_col1(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame whose ``col1`` holds the Int64 values 4, 5, 6.

    The result comes from ``DataFrame.assign``, so the frame passed in is
    left as it was.
    """
    replacement = pd.Series([4, 5, 6], dtype=pd.Int64Dtype())
    return df.assign(col1=replacement)

def change_col2(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame whose ``col2`` holds the strings X, Y, Z.

    The result comes from ``DataFrame.assign``, so the frame passed in is
    left as it was.
    """
    replacement = pd.Series(["X", "Y", "Z"], dtype=pd.StringDtype())
    return df.assign(col2=replacement)

In [57]:
df2 = change_col1(df)
df3 = change_col2(df2)
df3

Unnamed: 0,col1,col2
0,4,X
1,5,Y
2,6,Z


In [58]:
change_col2(change_col1(df))

Unnamed: 0,col1,col2
0,4,X
1,5,Y
2,6,Z


In [59]:
df.pipe(change_col1).pipe(change_col2)

Unnamed: 0,col1,col2
0,4,X
1,5,Y
2,6,Z


In [60]:
from typing import Literal

def change_col2(
        df: pd.DataFrame,
        str_case: Literal["upper", "lower"] = "upper",
) -> pd.DataFrame:
    """Return a new frame whose ``col2`` is replaced by X/Y/Z or x/y/z.

    This definition replaces the one-argument ``change_col2`` from the
    earlier cell. ``str_case`` defaults to ``"upper"`` so that the earlier
    single-argument calls (``change_col2(df2)``, ``.pipe(change_col2)``)
    keep producing the same upper-case result if they are re-run after this
    cell.

    Args:
        df: Frame to copy; it is not modified (``DataFrame.assign`` returns
            a new frame).
        str_case: ``"upper"`` for ``X, Y, Z`` or ``"lower"`` for ``x, y, z``.

    Returns:
        A new frame with ``col2`` replaced by a three-element string Series.

    Raises:
        ValueError: If ``str_case`` is neither ``"upper"`` nor ``"lower"``.
    """
    if str_case == "upper":
        values = ["X", "Y", "Z"]
    elif str_case == "lower":
        values = ["x", "y", "z"]
    else:
        # Reject typos such as "Upper" rather than silently lower-casing.
        raise ValueError(
            f"str_case must be 'upper' or 'lower', got {str_case!r}"
        )

    return df.assign(col2=pd.Series(values, dtype=pd.StringDtype()))

In [61]:
df.pipe(change_col2, str_case="lower")

Unnamed: 0,col1,col2
0,1,x
1,2,y
2,3,z


### Selecting the lowest budget movies from the top 100

In [63]:
# `usecols` limits which columns are loaded. The resulting frame keeps the
# file's own column order (see the output), not the order listed here.
movie_columns = ["movie_title", "imdb_score", "budget", "gross"]
df = pd.read_csv(
    "../data/movie.csv",
    dtype_backend="numpy_nullable",
    usecols=movie_columns,
)
df.head()

Unnamed: 0,gross,movie_title,budget,imdb_score
0,760505847.0,Avatar,237000000.0,7.9
1,309404152.0,Pirates of the Caribbean: At World's End,300000000.0,7.1
2,200074175.0,Spectre,245000000.0,6.8
3,448130642.0,The Dark Knight Rises,250000000.0,8.5
4,,Star Wars: Episode VII - The Force Awakens,,7.1


In [64]:
df.nlargest(100, "imdb_score").head()

Unnamed: 0,gross,movie_title,budget,imdb_score
2725,,Towering Inferno,,9.5
1920,28341469.0,The Shawshank Redemption,25000000.0,9.3
3402,134821952.0,The Godfather,6000000.0,9.2
2779,447093.0,Dekalog,,9.1
4312,,Kickboxer: Vengeance,17000000.0,9.1


In [65]:
df.nlargest(100, "imdb_score").nsmallest(5, "budget")

Unnamed: 0,gross,movie_title,budget,imdb_score
4804,,Butterfly Girl,180000.0,8.7
4801,925402.0,Children of Heaven,180000.0,8.5
4706,,12 Angry Men,350000.0,8.9
4550,7098492.0,A Separation,500000.0,8.4
4636,133778.0,The Other Dream Team,500000.0,8.4


In [66]:
df.nlargest(10, "imdb_score")

Unnamed: 0,gross,movie_title,budget,imdb_score
2725,,Towering Inferno,,9.5
1920,28341469.0,The Shawshank Redemption,25000000.0,9.3
3402,134821952.0,The Godfather,6000000.0,9.2
2779,447093.0,Dekalog,,9.1
4312,,Kickboxer: Vengeance,17000000.0,9.1
66,533316061.0,The Dark Knight,185000000.0,9.0
2791,57300000.0,The Godfather: Part II,13000000.0,9.0
3415,,Fargo,,9.0
335,377019252.0,The Lord of the Rings: The Return of the King,94000000.0,8.9
1857,96067179.0,Schindler's List,22000000.0,8.9


In [67]:
df[df["imdb_score"] >= 8.9]

Unnamed: 0,gross,movie_title,budget,imdb_score
66,533316061.0,The Dark Knight,185000000.0,9.0
335,377019252.0,The Lord of the Rings: The Return of the King,94000000.0,8.9
1857,96067179.0,Schindler's List,22000000.0,8.9
1920,28341469.0,The Shawshank Redemption,25000000.0,9.3
2725,,Towering Inferno,,9.5
2779,447093.0,Dekalog,,9.1
2791,57300000.0,The Godfather: Part II,13000000.0,9.0
3295,107930000.0,Pulp Fiction,8000000.0,8.9
3402,134821952.0,The Godfather,6000000.0,9.2
3415,,Fargo,,9.0


In [68]:
df.nlargest(10, ["imdb_score", "gross"])

Unnamed: 0,gross,movie_title,budget,imdb_score
2725,,Towering Inferno,,9.5
1920,28341469.0,The Shawshank Redemption,25000000.0,9.3
3402,134821952.0,The Godfather,6000000.0,9.2
2779,447093.0,Dekalog,,9.1
4312,,Kickboxer: Vengeance,17000000.0,9.1
66,533316061.0,The Dark Knight,185000000.0,9.0
2791,57300000.0,The Godfather: Part II,13000000.0,9.0
3415,,Fargo,,9.0
335,377019252.0,The Lord of the Rings: The Return of the King,94000000.0,8.9
3295,107930000.0,Pulp Fiction,8000000.0,8.9
