## Import and read dataset

In [1]:
%pip install pandas
import pandas as pd

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
def get_data():
    """Read the universities CSV into a DataFrame.

    The file is read from the relative path ``input/japanese_universities.csv``
    and its first column is used as the row index.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    csv_path = "input/japanese_universities.csv"
    universities = pd.read_csv(csv_path, index_col=0)
    return universities

## Hypothetical Problem

#### Classifying Universities:

| **Category**                    | **Remote**                | **Grad**                    | **Rating**              |
|----------------------------------|---------------------------|-----------------------------|-------------------------|
| **Highly Suitable for Remote Learning** | `True`                    | `True`                      | `≥ 3.5`                 |
| **Traditional University Focus** | `False`                   | `True`                      | `≥ 3.5`                 |
| **Other Universities**           | `Any`                     | `Any`                       | `< 3.5`, or any rating when **Grad** is `False` |


In [3]:
def classify_university(row, min_rating=3.5):
    """Assign a category label to one university record.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) providing the keys
            'has_remote', 'has_grad' and 'review_rating'.
        min_rating: Lowest 'review_rating' that still qualifies for one of the
            two named categories. Defaults to 3.5.

    Returns:
        "Highly Suitable for Remote Learning" when the university has remote
        learning, a graduate program, and a rating of at least ``min_rating``;
        "Traditional University Focus" when it has a graduate program and a
        qualifying rating but no remote learning;
        "Other Universities" in every other case, including when any of the
        three fields is missing.
    """
    remote = row['has_remote']
    grad = row['has_grad']
    rating = row['review_rating']

    # A missing value (NaN/None/pd.NA) is not evidence that a feature exists.
    # NaN is truthy in Python, so without this guard a row with an unknown
    # 'has_remote' would be labelled as remote-friendly, and pd.NA would raise
    # when evaluated as a boolean.
    if pd.isna(remote) or pd.isna(grad) or pd.isna(rating):
        return "Other Universities"

    # Both named categories require a graduate program and a qualifying rating.
    if not grad or rating < min_rating:
        return "Other Universities"

    if remote:
        return "Highly Suitable for Remote Learning"
    return "Traditional University Focus"


### Looping with `iterrows` (slowest approach; avoid it):

In [4]:
# Load the dataset into `df`; the timed cell that follows writes a
# 'university_category' column onto this frame.
df = get_data()

In [5]:
%%timeit

# Baseline: walk the frame one row at a time, classify each row in Python,
# and collect the labels in a plain list before attaching them as a column.
labels = []
for _index, record in df.iterrows():
    labels.append(classify_university(record))

df['university_category'] = labels

51.8 ms ± 8.48 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Apply:

In [6]:
# Re-read the CSV so this benchmark starts from a frame that does not yet
# carry the 'university_category' column written by the previous timed cell.
df = get_data()

In [7]:
%%timeit
# Row-wise apply: pandas hands each row (axis=1) to classify_university and
# gathers the returned labels, which are then stored as the new column.
row_labels = df.apply(classify_university, axis=1)
df['university_category'] = row_labels

9.6 ms ± 781 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Vectorize:

In [8]:
# Re-read the CSV so this benchmark starts from a frame that does not yet
# carry the 'university_category' column written by the previous timed cell.
df = get_data()

In [9]:
%%timeit

# Every row starts in the fallback category; qualifying rows are overwritten below.
df['university_category'] = "Other Universities"

# Column-wide boolean masks. The explicit `== True` / `== False` comparisons
# make a missing flag match neither side.
grad_and_rated = (df['has_grad'] == True) & (df['review_rating'] >= 3.5)
offers_remote = df['has_remote'] == True
lacks_remote = df['has_remote'] == False

df.loc[offers_remote & grad_and_rated, 'university_category'] = "Highly Suitable for Remote Learning"
df.loc[lacks_remote & grad_and_rated, 'university_category'] = "Traditional University Focus"

1.58 ms ± 101 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Another Example: [Make Your Pandas Code Lightning Fast - Rob Mulla](https://www.youtube.com/watch?v=SAFmrTnEHLg)