# Pandas Transformations

Keep from your current content:

Adding columns (calculated columns)

.apply(), .map(), .applymap()

If/Then logic with np.where()

Lambda functions

Conditional column creation

What to add:

.assign() for chaining

.pipe() for custom functions

.transform() vs .apply()

Window functions: .rolling(), .expanding()

Binning: pd.cut(), pd.qcut()

In [1]:
import pandas as pd

In [2]:
# Load the student grades dataset straight from its Google Drive download link.
df = pd.read_csv(
    "https://drive.google.com/uc?id=1oE-3rt17bFW7fOzDIjwFSEMPTIV3NvcO"
)
# Preview the first three rows.
df.head(3)

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,grade_math_t1,grade_language_t1,grade_science_t1,grade_math_t2,grade_language_t2,grade_science_t2,treatment,date_of_birth
0,57,6,A,wvqgd@cunb.edu,1,1,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27
1,57,6,A,j0ihe@cunb.edu,1,2,7.859077,7.206984,6.994455,7.548812,6.896719,6.68419,1,1997-06-24
2,57,6,A,wcjgk@cunb.edu,2,1,7.118976,8.057449,8.57321,6.181019,7.119492,7.635253,1,1997-04-23


## calculated columns

- Direct operations on entire columns are the fastest, vectorized way to manipulate data in pandas.
- They run on an optimized C/NumPy backend.
- Use them for simple arithmetic, logical, or string operations (+, *, sum(), .str.upper()) on columns.

In [None]:
# Work on a copy so the raw frame `df` stays untouched.
df_a = df.copy()
# Row-wise mean (axis=1) of the two math term grades, computed in a single
# vectorized call instead of a Python-level loop.
df_a["math_avg"] = df_a[["grade_math_t1", "grade_math_t2"]].mean(axis=1)
# Display the result (a bare expression on the last line of a cell is rendered).
df_a

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,treatment,date_of_birth,math_avg
0,57,6,A,wvqgd@cunb.edu,1,1,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27,1.610797
1,57,6,A,j0ihe@cunb.edu,1,2,7.859077,7.206984,6.994455,7.548812,6.896719,6.684190,1,1997-06-24,7.703944
2,57,6,A,wcjgk@cunb.edu,2,1,7.118976,8.057449,8.573210,6.181019,7.119492,7.635253,1,1997-04-23,6.649997
3,57,6,A,mzmqb@cunb.edu,1,1,6.973737,7.388008,6.628448,7.593102,8.007373,7.247812,1,1997-02-24,7.283419
4,57,6,A,s6n0y@cunb.edu,1,1,6.574877,6.773626,7.844316,7.611065,7.809814,8.880504,1,1996-09-05,7.092971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,946,8,D,jj36e@cunb.edu,1,1,4.938375,5.306647,4.985917,6.162418,6.530690,6.209960,0,1995-07-04,5.550397
1506,946,8,D,ca1dg@cunb.edu,1,1,7.661931,8.360153,7.937165,8.179364,8.877586,8.454598,0,1995-08-23,7.920647
1507,946,8,D,amdrx@cunb.edu,1,1,9.248758,8.534791,8.056641,8.773601,8.059634,7.581484,0,1994-12-15,9.011180
1508,946,8,D,yn5ug@cunb.edu,1,2,5.689228,5.071503,5.338547,6.382995,5.765270,6.032315,0,1994-09-18,6.036111


## column renaming and sorting

In [23]:
# Preview the first three rows of the working copy.
df_a.head(3)

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,math_avg,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,treatment,date_of_birth
0,57,6,A,wvqgd@cunb.edu,1,1,1.610797,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27
1,57,6,A,j0ihe@cunb.edu,1,2,7.703944,7.859077,7.206984,6.994455,7.548812,6.896719,6.68419,1,1997-06-24
2,57,6,A,wcjgk@cunb.edu,2,1,6.649997,7.118976,8.057449,8.57321,6.181019,7.119492,7.635253,1,1997-04-23


In [15]:
# Normalise the headers through the Index `.str` accessor: lower-case every
# name, then drop the "grade_" substring (e.g. "grade_math_t1" -> "math_t1").
df_a.columns = (
    df_a.columns
    .str.lower()
    .str.replace("grade_", "")
)
df_a

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,treatment,date_of_birth,math_avg
0,57,6,A,wvqgd@cunb.edu,1,1,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27,1.610797
1,57,6,A,j0ihe@cunb.edu,1,2,7.859077,7.206984,6.994455,7.548812,6.896719,6.684190,1,1997-06-24,7.703944
2,57,6,A,wcjgk@cunb.edu,2,1,7.118976,8.057449,8.573210,6.181019,7.119492,7.635253,1,1997-04-23,6.649997
3,57,6,A,mzmqb@cunb.edu,1,1,6.973737,7.388008,6.628448,7.593102,8.007373,7.247812,1,1997-02-24,7.283419
4,57,6,A,s6n0y@cunb.edu,1,1,6.574877,6.773626,7.844316,7.611065,7.809814,8.880504,1,1996-09-05,7.092971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,946,8,D,jj36e@cunb.edu,1,1,4.938375,5.306647,4.985917,6.162418,6.530690,6.209960,0,1995-07-04,5.550397
1506,946,8,D,ca1dg@cunb.edu,1,1,7.661931,8.360153,7.937165,8.179364,8.877586,8.454598,0,1995-08-23,7.920647
1507,946,8,D,amdrx@cunb.edu,1,1,9.248758,8.534791,8.056641,8.773601,8.059634,7.581484,0,1994-12-15,9.011180
1508,946,8,D,yn5ug@cunb.edu,1,2,5.689228,5.071503,5.338547,6.382995,5.765270,6.032315,0,1994-09-18,6.036111


In [None]:
# Reorder the columns by selecting them in the desired order; this moves
# "math_avg" in front of the individual term grades.
df_a = df_a.loc[
    :,
    [
        "school_id",
        "grade",
        "class",
        "student_id",
        "sex",
        "nationality",
        "math_avg",
        "math_t1",
        "language_t1",
        "science_t1",
        "math_t2",
        "language_t2",
        "science_t2",
        "treatment",
        "date_of_birth",
    ],
]
df_a

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,math_avg,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,treatment,date_of_birth
0,57,6,A,wvqgd@cunb.edu,1,1,1.610797,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27
1,57,6,A,j0ihe@cunb.edu,1,2,7.703944,7.859077,7.206984,6.994455,7.548812,6.896719,6.684190,1,1997-06-24
2,57,6,A,wcjgk@cunb.edu,2,1,6.649997,7.118976,8.057449,8.573210,6.181019,7.119492,7.635253,1,1997-04-23
3,57,6,A,mzmqb@cunb.edu,1,1,7.283419,6.973737,7.388008,6.628448,7.593102,8.007373,7.247812,1,1997-02-24
4,57,6,A,s6n0y@cunb.edu,1,1,7.092971,6.574877,6.773626,7.844316,7.611065,7.809814,8.880504,1,1996-09-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,946,8,D,jj36e@cunb.edu,1,1,5.550397,4.938375,5.306647,4.985917,6.162418,6.530690,6.209960,0,1995-07-04
1506,946,8,D,ca1dg@cunb.edu,1,1,7.920647,7.661931,8.360153,7.937165,8.179364,8.877586,8.454598,0,1995-08-23
1507,946,8,D,amdrx@cunb.edu,1,1,9.011180,9.248758,8.534791,8.056641,8.773601,8.059634,7.581484,0,1994-12-15
1508,946,8,D,yn5ug@cunb.edu,1,2,6.036111,5.689228,5.071503,5.338547,6.382995,5.765270,6.032315,0,1994-09-18


## map()

- Element-wise transformation of a Series (single column) using a dictionary, Series, or function.
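- `Series.map()` operates on a single Series (one column); the element-wise counterpart for a whole DataFrame is `DataFrame.applymap()` (renamed `DataFrame.map()` in pandas 2.1).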
- Useful for replacing values or simple lookups.
- Faster than apply() for Series, but slower than vectorized operations.


In [33]:
# Translate the numeric codes in "sex" into labels with a dictionary lookup;
# values that are not keys of the dictionary become NaN.
df_b = df_a.copy()
df_b["sex_coded"] = df_b["sex"].map(
    {
        1: "male",
        2: "female",
    }
)

# Re-select the columns so that "sex_coded" sits right next to "sex".
df_b = df_b.loc[
    :,
    [
        "school_id",
        "grade",
        "class",
        "student_id",
        "sex",
        "sex_coded",
        "nationality",
        "math_avg",
        "math_t1",
        "language_t1",
        "science_t1",
        "math_t2",
        "language_t2",
        "science_t2",
        "treatment",
        "date_of_birth",
    ],
]
df_b

Unnamed: 0,school_id,grade,class,student_id,sex,sex_coded,nationality,math_avg,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,treatment,date_of_birth
0,57,6,A,wvqgd@cunb.edu,1,male,1,1.610797,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27
1,57,6,A,j0ihe@cunb.edu,1,male,2,7.703944,7.859077,7.206984,6.994455,7.548812,6.896719,6.684190,1,1997-06-24
2,57,6,A,wcjgk@cunb.edu,2,female,1,6.649997,7.118976,8.057449,8.573210,6.181019,7.119492,7.635253,1,1997-04-23
3,57,6,A,mzmqb@cunb.edu,1,male,1,7.283419,6.973737,7.388008,6.628448,7.593102,8.007373,7.247812,1,1997-02-24
4,57,6,A,s6n0y@cunb.edu,1,male,1,7.092971,6.574877,6.773626,7.844316,7.611065,7.809814,8.880504,1,1996-09-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,946,8,D,jj36e@cunb.edu,1,male,1,5.550397,4.938375,5.306647,4.985917,6.162418,6.530690,6.209960,0,1995-07-04
1506,946,8,D,ca1dg@cunb.edu,1,male,1,7.920647,7.661931,8.360153,7.937165,8.179364,8.877586,8.454598,0,1995-08-23
1507,946,8,D,amdrx@cunb.edu,1,male,1,9.011180,9.248758,8.534791,8.056641,8.773601,8.059634,7.581484,0,1994-12-15
1508,946,8,D,yn5ug@cunb.edu,1,male,2,6.036111,5.689228,5.071503,5.338547,6.382995,5.765270,6.032315,0,1994-09-18


In [None]:
def squared(x):
    """Return ``x`` raised to the power of two."""
    return pow(x, 2)


# Pass the named function itself to apply(); wrapping it in
# `lambda x: squared(x)` only adds an extra call per element.
df1["y^2"] = df1["y"].apply(squared)  # plain assignment appends the column at the end
# insert() places a column at an explicit position. It raises ValueError when
# the column already exists, so re-running this cell needs a fresh df1.
df1.insert(loc=1, column="x^2", value=df1["x"].apply(squared))
df1.insert(loc=2, column="x^4", value=df1["x"].apply(lambda x: x**4))  # a short one-off function can be written inline
df1

Unnamed: 0,x,x^2,y,s,ones,no,na,y^2
1,0.0,0.0,1.0,,1,,,1.0
2,1.0,1.0,2.0,,1,,,4.0
6,90.0,8100.0,19.0,,1,,,361.0
7,0.0,0.0,1.0,,1,,,1.0
8,100.0,10000.0,1.0,,1,,,1.0


## apply()
- Row-wise or column-wise operations requiring custom logic or multiple columns.
- Works on DataFrames (axis=0 for columns, axis=1 for rows).
- Slower than vectorized operations (Python-level loops).




In [35]:
# Element-wise custom logic via Series.apply(): averages below 2 are squared,
# every other value is kept unchanged.
df_b["math_extra"] = df_b["math_avg"].apply(
    lambda avg: avg**2 if avg < 2 else avg
)
df_b


Unnamed: 0,school_id,grade,class,student_id,sex,sex_coded,nationality,math_avg,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,treatment,date_of_birth,math_extra
0,57,6,A,wvqgd@cunb.edu,1,male,1,1.610797,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27,2.594667
1,57,6,A,j0ihe@cunb.edu,1,male,2,7.703944,7.859077,7.206984,6.994455,7.548812,6.896719,6.684190,1,1997-06-24,7.703944
2,57,6,A,wcjgk@cunb.edu,2,female,1,6.649997,7.118976,8.057449,8.573210,6.181019,7.119492,7.635253,1,1997-04-23,6.649997
3,57,6,A,mzmqb@cunb.edu,1,male,1,7.283419,6.973737,7.388008,6.628448,7.593102,8.007373,7.247812,1,1997-02-24,7.283419
4,57,6,A,s6n0y@cunb.edu,1,male,1,7.092971,6.574877,6.773626,7.844316,7.611065,7.809814,8.880504,1,1996-09-05,7.092971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,946,8,D,jj36e@cunb.edu,1,male,1,5.550397,4.938375,5.306647,4.985917,6.162418,6.530690,6.209960,0,1995-07-04,5.550397
1506,946,8,D,ca1dg@cunb.edu,1,male,1,7.920647,7.661931,8.360153,7.937165,8.179364,8.877586,8.454598,0,1995-08-23,7.920647
1507,946,8,D,amdrx@cunb.edu,1,male,1,9.011180,9.248758,8.534791,8.056641,8.773601,8.059634,7.581484,0,1994-12-15,9.011180
1508,946,8,D,yn5ug@cunb.edu,1,male,2,6.036111,5.689228,5.071503,5.338547,6.382995,5.765270,6.032315,0,1994-09-18,6.036111


## applymap()
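- Applies a function element-wise to every cell of a DataFrame (the DataFrame counterpart of `Series.map()`).
- Works on DataFrames only and takes no `axis` argument; the function receives one scalar value at a time and returns one value.
- Use it for per-cell custom logic that can’t be vectorized (e.g., upper-casing every string cell while leaving other values unchanged).
- For row-wise or column-wise logic that needs several columns at once, use `apply()` with `axis=0` (columns) or `axis=1` (rows) instead; use `axis=1` sparingly—it’s the slowest.
- Slower than vectorized operations (Python-level loops).
- Deprecated since pandas 2.1 in favor of `DataFrame.map()`.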

In [None]:
def to_uppercase(x):
    """Upper-case ``x`` when it is a string; return any other value unchanged."""
    return x.upper() if isinstance(x, str) else x


# Apply to_uppercase to every cell of the frame.
# DataFrame.applymap() was deprecated in pandas 2.1 and renamed to
# DataFrame.map(); use the new name when the installed pandas provides it and
# fall back to applymap() on older versions.
# NOTE: this rebinds `df`, replacing the raw frame loaded earlier.
df = (df.map if hasattr(df, "map") else df.applymap)(to_uppercase)


Index(['school_id', 'grade', 'class', 'student_id', 'sex', 'nationality',
       'nationality_coded', 'math_avg', 'math_t1', 'language_t1', 'science_t1',
       'math_t2', 'language_t2', 'science_t2', 'treatment', 'date_of_birth'],
      dtype='object')