# Pandas Transformations

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("https://drive.google.com/uc?id=1oE-3rt17bFW7fOzDIjwFSEMPTIV3NvcO")
df[:3]

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,grade_math_t1,grade_language_t1,grade_science_t1,grade_math_t2,grade_language_t2,grade_science_t2,treatment,date_of_birth
0,57,6,A,wvqgd@cunb.edu,1,1,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27
1,57,6,A,j0ihe@cunb.edu,1,2,7.859077,7.206984,6.994455,7.548812,6.896719,6.68419,1,1997-06-24
2,57,6,A,wcjgk@cunb.edu,2,1,7.118976,8.057449,8.57321,6.181019,7.119492,7.635253,1,1997-04-23


## where and mask


### pd.where()
- conditional replacement of values -> keep values where condition is true, replace NaN where condition is false
- replace values where condition is false, with `other = replacement_value`
- `df.where(cond, other=<no_default>, *, inplace=False, axis=None, level=None)`


### pd.mask()
- excludes values where condition is True
- `df.mask(condition, other=replacement)` replaces values where condition is True
- Keeps values where condition is false, excludes where True

In [3]:
df["grade_language_t1"].where(df["grade_language_t1"] >= 5) # shows NaN for values < 5
df["grade_language_t1"].where(df["grade_language_t1"] >= 5, other=5) # replaces values < 5 with 5

0        5.000000
1        7.206984
2        8.057449
3        7.388008
4        6.773626
          ...    
1505     5.306647
1506     8.360153
1507     8.534791
1508     5.071503
1509    10.000000
Name: grade_language_t1, Length: 1510, dtype: float64

In [4]:
df["grade_math_t1"].mask(df["grade_math_t1"] > 8) # shows NaN for values > 8
df["grade_math_t1"].mask(df["grade_math_t1"] > 8, other=10) # replaces values > 8 with 10

0        2.046285
1        7.859077
2        7.118976
3        6.973737
4        6.574877
          ...    
1505     4.938375
1506     7.661931
1507    10.000000
1508     5.689228
1509    10.000000
Name: grade_math_t1, Length: 1510, dtype: float64

In [5]:

math_t2_mean = df[df["grade_math_t2"] <= 10]["grade_math_t2"].mean()
lang_t2_mean = df[df["grade_language_t2"] <= 10]["grade_language_t2"].mean()
science_t2_mean = df[df["grade_science_t2"] <= 10]["grade_science_t2"].mean()

# replace values with mask() - mask excludes values that meet condition and replaces the remaining values with specified value
df["grade_math_t2"] = df["grade_math_t2"].mask(df["grade_math_t2"] > 10, math_t2_mean)
df["grade_language_t2"] = df["grade_language_t2"].mask(df["grade_language_t2"] > 10, lang_t2_mean)

# replace values with where()
df["grade_science_t2"] = df["grade_science_t2"].where(
    df["grade_science_t2"] <= 10, other=science_t2_mean)

In [6]:
df.describe()

Unnamed: 0,school_id,grade,sex,nationality,grade_math_t1,grade_language_t1,grade_science_t1,grade_math_t2,grade_language_t2,grade_science_t2,treatment
count,1510.0,1510.0,1510.0,1510.0,1510.0,1510.0,1510.0,1508.0,1507.0,1510.0,1510.0
mean,457.321192,7.037086,1.203311,1.195364,6.653575,6.628136,6.642331,6.931952,6.901913,6.913011,0.501325
std,315.676789,0.817277,0.402595,0.396612,1.701152,1.713346,1.69825,1.945345,1.945886,1.922238,0.500164
min,57.0,6.0,1.0,1.0,0.0,0.993821,0.0,0.0,0.0,0.0,0.0
25%,141.0,6.0,1.0,1.0,5.563173,5.530065,5.472154,5.635373,5.601699,5.682996,0.0
50%,458.0,7.0,1.0,1.0,6.829838,6.775808,6.757819,7.100935,7.054068,7.003384,1.0
75%,812.0,8.0,1.0,1.0,7.943985,7.852015,7.921141,8.375933,8.355651,8.389772,1.0
max,946.0,8.0,2.0,2.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0


## calculated columns

- Direct operations on entire columns is the fastest vectorized way to manipulate data in pandas.
- optimized C/NumPy backend 
- For simple arithmetic, logical, or string operations (+, *, sum(), str.upper()) on columns.

In [94]:
df_a = df.copy()
df_a["math_avg"] = df_a[["grade_math_t1", "grade_math_t2"]].mean(axis=1)

## column renaming and sorting

In [95]:
df_a[:3]

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,grade_math_t1,grade_language_t1,grade_science_t1,grade_math_t2,grade_language_t2,grade_science_t2,treatment,date_of_birth,math_avg
0,57,6,A,wvqgd@cunb.edu,1,1,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27,1.610797
1,57,6,A,j0ihe@cunb.edu,1,2,7.859077,7.206984,6.994455,7.548812,6.896719,6.68419,1,1997-06-24,7.703944
2,57,6,A,wcjgk@cunb.edu,2,1,7.118976,8.057449,8.57321,6.181019,7.119492,7.635253,1,1997-04-23,6.649997


In [96]:
df_a.columns = df_a.columns.str.lower().str.replace("grade_", "")
df_a

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,treatment,date_of_birth,math_avg
0,57,6,A,wvqgd@cunb.edu,1,1,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27,1.610797
1,57,6,A,j0ihe@cunb.edu,1,2,7.859077,7.206984,6.994455,7.548812,6.896719,6.684190,1,1997-06-24,7.703944
2,57,6,A,wcjgk@cunb.edu,2,1,7.118976,8.057449,8.573210,6.181019,7.119492,7.635253,1,1997-04-23,6.649997
3,57,6,A,mzmqb@cunb.edu,1,1,6.973737,7.388008,6.628448,7.593102,8.007373,7.247812,1,1997-02-24,7.283419
4,57,6,A,s6n0y@cunb.edu,1,1,6.574877,6.773626,7.844316,7.611065,7.809814,8.880504,1,1996-09-05,7.092971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,946,8,D,jj36e@cunb.edu,1,1,4.938375,5.306647,4.985917,6.162418,6.530690,6.209960,0,1995-07-04,5.550397
1506,946,8,D,ca1dg@cunb.edu,1,1,7.661931,8.360153,7.937165,8.179364,8.877586,8.454598,0,1995-08-23,7.920647
1507,946,8,D,amdrx@cunb.edu,1,1,9.248758,8.534791,8.056641,8.773601,8.059634,7.581484,0,1994-12-15,9.011180
1508,946,8,D,yn5ug@cunb.edu,1,2,5.689228,5.071503,5.338547,6.382995,5.765270,6.032315,0,1994-09-18,6.036111


In [97]:
df_a = df_a[[ 
    "school_id",
    "grade",
    "class",
    "student_id",
    "sex",
    "nationality",
    "math_avg",
    "math_t1",
    "language_t1",
    "science_t1",
    "math_t2",
    "language_t2",
    "science_t2",
    "treatment",
    "date_of_birth",
]]
df_a

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,math_avg,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,treatment,date_of_birth
0,57,6,A,wvqgd@cunb.edu,1,1,1.610797,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27
1,57,6,A,j0ihe@cunb.edu,1,2,7.703944,7.859077,7.206984,6.994455,7.548812,6.896719,6.684190,1,1997-06-24
2,57,6,A,wcjgk@cunb.edu,2,1,6.649997,7.118976,8.057449,8.573210,6.181019,7.119492,7.635253,1,1997-04-23
3,57,6,A,mzmqb@cunb.edu,1,1,7.283419,6.973737,7.388008,6.628448,7.593102,8.007373,7.247812,1,1997-02-24
4,57,6,A,s6n0y@cunb.edu,1,1,7.092971,6.574877,6.773626,7.844316,7.611065,7.809814,8.880504,1,1996-09-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,946,8,D,jj36e@cunb.edu,1,1,5.550397,4.938375,5.306647,4.985917,6.162418,6.530690,6.209960,0,1995-07-04
1506,946,8,D,ca1dg@cunb.edu,1,1,7.920647,7.661931,8.360153,7.937165,8.179364,8.877586,8.454598,0,1995-08-23
1507,946,8,D,amdrx@cunb.edu,1,1,9.011180,9.248758,8.534791,8.056641,8.773601,8.059634,7.581484,0,1994-12-15
1508,946,8,D,yn5ug@cunb.edu,1,2,6.036111,5.689228,5.071503,5.338547,6.382995,5.765270,6.032315,0,1994-09-18


## map()

- Element-wise transformation of a Series (single column) using a dictionary, Series, or function.
- Only works on Series (not DataFrames).
- Useful for replacing values or simple lookups.
- Faster than apply() for Series, but slower than vectorized operations.


In [98]:
# Map values using a dictionary
df_b = df_a.copy()
df_b["sex_coded"] = df_b["sex"].map({1: "male", 2: "female"})

## apply()
- Row-wise or column-wise operations requiring custom logic or multiple columns.
- Works on DataFrames for rows (axis=1) or columns (axis=0). 
- Slower than vectorized operations (Python-level loops).

In [99]:
df_b["math_extra"] = df_b["math_avg"].apply(lambda x: x**2 if x < 2 else x)
df_b


Unnamed: 0,school_id,grade,class,student_id,sex,nationality,math_avg,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,treatment,date_of_birth,sex_coded,math_extra
0,57,6,A,wvqgd@cunb.edu,1,1,1.610797,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27,male,2.594667
1,57,6,A,j0ihe@cunb.edu,1,2,7.703944,7.859077,7.206984,6.994455,7.548812,6.896719,6.684190,1,1997-06-24,male,7.703944
2,57,6,A,wcjgk@cunb.edu,2,1,6.649997,7.118976,8.057449,8.573210,6.181019,7.119492,7.635253,1,1997-04-23,female,6.649997
3,57,6,A,mzmqb@cunb.edu,1,1,7.283419,6.973737,7.388008,6.628448,7.593102,8.007373,7.247812,1,1997-02-24,male,7.283419
4,57,6,A,s6n0y@cunb.edu,1,1,7.092971,6.574877,6.773626,7.844316,7.611065,7.809814,8.880504,1,1996-09-05,male,7.092971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,946,8,D,jj36e@cunb.edu,1,1,5.550397,4.938375,5.306647,4.985917,6.162418,6.530690,6.209960,0,1995-07-04,male,5.550397
1506,946,8,D,ca1dg@cunb.edu,1,1,7.920647,7.661931,8.360153,7.937165,8.179364,8.877586,8.454598,0,1995-08-23,male,7.920647
1507,946,8,D,amdrx@cunb.edu,1,1,9.011180,9.248758,8.534791,8.056641,8.773601,8.059634,7.581484,0,1994-12-15,male,9.011180
1508,946,8,D,yn5ug@cunb.edu,1,2,6.036111,5.689228,5.071503,5.338547,6.382995,5.765270,6.032315,0,1994-09-18,male,6.036111


## rearranging columns

In [100]:
df_b.columns

Index(['school_id', 'grade', 'class', 'student_id', 'sex', 'nationality',
       'math_avg', 'math_t1', 'language_t1', 'science_t1', 'math_t2',
       'language_t2', 'science_t2', 'treatment', 'date_of_birth', 'sex_coded',
       'math_extra'],
      dtype='str')

In [101]:
df_b = df_b[
    [
        "school_id",
        "grade",
        "class",
        "student_id",
        "date_of_birth",
        "treatment",
        "nationality",
        "sex",
        "sex_coded",
        "math_avg",
        "math_extra",
        "math_t1",
        "language_t1",
        "science_t1",
        "math_t2",
        "language_t2",
        "science_t2",
    ]
]
df_b


Unnamed: 0,school_id,grade,class,student_id,date_of_birth,treatment,nationality,sex,sex_coded,math_avg,math_extra,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2
0,57,6,A,wvqgd@cunb.edu,1997-07-27,1,1,1,male,1.610797,2.594667,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368
1,57,6,A,j0ihe@cunb.edu,1997-06-24,1,2,1,male,7.703944,7.703944,7.859077,7.206984,6.994455,7.548812,6.896719,6.684190
2,57,6,A,wcjgk@cunb.edu,1997-04-23,1,1,2,female,6.649997,6.649997,7.118976,8.057449,8.573210,6.181019,7.119492,7.635253
3,57,6,A,mzmqb@cunb.edu,1997-02-24,1,1,1,male,7.283419,7.283419,6.973737,7.388008,6.628448,7.593102,8.007373,7.247812
4,57,6,A,s6n0y@cunb.edu,1996-09-05,1,1,1,male,7.092971,7.092971,6.574877,6.773626,7.844316,7.611065,7.809814,8.880504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,946,8,D,jj36e@cunb.edu,1995-07-04,0,1,1,male,5.550397,5.550397,4.938375,5.306647,4.985917,6.162418,6.530690,6.209960
1506,946,8,D,ca1dg@cunb.edu,1995-08-23,0,1,1,male,7.920647,7.920647,7.661931,8.360153,7.937165,8.179364,8.877586,8.454598
1507,946,8,D,amdrx@cunb.edu,1994-12-15,0,1,1,male,9.011180,9.011180,9.248758,8.534791,8.056641,8.773601,8.059634,7.581484
1508,946,8,D,yn5ug@cunb.edu,1994-09-18,0,2,1,male,6.036111,6.036111,5.689228,5.071503,5.338547,6.382995,5.765270,6.032315


## insert() 
- The insert() method allows you to add a column at any position in a DataFrame.

In [102]:
condition = (df_b["language_t1"] < 2) & (df_b["language_t2"] < 2)

df_b.insert(
    loc=11,
    column="language_extra",
    value=np.where(
        condition, (df_b["language_t1"] + df_b["language_t2"]) / 2, df_b["language_t1"]
    ),
)


df_b

Unnamed: 0,school_id,grade,class,student_id,date_of_birth,treatment,nationality,sex,sex_coded,math_avg,math_extra,language_extra,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2
0,57,6,A,wvqgd@cunb.edu,1997-07-27,1,1,1,male,1.610797,2.594667,2.369783,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368
1,57,6,A,j0ihe@cunb.edu,1997-06-24,1,2,1,male,7.703944,7.703944,7.206984,7.859077,7.206984,6.994455,7.548812,6.896719,6.684190
2,57,6,A,wcjgk@cunb.edu,1997-04-23,1,1,2,female,6.649997,6.649997,8.057449,7.118976,8.057449,8.573210,6.181019,7.119492,7.635253
3,57,6,A,mzmqb@cunb.edu,1997-02-24,1,1,1,male,7.283419,7.283419,7.388008,6.973737,7.388008,6.628448,7.593102,8.007373,7.247812
4,57,6,A,s6n0y@cunb.edu,1996-09-05,1,1,1,male,7.092971,7.092971,6.773626,6.574877,6.773626,7.844316,7.611065,7.809814,8.880504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,946,8,D,jj36e@cunb.edu,1995-07-04,0,1,1,male,5.550397,5.550397,5.306647,4.938375,5.306647,4.985917,6.162418,6.530690,6.209960
1506,946,8,D,ca1dg@cunb.edu,1995-08-23,0,1,1,male,7.920647,7.920647,8.360153,7.661931,8.360153,7.937165,8.179364,8.877586,8.454598
1507,946,8,D,amdrx@cunb.edu,1994-12-15,0,1,1,male,9.011180,9.011180,8.534791,9.248758,8.534791,8.056641,8.773601,8.059634,7.581484
1508,946,8,D,yn5ug@cunb.edu,1994-09-18,0,2,1,male,6.036111,6.036111,5.071503,5.689228,5.071503,5.338547,6.382995,5.765270,6.032315


 ## assign()
- Add one or more new columns in a method-chaining workflow.
- Create new columns and return a new DataFrame without modifying the original.
- Either appends a new column or assigns new values to an existing column.
- You can query the df like a subquery in SQL, assign() returns a new df, leaving the original unchanged.
- Chaining: `df.assign(...).query(...)`

In [None]:
# overwrite values conditionally
df_b.assign(grade=0)

# create new column based on existing column
df_b.assign(math_normalized=(df_b["math_avg"] - df_b["math_avg"].mean()) / df_b["math_avg"].std())[
    ["math_avg", "math_normalized"]
].query("math_normalized > 1.75")

Unnamed: 0,math_avg,math_normalized
31,9.868626,1.764926
97,10.0,1.84027
451,10.0,1.84027
740,9.850244,1.754384
900,9.855506,1.757402
976,10.0,1.84027
983,9.893287,1.779069
1209,9.859381,1.759624
1364,9.84458,1.751135
1370,9.986122,1.832311


## assign() multiple columns

In [109]:
df_b.assign(
    math_improvement=df_b['math_t2'] - df_b['math_t1'],
    language_improvement=df_b['language_t2'] - df_b['language_t1']
).query('math_improvement > 2 and language_improvement > 2')

# df_b # unchanged

Unnamed: 0,school_id,grade,class,student_id,date_of_birth,treatment,nationality,sex,sex_coded,math_avg,math_extra,language_extra,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,math_improvement,language_improvement
26,57,6,C,ealcb@cunb.edu,1997-05-03,1,2,1,male,8.476275,8.476275,7.308350,7.386236,7.308350,7.040684,9.566315,9.488430,9.220763,2.180080,2.180080
28,57,6,C,mzlrh@cunb.edu,1997-06-19,1,1,1,male,7.482958,7.482958,6.050160,5.794664,6.050160,7.155896,9.171252,9.426748,10.000000,3.376588,3.376588
35,57,6,C,wbcfx@cunb.edu,1996-10-24,1,2,1,male,8.470308,8.470308,6.494307,6.940615,6.494307,6.535804,10.000000,9.664280,9.705778,3.059385,3.169974
42,57,6,D,mcg7z@cunb.edu,1997-04-22,1,2,1,male,3.932210,3.932210,1.740182,2.326798,1.740182,2.249668,5.537621,4.951005,5.460491,3.210823,3.210823
89,57,7,D,g5hdy@cunb.edu,1996-07-02,1,1,1,male,7.586657,7.586657,4.635556,6.185302,4.635556,5.475284,8.988013,7.438267,8.277995,2.802711,2.802711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1355,881,8,D,5mbrn@cunb.edu,1995-02-13,1,1,1,male,7.554429,7.554429,7.717830,6.486369,7.717830,7.744453,8.622489,9.853950,9.880572,2.136120,2.136120
1399,946,6,C,ra7mm@cunb.edu,1997-02-22,1,2,1,male,7.502663,7.502663,6.674345,6.384862,6.674345,5.853809,8.620463,8.909946,8.089409,2.235601,2.235601
1414,946,7,A,ufurz@cunb.edu,1996-08-12,1,2,1,male,5.911458,5.911458,4.184833,4.785588,4.184833,5.138892,7.037328,6.436573,7.390633,2.251740,2.251740
1429,946,7,B,obkej@cunb.edu,1996-06-07,1,2,1,male,7.098223,7.098223,5.782356,5.572450,5.782356,6.105611,8.623995,8.833901,9.157156,3.051545,3.051545


## pd.col() and method chaining

In [37]:
# pd.col() works since pandas 3.0 which allows chaining methods
df_b.assign(
    math_improvement=(pd.col("math_t2") - pd.col("math_t1")).round(),
    language_improvement=(pd.col("language_t2") - pd.col("language_t1")).round(),
).query("math_improvement > 2 and language_improvement > 2")


Unnamed: 0,school_id,grade,class,student_id,date_of_birth,treatment,nationality,sex,sex_coded,math_avg,math_extra,language_extra,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,math_improvement,language_improvement
28,57,6,C,mzlrh@cunb.edu,1997-06-19,1,1,1,male,7.482958,7.482958,6.05016,5.794664,6.05016,7.155896,9.171252,9.426748,10.0,3.0,3.0
35,57,6,C,wbcfx@cunb.edu,1996-10-24,1,2,1,male,8.470308,8.470308,6.494307,6.940615,6.494307,6.535804,10.0,9.66428,9.705778,3.0,3.0
42,57,6,D,mcg7z@cunb.edu,1997-04-22,1,2,1,male,3.93221,3.93221,1.740182,2.326798,1.740182,2.249668,5.537621,4.951005,5.460491,3.0,3.0
89,57,7,D,g5hdy@cunb.edu,1996-07-02,1,1,1,male,7.586657,7.586657,4.635556,6.185302,4.635556,5.475284,8.988013,7.438267,8.277995,3.0,3.0
227,60,7,B,ci0nk@cunb.edu,1996-08-10,1,2,1,male,8.099917,8.099917,5.412063,6.199834,5.412063,5.894175,10.0,9.546667,10.0,4.0,4.0
359,141,7,A,mioli@cunb.edu,1996-03-27,1,2,2,female,8.554796,8.554796,7.3426,7.109591,7.3426,5.471583,10.0,10.0,8.924098,3.0,3.0
383,141,7,C,2hbgs@cunb.edu,1996-01-14,1,2,1,male,8.674163,8.674163,6.86235,7.348327,6.86235,5.577604,10.0,10.0,9.297259,3.0,3.0
408,141,8,B,n0nxd@cunb.edu,1995-04-19,1,2,1,male,7.481388,7.481388,5.900925,6.205239,5.900925,7.419231,8.757537,8.453223,9.971529,3.0,3.0
431,141,8,D,6amst@cunb.edu,1995-02-14,1,1,1,male,6.311199,6.311199,5.348811,4.691448,5.348811,4.924348,7.930949,8.588312,8.163849,3.0,3.0
470,262,6,C,xsuhy@cunb.edu,1996-11-28,1,1,1,male,5.181966,5.181966,3.689339,3.873139,3.689339,4.560038,6.490793,6.306993,7.177691,3.0,3.0


## df.transform()
- Group-wise operations where the output must match the input shape (e.g., normalization, filling NA with group mean).


In [121]:
# Normalize math_avg by grade
df_b.groupby("grade")["math_avg"].transform(
    lambda x: (x - x.mean()) / x.std()
)
df_b

Unnamed: 0,school_id,grade,class,student_id,date_of_birth,treatment,nationality,sex,sex_coded,math_avg,math_extra,language_extra,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,math_grouped_normalized
0,57,6,A,wvqgd@cunb.edu,1997-07-27,1,1,1,male,1.610797,2.594667,2.369783,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,-2.916020
1,57,6,A,j0ihe@cunb.edu,1997-06-24,1,2,1,male,7.703944,7.703944,7.206984,7.859077,7.206984,6.994455,7.548812,6.896719,6.684190,0.533368
2,57,6,A,wcjgk@cunb.edu,1997-04-23,1,1,2,female,6.649997,6.649997,8.057449,7.118976,8.057449,8.573210,6.181019,7.119492,7.635253,-0.063282
3,57,6,A,mzmqb@cunb.edu,1997-02-24,1,1,1,male,7.283419,7.283419,7.388008,6.973737,7.388008,6.628448,7.593102,8.007373,7.247812,0.295305
4,57,6,A,s6n0y@cunb.edu,1996-09-05,1,1,1,male,7.092971,7.092971,6.773626,6.574877,6.773626,7.844316,7.611065,7.809814,8.880504,0.187490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,946,8,D,jj36e@cunb.edu,1995-07-04,0,1,1,male,5.550397,5.550397,5.306647,4.938375,5.306647,4.985917,6.162418,6.530690,6.209960,-0.764210
1506,946,8,D,ca1dg@cunb.edu,1995-08-23,0,1,1,male,7.920647,7.920647,8.360153,7.661931,8.360153,7.937165,8.179364,8.877586,8.454598,0.647955
1507,946,8,D,amdrx@cunb.edu,1994-12-15,0,1,1,male,9.011180,9.011180,8.534791,9.248758,8.534791,8.056641,8.773601,8.059634,7.581484,1.297680
1508,946,8,D,yn5ug@cunb.edu,1994-09-18,0,2,1,male,6.036111,6.036111,5.071503,5.689228,5.071503,5.338547,6.382995,5.765270,6.032315,-0.474827


In [43]:
df_b = df_b[
    [
        "school_id",
        "grade",
        "class",
        "student_id",
        "date_of_birth",
        "treatment",
        "nationality",
        "sex",
        "sex_coded",
        "math_avg",
        "math_extra",
        "language_extra",
        "math_t1",
        "language_t1",
        "science_t1",
        "math_t2",
        "language_t2",
        "science_t2",
    ]
]
df_b


Unnamed: 0,school_id,grade,class,student_id,date_of_birth,treatment,nationality,sex,sex_coded,math_avg,math_extra,language_extra,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2
0,57,6,A,wvqgd@cunb.edu,1997-07-27,1,1,1,male,1.610797,2.594667,2.369783,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368
1,57,6,A,j0ihe@cunb.edu,1997-06-24,1,2,1,male,7.703944,7.703944,7.206984,7.859077,7.206984,6.994455,7.548812,6.896719,6.684190
2,57,6,A,wcjgk@cunb.edu,1997-04-23,1,1,2,female,6.649997,6.649997,8.057449,7.118976,8.057449,8.573210,6.181019,7.119492,7.635253
3,57,6,A,mzmqb@cunb.edu,1997-02-24,1,1,1,male,7.283419,7.283419,7.388008,6.973737,7.388008,6.628448,7.593102,8.007373,7.247812
4,57,6,A,s6n0y@cunb.edu,1996-09-05,1,1,1,male,7.092971,7.092971,6.773626,6.574877,6.773626,7.844316,7.611065,7.809814,8.880504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,946,8,D,jj36e@cunb.edu,1995-07-04,0,1,1,male,5.550397,5.550397,5.306647,4.938375,5.306647,4.985917,6.162418,6.530690,6.209960
1506,946,8,D,ca1dg@cunb.edu,1995-08-23,0,1,1,male,7.920647,7.920647,8.360153,7.661931,8.360153,7.937165,8.179364,8.877586,8.454598
1507,946,8,D,amdrx@cunb.edu,1994-12-15,0,1,1,male,9.011180,9.011180,8.534791,9.248758,8.534791,8.056641,8.773601,8.059634,7.581484
1508,946,8,D,yn5ug@cunb.edu,1994-09-18,0,2,1,male,6.036111,6.036111,5.071503,5.689228,5.071503,5.338547,6.382995,5.765270,6.032315


## lambda functions

In [46]:
df_c = df_b.copy()
# apply a function to rows
df_c["math_avg"].apply(lambda x: x*10 / 100)

# assign a new column, with name
df_b.assign(AverageMath=lambda x: (x["math_t1"]+df_b["math_t2"]) / 2)

df_c = df_b.copy()
df_c["math_extra_male_sqr"] = df_c.apply(lambda x: np.square(x["math_extra"]) if x.sex_coded == "male" else x, axis=1)
df_c

Unnamed: 0,school_id,grade,class,student_id,date_of_birth,treatment,nationality,sex,sex_coded,math_avg,math_extra,language_extra,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,math_extra_male_sqr
0,57,6,A,wvqgd@cunb.edu,1997-07-27,1,1,1,male,1.610797,2.594667,2.369783,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,6.732298
1,57,6,A,j0ihe@cunb.edu,1997-06-24,1,2,1,male,7.703944,7.703944,7.206984,7.859077,7.206984,6.994455,7.548812,6.896719,6.684190,59.350755
2,57,6,A,wcjgk@cunb.edu,1997-04-23,1,1,2,female,6.649997,6.649997,8.057449,7.118976,8.057449,8.573210,6.181019,7.119492,7.635253,school_id 57 grade ...
3,57,6,A,mzmqb@cunb.edu,1997-02-24,1,1,1,male,7.283419,7.283419,7.388008,6.973737,7.388008,6.628448,7.593102,8.007373,7.247812,53.048197
4,57,6,A,s6n0y@cunb.edu,1996-09-05,1,1,1,male,7.092971,7.092971,6.773626,6.574877,6.773626,7.844316,7.611065,7.809814,8.880504,50.310243
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,946,8,D,jj36e@cunb.edu,1995-07-04,0,1,1,male,5.550397,5.550397,5.306647,4.938375,5.306647,4.985917,6.162418,6.530690,6.209960,30.806904
1506,946,8,D,ca1dg@cunb.edu,1995-08-23,0,1,1,male,7.920647,7.920647,8.360153,7.661931,8.360153,7.937165,8.179364,8.877586,8.454598,62.736655
1507,946,8,D,amdrx@cunb.edu,1994-12-15,0,1,1,male,9.011180,9.011180,8.534791,9.248758,8.534791,8.056641,8.773601,8.059634,7.581484,81.201362
1508,946,8,D,yn5ug@cunb.edu,1994-09-18,0,2,1,male,6.036111,6.036111,5.071503,5.689228,5.071503,5.338547,6.382995,5.765270,6.032315,36.434641


## df.pipe()
- pandas.pipe() allows you to chain multiple functions together in a clean, readable way. 
- Instead of nesting functions inside functions, you apply them one after another like steps in a process.

In [None]:
# df_d = df_b.query("school_id in [57, 946] and sex == 1")
# df_d = df_d[df_d["date_of_birth"] > "1996-01-01"]
# df_d = df_d.where( (df_d["math_avg"] > 8) & (df_d["language_extra"] > 8))
# df_d = df_d.dropna()
# df_d[:3]


df_pipe = (
    df_b
    .pipe(lambda x: x.query("school_id in [57, 946] and sex == 1"))
    .pipe(lambda x: x[x["date_of_birth"] > "1996-01-01"])
    .pipe(lambda x: x.where( (x["math_avg"] > 8) & (x["language_extra"] > 8)))
    .dropna()
)
df_pipe[:3]

Unnamed: 0,school_id,grade,class,student_id,date_of_birth,treatment,nationality,sex,sex_coded,math_avg,math_extra,language_extra,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2
6,57.0,6.0,A,ylw2t@cunb.edu,1997-04-23,1.0,1.0,1.0,male,8.123594,8.123594,8.015355,8.611935,8.015355,9.27773,7.635252,7.038671,8.301046
9,57.0,6.0,A,crjpx@cunb.edu,1996-10-24,1.0,1.0,1.0,male,8.637265,8.637265,9.126943,9.039362,9.126943,8.789369,8.235168,8.322749,7.985175
15,57.0,6.0,A,4js7l@cunb.edu,1997-02-11,1.0,1.0,1.0,male,8.192196,8.192196,8.291754,8.775585,8.291754,9.315957,7.608807,7.124976,8.14918


## Window Functions 
Window functions are functions where the input values are taken from a “window” of one or more rows in a series or a table and calculation is performed over them.

pd.rolling()

In [138]:
df_b["rolling_avg"] = df_b["math_avg"].rolling(3).mean()
df_b["rolling_sum"] = df_b["math_avg"].rolling(2).sum()

df_b[["math_avg", "rolling_avg", "rolling_sum"]]

Unnamed: 0,math_avg,rolling_avg,rolling_sum
0,1.610797,,
1,7.703944,,9.314741
2,6.649997,5.321579,14.353941
3,7.283419,7.212454,13.933417
4,7.092971,7.008796,14.376391
...,...,...,...
1505,5.550397,5.883541,11.926844
1506,7.920647,6.615830,13.471044
1507,9.011180,7.494075,16.931827
1508,6.036111,7.655980,15.047291


In [174]:
import scipy.signal

df_b["weighted_mean"] = df_b["math_avg"].rolling(window=5, win_type="triang").mean()
df_b[["math_avg", "rolling_avg", "rolling_sum", "weighted_mean"]]

Unnamed: 0,math_avg,rolling_avg,rolling_sum,weighted_mean
0,1.610797,,,
1,7.703944,,9.314741,
2,6.649997,5.321579,14.353941,
3,7.283419,7.212454,13.933417,
4,7.092971,7.008796,14.376391,6.514276
...,...,...,...,...
1505,5.550397,5.883541,11.926844,5.992603
1506,7.920647,6.615830,13.471044,6.285107
1507,9.011180,7.494075,16.931827,6.664482
1508,6.036111,7.655980,15.047291,7.255295


## cut()
- Bin values into discrete intervals.

```
pd.cut(x, bins, right=True, 
  labels=None, retbins=False, 
  precision=3, include_lowest=False, 
  duplicates='raise', ordered=True)```

In [163]:
pd.cut(df_b["math_avg"], 5, include_lowest=True, ordered=True)
pd.cut(df_b["math_avg"], 3, labels=["bad", "medium", "good"])

0          bad
1         good
2       medium
3         good
4         good
         ...  
1505    medium
1506      good
1507      good
1508    medium
1509      good
Name: math_avg, Length: 1510, dtype: category
Categories (3, str): ['bad' < 'medium' < 'good']

## qcut() 
- Quantile-based discretization function.
- Discretize variable into equal-sized buckets based on rank or based on sample quantiles.

In [172]:
df_b["math_quartile_no"] = pd.qcut(df_b["math_avg"], 4)
df_b["math_quartile_labels"] = pd.qcut(df_b["math_avg"], 4, labels=["very_low", "low", "medium", "high"])
df_b[["math_avg", "math_quartile", "math_quartile_no"]]


Unnamed: 0,math_avg,math_quartile,math_quartile_no
0,1.610797,very_low,"(-0.001, 5.648]"
1,7.703944,medium,"(6.926, 8.139]"
2,6.649997,low,"(5.648, 6.926]"
3,7.283419,medium,"(6.926, 8.139]"
4,7.092971,medium,"(6.926, 8.139]"
...,...,...,...
1505,5.550397,very_low,"(-0.001, 5.648]"
1506,7.920647,medium,"(6.926, 8.139]"
1507,9.011180,high,"(8.139, 10.0]"
1508,6.036111,low,"(5.648, 6.926]"
