In [12]:
import polars as pl

In [13]:
# Read the D2672 analysis segments CSV into a polars DataFrame.
df = pl.read_csv(source="D2672_analysis_segments.csv")

In [14]:
# Preview the first 100 rows of the loaded frame.
df.head(n=100)

Segment_ID,Mode,Text,Word_Count
i64,str,str,i64
0,"""auto""","""@ / / 'phags pa dpung bzangs k…",23
1,"""allo""","""/ chos dbyings mnyam pa nyid l…",43
2,"""auto""","""/ / de ltar bsdus pa'i don dan…",72
3,"""allo""","""/ / nyi ma stong gi gzi""",7
4,"""auto""","""zhes bya ba ni sku dang thugs …",13
…,…,…,…
95,"""allo""","""yis""",1
96,"""auto""","""zhes bya ba ni thugs rje lhag …",14
97,"""allo""","""zhes""",1
98,"""auto""","""bya ba'i""",2


In [15]:
import polars as pl

# Treat the Mode of every segment with 5 or fewer words as missing, then fill
# each gap with the Mode of the closest preceding segment that has more than
# 5 words.
df = df.with_columns(
    pl.when(pl.col("Word_Count") > 5)
    .then(pl.col("Mode"))
    .otherwise(None)
    .forward_fill()
    # forward_fill() has nothing to copy into short segments at the very top
    # of the table; let those take the Mode of the first longer segment
    # instead of staying null.
    .backward_fill()
    .alias("Mode")
)

In [24]:
# Show 10 rows starting at row offset 5.
df.slice(offset=5, length=10)

Segment_ID,Mode,Text,Word_Count
i64,str,str,i64
5,"""auto""","""/ /""",2
6,"""auto""","""thugs""",1
7,"""auto""","""rje""",1
8,"""auto""","""zhes bya ba ni sems can thams …",15
9,"""allo""","""/ / rtse gcig yid gyur""",6
10,"""auto""","""zhes bya ba ni de bzhin nyid l…",12
11,"""auto""","""/ / bzhugs pa la""",5
12,"""auto""","""zhes bya ba ni tshangs pa'i gn…",21
13,"""allo""","""dpung bzangs tshul bzhin gus p…",11
14,"""auto""","""zhes bya ba ni shin tu dang ba…",24


In [25]:
# (row count, column count) of the frame.
(df.height, df.width)

(3475, 4)

In [26]:
# Merge every run of consecutive rows that share the same Mode into one row.
df_merged = (
    df
    .with_columns(
        # Run ID that increments each time Mode differs from the previous row.
        # rle_id() numbers the first row 0. The earlier
        # `(Mode != Mode.shift()).cum_sum()` produced null for the first row
        # (nothing to compare against), so that row always landed in its own
        # null-keyed group and was never merged with the rows following it.
        group=pl.col("Mode").rle_id()
    )
    .group_by("group", maintain_order=True)
    .agg([
        pl.col("Segment_ID").first(),            # ID of the first segment in the run
        pl.col("Mode").first(),                  # Mode is constant within a run
        pl.col("Text").str.join(delimiter=" "),  # concatenate text with spaces
        pl.col("Word_Count").sum(),              # total words across the run
    ])
    .drop("group")
)


In [31]:
# (row count, column count) of the merged frame.
(df_merged.height, df_merged.width)

(991, 4)

In [27]:
# Preview the first 10 merged segments.
df_merged.head(n=10)


Segment_ID,Mode,Text,Word_Count
i64,str,str,i64
0,"""auto""","""@ / / 'phags pa dpung bzangs k…",23
1,"""allo""","""/ chos dbyings mnyam pa nyid l…",43
2,"""auto""","""/ / de ltar bsdus pa'i don dan…",72
3,"""allo""","""/ / nyi ma stong gi gzi""",7
4,"""auto""","""zhes bya ba ni sku dang thugs …",32
9,"""allo""","""/ / rtse gcig yid gyur""",6
10,"""auto""","""zhes bya ba ni de bzhin nyid l…",38
13,"""allo""","""dpung bzangs tshul bzhin gus p…",11
14,"""auto""","""zhes bya ba ni shin tu dang ba…",230
27,"""allo""","""smos pa ni sa 'dzin nam rdo rj…",12


In [29]:
# Check whether any merged segment still has 5 or fewer words.
# True means at least one such segment remains; False means none do.
(df_merged["Word_Count"] <= 5).any()

False

In [30]:
# Save the merged segments to CSV.
df_merged.write_csv(file="D2672_postprocessed_output.csv")