In [53]:
import polars as pl

In [54]:
text_name = "Gpb023.003"

In [55]:
# Load D2672_analysis_segments to a df using polars
df = pl.read_csv(f"./Orna/{text_name}_clean_analysis_segments.csv")

In [56]:
df.head(100)

Segment_ID,Mode,Text,Word_Count
i64,str,str,i64
0,"""auto""","""rgya gar skad du/ sarba ta thA…",860
1,"""allo""","""phyug""",1
2,"""auto""","""rgyur""",1
3,"""allo""","""smra""",1
4,"""auto""","""bar 'gyur""",2
…,…,…,…
95,"""allo""","""gang""",1
96,"""auto""","""mos""",1
97,"""allo""","""dbang bsgyur nas//""",3
98,"""auto""","""las dbang mi""",3


In [57]:
import polars as pl

# Mask rows with Word_Count=1 as null, then forward fill
df = df.with_columns(
    pl.when((pl.col("Word_Count") > 1))
    .then(pl.col("Mode"))
    .otherwise(None)
    .forward_fill()
    .alias("Mode")
)

In [58]:
df.slice(5,10)

Segment_ID,Mode,Text,Word_Count
i64,str,str,i64
5,"""allo""","""ro// 'thad pa bsgrub pa'i rigs…",10
6,"""auto""","""rigs pa gnas skabs thams cad d…",23
7,"""auto""","""rnams""",1
8,"""auto""","""gzhi'i""",1
9,"""auto""","""sgrub""",1
10,"""auto""","""byed la""",2
11,"""auto""","""gzigs""",1
12,"""auto""","""so//""",1
13,"""auto""","""dngos""",1
14,"""auto""","""por smra ba rnams dngos por bs…",15


In [59]:
df.shape

(2489, 4)

In [60]:
# Merge consecutive rows with the same Mode
df_merged = (
    df
    .with_columns(
        # Create group ID that changes when Mode changes
        group=(pl.col("Mode") != pl.col("Mode").shift()).cum_sum()
    )
    .group_by("group", maintain_order=True)
    .agg([
        pl.col("Segment_ID").first(),
        pl.col("Mode").first(),
        pl.col("Text").str.join(delimiter=" "),  # Concatenate text with spaces
        pl.col("Word_Count").sum()
    ])
    .drop("group")
)


In [61]:
df_merged.shape

(1010, 4)

In [62]:
df_merged.head(10)


Segment_ID,Mode,Text,Word_Count
i64,str,str,i64
0,"""auto""","""rgya gar skad du/ sarba ta thA…",860
1,"""auto""","""phyug rgyur smra bar 'gyur""",5
5,"""allo""","""ro// 'thad pa bsgrub pa'i rigs…",10
6,"""auto""","""rigs pa gnas skabs thams cad d…",46
15,"""allo""","""bsgrub ste// 'du shes kyi byad…",7
16,"""auto""","""du byas pa'i blos dmigs pa dan…",10
17,"""allo""","""par du byas pa'i blo la snang …",22
20,"""auto""","""pa'o// de yang so sor rtogs pa…",275
21,"""allo""","""lam kun spyod pas// 'khor ba '…",64
28,"""auto""","""pa dang// lha yi""",4


In [63]:
# check if there are 1s in Word_Count column in df_merged
df_merged.filter(pl.col("Word_Count") == 0).shape[0] > 0

False

In [64]:
# df_merged.write_csv(f"{text_name}_postprocessed_output.csv")