In [154]:
import pandas as pd

# Load the raw CSV export. "utf-8-sig" drops a leading byte-order mark if the
# file has one; low_memory=False makes pandas infer each column's dtype from
# the whole file rather than chunk by chunk.
csv_path = "bangkok_traffy.csv"
df = pd.read_csv(csv_path, low_memory=False, encoding="utf-8-sig")

# Quick sanity check: (rows, columns) followed by the column names.
print(df.shape, df.columns, sep="\n")

(132493, 16)
Index(['ticket_id', 'type', 'organization', 'comment', 'photo', 'photo_after',
       'coords', 'address', 'subdistrict', 'district', 'province', 'timestamp',
       'state', 'star', 'count_reopen', 'last_activity'],
      dtype='object')


In [155]:
# Show the dtype pandas inferred for every column of the raw frame.
print(df.dtypes)

ticket_id         object
type              object
organization      object
comment           object
photo             object
photo_after       object
coords            object
address           object
subdistrict       object
district          object
province          object
timestamp         object
state             object
star             float64
count_reopen       int64
last_activity     object
dtype: object


In [156]:
# Names of all object-dtype columns in the raw frame; the next cell casts
# exactly these columns to pandas' nullable "string" dtype.
str_cols = df.select_dtypes(include=["object"]).columns
print(str_cols)

Index(['ticket_id', 'type', 'organization', 'comment', 'photo', 'photo_after',
       'coords', 'address', 'subdistrict', 'district', 'province', 'timestamp',
       'state', 'last_activity'],
      dtype='object')


In [157]:
# Cast every column listed in str_cols to the nullable "string" dtype in one
# astype call (column -> dtype mapping), then confirm the result.
df = df.astype(dict.fromkeys(str_cols, "string"))
print(df.dtypes)

ticket_id        string[python]
type             string[python]
organization     string[python]
comment          string[python]
photo            string[python]
photo_after      string[python]
coords           string[python]
address          string[python]
subdistrict      string[python]
district         string[python]
province         string[python]
timestamp        string[python]
state            string[python]
star                    float64
count_reopen              int64
last_activity    string[python]
dtype: object


In [158]:
# Preview two tickets whose star rating is a whole number. Missing ratings
# drop out on their own: NaN mod 1 is NaN, which never equals 0.
df.loc[df["star"].mod(1).eq(0)].head(2)

Unnamed: 0,ticket_id,type,organization,comment,photo,photo_after,coords,address,subdistrict,district,province,timestamp,state,star,count_reopen,last_activity
1,2021-CGPMUN,"{น้ำท่วม,ร้องเรียน}","เขตประเวศ,ฝ่ายโยธา เขตประเวศ",น้ำท่วมเวลาฝนตกและทะลุเข้าบ้านเดือดร้อนมากทุกๆ...,https://storage.googleapis.com/traffy_public_b...,https://storage.googleapis.com/traffy_public_b...,"100.66709,13.67891",189 เฉลิมพระเกียรติ ร.9 แขวง หนองบอน เขต ประเว...,หนองบอน,ประเวศ,กรุงเทพมหานคร,2021-09-19 14:56:08.924992+00,เสร็จสิ้น,4.0,0,2022-06-21 08:21:09.532782+00
4,2021-DVEWYM,"{น้ำท่วม,ถนน}","เขตลาดพร้าว,ฝ่ายโยธา เขตลาดพร้าว",ซอยลาดพร้าววังหิน 75 ถนนลาดพร้าววังหิน แขวงลาด...,https://storage.googleapis.com/traffy_public_b...,,"100.59165,13.82280",702 ถ. ลาดพร้าววังหิน แขวงลาดพร้าว เขตลาดพร้าว...,ลาดพร้าว,ลาดพร้าว,กรุงเทพมหานคร,2021-12-09 12:29:08.408763+00,เสร็จสิ้น,5.0,0,2022-08-12 07:18:44.884945+00


In [159]:
def clean_data(df: pd.DataFrame, str_columns=None) -> pd.DataFrame:
    """Drop unusable tickets and parse the two timestamp columns.

    A row is kept only when all of the following hold:

    * every text column holds a non-null, non-whitespace value;
    * ``ticket_id`` and ``star`` are present;
    * ``type`` is more than an empty ``{}`` wrapper;
    * ``coords`` contains a comma.

    Afterwards ``count_reopen`` is filled with 0 and cast to ``int``, and
    ``timestamp`` / ``last_activity`` are parsed with ``pd.to_datetime``
    (values that cannot be parsed become ``NaT``).

    Parameters
    ----------
    df : pd.DataFrame
        Raw ticket frame. It is not modified.
    str_columns : iterable of str, optional
        Columns that must contain non-blank text. When omitted, every
        object- or string-dtype column of ``df`` is used, so the function no
        longer depends on a notebook-level ``str_cols`` variable and can be
        re-run safely on a fresh kernel.

    Returns
    -------
    pd.DataFrame
        A new frame holding only the rows that passed every check.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """

    def has_text(series: pd.Series) -> pd.Series:
        # Nullable string columns yield <NA> in the comparison; the notna()
        # guard turns those into False so the mask is a plain boolean.
        return (series.notna() & (series.str.strip() != "")).fillna(False).astype(bool)

    if str_columns is None:
        str_columns = [
            name
            for name, dtype in df.dtypes.items()
            if dtype == object or isinstance(dtype, pd.StringDtype)
        ]

    # Build one combined mask instead of re-slicing the frame per column.
    keep = pd.Series(True, index=df.index)
    for name in str_columns:
        keep &= has_text(df[name])

    keep &= df["ticket_id"].notna() & df["star"].notna()
    # "{}" means the ticket has no category at all.
    keep &= (df["type"].notna() & (df["type"].str.strip("{}") != "")).fillna(False).astype(bool)
    keep &= has_text(df["coords"])
    keep &= df["coords"].str.contains(",", regex=False, na=False).astype(bool)

    # Copy before assigning so we never write into a view of the caller's
    # frame (this is what triggers SettingWithCopyWarning).
    cleaned = df.loc[keep].copy()
    cleaned["count_reopen"] = cleaned["count_reopen"].fillna(0).astype(int)
    cleaned["timestamp"] = pd.to_datetime(cleaned["timestamp"], errors="coerce")
    cleaned["last_activity"] = pd.to_datetime(cleaned["last_activity"], errors="coerce")

    return cleaned


In [160]:
# Apply the cleaning rules to the loaded frame; the result is bound to a new name.
cleaned_df = clean_data(df)

In [161]:
# Plain-text preview of the first two rows that survived cleaning.
print(cleaned_df.head(2))

     ticket_id                 type  \
1  2021-CGPMUN  {น้ำท่วม,ร้องเรียน}   
7  2021-8N9ZP8          {ความสะอาด}   

                                        organization  \
1                       เขตประเวศ,ฝ่ายโยธา เขตประเวศ   
7  เขตประเวศ,ฝ่ายเทศกิจ เขตประเวศ,ฝ่ายรักษาความสะ...   

                                             comment  \
1  น้ำท่วมเวลาฝนตกและทะลุเข้าบ้านเดือดร้อนมากทุกๆ...   
7                  คนเอาขยะมาทิ้งจนกลายเป็นกองขยะค่ะ   

                                               photo  \
1  https://storage.googleapis.com/traffy_public_b...   
7  https://storage.googleapis.com/traffy_public_b...   

                                         photo_after              coords  \
1  https://storage.googleapis.com/traffy_public_b...  100.66709,13.67891   
7  https://storage.googleapis.com/traffy_public_b...  100.64690,13.67083   

                                             address subdistrict district  \
1  189 เฉลิมพระเกียรติ ร.9 แขวง หนองบอน เขต ประเว...     หนองบอน   ปร

In [162]:
# (rows, columns) after cleaning, for comparison with the raw frame's shape.
print(cleaned_df.shape)

(34645, 16)


In [163]:
def process_data(df: pd.DataFrame) -> pd.DataFrame:
    """Derive analysis-ready columns from a cleaned ticket frame.

    * ``type``: surrounding ``{}`` removed, then split on commas into a list.
    * ``organization``: split on commas into a list.
    * ``lat`` / ``lon``: floats parsed from ``coords``.
    * ``duration_min``: minutes between ``timestamp`` and ``last_activity``.
    * ``coords`` is dropped once it has been parsed.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with string ``type``, ``organization`` and ``coords`` columns
        and datetime ``timestamp`` / ``last_activity`` columns. It is left
        untouched, so the calling cell can be re-run safely.

    Returns
    -------
    pd.DataFrame
        A new frame with the derived columns appended and ``coords`` removed.

    Raises
    ------
    ValueError
        If a coordinate component cannot be converted to ``float``.
    """
    # coords is stored as "longitude,latitude": the sample rows read
    # "100.66709,13.67891" for Bangkok, and a latitude can never exceed 90.
    # The first component is therefore the longitude, the second the latitude.
    coord_parts = df["coords"].str.split(",")
    longitude = coord_parts.str[0].astype(float)
    latitude = coord_parts.str[1].astype(float)

    elapsed = df["last_activity"] - df["timestamp"]

    # assign() builds a new frame, so the caller's columns are never
    # overwritten in place.
    processed = df.assign(
        type=df["type"].str.strip("{}").str.split(","),
        organization=df["organization"].str.split(","),
        lat=latitude,
        lon=longitude,
        duration_min=elapsed.dt.total_seconds() / 60,
    )

    return processed.drop(columns=["coords"])

In [164]:
# Confirm the dtypes after cleaning: the two time columns should now be datetimes.
print(cleaned_df.dtypes)

ticket_id             string[python]
type                  string[python]
organization          string[python]
comment               string[python]
photo                 string[python]
photo_after           string[python]
coords                string[python]
address               string[python]
subdistrict           string[python]
district              string[python]
province              string[python]
timestamp        datetime64[ns, UTC]
state                 string[python]
star                         float64
count_reopen                   int64
last_activity    datetime64[ns, UTC]
dtype: object


In [165]:
# Run the feature step on the full cleaned frame. For a quick experiment,
# pass a small slice instead, e.g. cleaned_df.head(10).
tmp = process_data(cleaned_df)

In [166]:
# Export as JSON Lines (one record per line). force_ascii=False writes
# non-ASCII text as-is instead of \uXXXX escapes.
tmp.to_json('output2.jsonl', orient='records', lines=True, force_ascii=False)

In [167]:
# Schema and size of the processed frame.
print(tmp.dtypes)
print(tmp.shape)

# Tickets whose last activity came more than one day (24 * 60 minutes) after
# they were filed; select them once and reuse the subset for both printouts.
long_running = tmp.loc[tmp["duration_min"].gt(24 * 60)]
print(long_running.shape)
print(long_running["duration_min"].head(5))

ticket_id             string[python]
type                          object
organization                  object
comment               string[python]
photo                 string[python]
photo_after           string[python]
address               string[python]
subdistrict           string[python]
district              string[python]
province              string[python]
timestamp        datetime64[ns, UTC]
state                 string[python]
star                         float64
count_reopen                   int64
last_activity    datetime64[ns, UTC]
lat                          float64
lon                          float64
duration_min                 float64
dtype: object
(34645, 18)
(30727, 18)
1     3.956050e+05
7     1.545927e+06
9     2.593765e+05
11    2.824109e+05
30    6.826268e+05
Name: duration_min, dtype: float64


In [168]:
# cleaned_df.to_csv("cleaned_bangkok_traffy.csv", index=False, encoding="utf-8-sig")
# print("✅ Exported to cleaned_bangkok_traffy.csv")