In [1]:
import re
import pandas as pd
from collections import Counter
from pythainlp import word_tokenize
from pythainlp.corpus import thai_stopwords

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [2]:
# Load the processed footpath dataset. Later cells read its 'comment', 'type',
# 'timestamp', 'lon' and 'lat' columns.
df = pd.read_csv('../data/processed/footpath_phase1.csv')

In [3]:
# Thai stopwords from pythainlp, stored as a set for fast membership tests.
stopwords = set(thai_stopwords())

def clean_token(w):
    """Return True if token `w` should be kept, False if it is noise.

    A token is rejected when it is empty / whitespace-only, when it consists
    solely of non-word characters and underscores, or when it consists solely
    of digits.
    """
    # Empty or whitespace-only tokens carry no content.
    if not w or not w.strip():
        return False

    punctuation_only = re.fullmatch(r"[\W_]+", w) is not None
    digits_only = re.fullmatch(r"\d+", w) is not None
    return not (punctuation_only or digits_only)

# Tokenize each comment -> drop junk tokens -> drop Thai stopwords.
df['tokens'] = df['comment'].fillna('').apply(word_tokenize)
df['tokens_clean'] = df['tokens'].apply(lambda toks: list(filter(clean_token, toks)))
df['tokens_no_stop'] = df['tokens_clean'].apply(
    lambda toks: [tok for tok in toks if tok not in stopwords]
)

# Words that are excluded from the keyword vocabulary.
SKIP = {"ปากซอย","ซอย","บริเวณ","หน้า","บ้าน","คน","เขต","จุด",'ขอบคุณ','เมตร','เข้ามา','แยก','เลขที่','เดิน',
        "ทางเท้า","ฟุตบาท","ทางเดิน","เจ้าหน้าที่","ประชาชน","#1555","เวลา"}

# Keyword vocabulary: the TOP_N most frequent tokens that are not in SKIP.
TOP_N = 50
all_tokens = [tok for toks in df['tokens_no_stop'] for tok in toks]
tok_counter = Counter(all_tokens)
candidates = [
    (word, count)
    for word, count in tok_counter.most_common()
    if word not in SKIP
]
important_words = [word for word, _ in candidates[:TOP_N]]
important_set = set(important_words)

# Per-row keywords, first as a space-joined string (input for TF-IDF) and
# then as a plain list.
df['comment_keywords'] = df['tokens_no_stop'].apply(
    lambda toks: ' '.join(tok for tok in toks if tok in important_set)
)
df['comment_keywords_list'] = df['tokens_no_stop'].apply(
    lambda toks: [tok for tok in toks if tok in important_set]
)


In [4]:
# Split the members of the 'type' column into a list of labels

class SimpleMLB(BaseEstimator, TransformerMixin):
    """Multi-hot encode a column whose cells hold a set of labels.

    Each cell is parsed into a list of labels (for example the string
    ``"{a,b}"`` becomes ``["a", "b"]``) and the lists are encoded with a
    wrapped ``MultiLabelBinarizer``.
    """

    def __init__(self):
        # The wrapped binarizer; after fit() it holds the label vocabulary.
        self.mlb = MultiLabelBinarizer()

    def _parse(self, s):
        """Turn one raw cell value into a list of label strings.

        Lists, sets and tuples are returned as a list unchanged. Missing
        values yield an empty list. Anything else is stringified, stripped of
        one pair of surrounding braces, split on commas, and blank parts are
        dropped.
        """
        if isinstance(s, (list, set, tuple)):
            return list(s)
        # Missing cells (None, float NaN, pd.NA, NaT) mean "no labels".
        # Without this check a NaN would be stringified to "nan" and show up
        # as a spurious label / output column.
        if pd.api.types.is_scalar(s) and pd.isna(s):
            return []
        s = str(s).strip()
        if s in ("", "{}"):
            return []
        if s.startswith("{") and s.endswith("}"):
            s = s[1:-1]
        return [p.strip() for p in s.split(",") if p.strip()]

    def fit(self, X, y=None):
        """Learn the label vocabulary from the iterable of raw cells `X`.

        `y` is accepted for scikit-learn API compatibility and ignored.
        Returns `self`.
        """
        lists = [self._parse(x) for x in X]
        self.mlb.fit(lists)
        return self

    def transform(self, X):
        """Return the multi-hot indicator matrix for the raw cells in `X`."""
        lists = [self._parse(x) for x in X]
        return self.mlb.transform(lists)

    def get_feature_names_out(self, input_features=None):
        """Return one ``type__<label>`` name per learned label, in column order.

        `input_features` is accepted for API compatibility and ignored.
        """
        return [f"type__{c}" for c in self.mlb.classes_]

In [5]:
# Extract year, month, day (plus weekday, hour and a weekend flag) from the datetime column

def extract_datetime(df_slice):
    """Expand the first column of `df_slice` into integer calendar features.

    The column is parsed with ``pd.to_datetime(..., errors='coerce')``, so
    unparseable or missing values become NaT. For such rows the calendar
    parts are -1 and `is_weekend` is 0.

    Returns a DataFrame with columns year, month, day, weekday, hour and
    is_weekend, sharing the input's index.
    """
    stamps = pd.to_datetime(df_slice.iloc[:, 0], errors='coerce')
    parts_accessor = stamps.dt

    # Calendar parts share one recipe: NaT -> -1, then cast to int.
    features = {
        part: getattr(parts_accessor, part).fillna(-1).astype(int)
        for part in ('year', 'month', 'day', 'weekday', 'hour')
    }

    # weekday is 0=Monday .. 6=Sunday, so 5 and 6 are the weekend.
    weekend_mask = parts_accessor.weekday >= 5
    features['is_weekend'] = weekend_mask.fillna(False).astype(int)

    return pd.DataFrame(features)


In [6]:
# TF-IDF over `comment_keywords`, whose values are already space-joined tokens.
# The tokenizer is the built-in `str.split` rather than a lambda: lambdas
# cannot be pickled, so a lambda here would prevent the fitted vectorizer (and
# any pipeline holding it) from being saved with pickle / joblib.
tfidf_for_keywords = TfidfVectorizer(
    tokenizer=str.split,   # split on whitespace only; text is already tokenized
    preprocessor=None,     # default is a no-op when lowercase=False and strip_accents is unset
    token_pattern=None,    # unused with a custom tokenizer; None avoids the warning
    lowercase=False,
    max_features=200       # upper bound on vocabulary size; adjust if desired
)


In [7]:
# Column-wise feature extraction. The order of the transformers below fixes the
# order of the output columns (datetime parts, 'type' indicators, TF-IDF
# keyword weights, raw coordinates); pipeline_to_df relies on that order when
# it names the columns.
preprocessor = ColumnTransformer(
    transformers=[
        # A list selector (['timestamp']) hands the transformer a one-column
        # DataFrame, which extract_datetime reads with .iloc[:, 0].
        ('datetime', FunctionTransformer(extract_datetime, validate=False), ['timestamp']),
        # A scalar selector ('type', 'comment_keywords') hands the transformer
        # a 1-D column that it iterates value by value.
        ('type', SimpleMLB(), 'type'),
        ('comment', tfidf_for_keywords, 'comment_keywords'),
        ('coords', 'passthrough', ['lon', 'lat']),
    ],
    # Every column not listed above is discarded.
    remainder='drop',
    # 0.0 requests a dense result even though TF-IDF produces sparse output.
    sparse_threshold=0.0
)

# Single-step pipeline; the step name 'preprocess' is looked up by pipeline_to_df.
pipeline = Pipeline([('preprocess', preprocessor)])

In [8]:
def pipeline_to_df(pipeline, df):
    """Run `pipeline` on `df` and return the features as a named DataFrame.

    The pipeline must have a step called 'preprocess' holding a
    ColumnTransformer with transformers named 'type' and 'comment', laid out
    as: 6 datetime columns, the 'type' indicator columns, the TF-IDF columns,
    then 'lon' and 'lat'.

    If the ColumnTransformer has not been fitted yet, the pipeline is fitted
    on `df` first. An already-fitted pipeline is only transformed, and any
    error raised by transform() propagates to the caller instead of
    triggering a silent refit on `df`.

    Returns a DataFrame indexed like `df`.

    Raises:
        ValueError: if the number of produced columns does not match the
            number of derived column names.
    """
    ct = pipeline.named_steps['preprocess']

    # ColumnTransformer only gains `transformers_` once it has been fitted, so
    # its absence is the "not fitted yet" signal. Checking it explicitly
    # (rather than catching every exception from transform) keeps genuine
    # transform errors visible.
    if hasattr(ct, 'transformers_'):
        X = pipeline.transform(df)
    else:
        X = pipeline.fit_transform(df)

    # datetime names (must mirror the columns built by extract_datetime)
    dt_cols = ['year', 'month', 'day', 'weekday', 'hour', 'is_weekend']

    # type names
    type_cols = ct.named_transformers_['type'].get_feature_names_out()

    # tfidf names
    tfidf_vect = ct.named_transformers_['comment']
    tfidf_cols = [f"tfidf__{c}" for c in tfidf_vect.get_feature_names_out().tolist()]

    # coordinate names (passed through unchanged)
    coord_cols = ['lon', 'lat']

    colnames = list(dt_cols) + list(type_cols) + tfidf_cols + coord_cols

    # Densify if the pipeline returned a sparse matrix.
    if hasattr(X, "toarray"):
        X = X.toarray()

    # Fail with an explicit message if the hard-coded layout above no longer
    # matches what the ColumnTransformer actually produced.
    if X.shape[1] != len(colnames):
        raise ValueError(
            f"pipeline produced {X.shape[1]} columns but {len(colnames)} "
            "column names were derived; the ColumnTransformer layout and "
            "pipeline_to_df are out of sync"
        )

    return pd.DataFrame(X, columns=colnames, index=df.index)

# Build the feature table (pipeline_to_df fits the pipeline if it has not been
# fitted yet), then show its shape and the first 30 column names as a sanity check.
preprocessed_df = pipeline_to_df(pipeline, df)
preprocessed_df.shape, preprocessed_df.columns[:30]

((172931, 82),
 Index(['year', 'month', 'day', 'weekday', 'hour', 'is_weekend', 'type__PM2.5',
        'type__การเดินทาง', 'type__กีดขวาง', 'type__คนจรจัด', 'type__คลอง',
        'type__ความปลอดภัย', 'type__ความสะอาด', 'type__จราจร', 'type__ต้นไม้',
        'type__ถนน', 'type__ทางเท้า', 'type__ท่อระบายน้ำ', 'type__น้ำท่วม',
        'type__ป้าย', 'type__ป้ายจราจร', 'type__ร้องเรียน', 'type__สอบถาม',
        'type__สะพาน', 'type__สัตว์จรจัด', 'type__สายไฟ', 'type__ห้องน้ำ',
        'type__เสนอแนะ', 'type__เสียงรบกวน', 'type__แสงสว่าง'],
       dtype='object'))

In [9]:
# Peek a few rows
display(preprocessed_df.head())
preprocessed_df.columns

Unnamed: 0,year,month,day,weekday,hour,is_weekend,type__PM2.5,type__การเดินทาง,type__กีดขวาง,type__คนจรจัด,...,tfidf__หลุม,tfidf__อันตราย,tfidf__อุบัติเหตุ,tfidf__เทศกิจ,tfidf__เรื่อง,tfidf__แจ้ง,tfidf__แตก,tfidf__ไฟ,lon,lat
0,2021.0,12.0,23.0,3.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.623006,0.477759,0.0,0.0,100.64844,13.68735
1,2022.0,1.0,2.0,6.0,10.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.53764,13.70716
2,2022.0,1.0,14.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.52674,13.7095
3,2022.0,1.0,16.0,6.0,11.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.52789,13.70611
4,2022.0,1.0,17.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.53055,13.70561


Index(['year', 'month', 'day', 'weekday', 'hour', 'is_weekend', 'type__PM2.5',
       'type__การเดินทาง', 'type__กีดขวาง', 'type__คนจรจัด', 'type__คลอง',
       'type__ความปลอดภัย', 'type__ความสะอาด', 'type__จราจร', 'type__ต้นไม้',
       'type__ถนน', 'type__ทางเท้า', 'type__ท่อระบายน้ำ', 'type__น้ำท่วม',
       'type__ป้าย', 'type__ป้ายจราจร', 'type__ร้องเรียน', 'type__สอบถาม',
       'type__สะพาน', 'type__สัตว์จรจัด', 'type__สายไฟ', 'type__ห้องน้ำ',
       'type__เสนอแนะ', 'type__เสียงรบกวน', 'type__แสงสว่าง', 'tfidf__กีดขวาง',
       'tfidf__ขยะ', 'tfidf__ขอให้', 'tfidf__ขับ', 'tfidf__ขาย',
       'tfidf__ขายของ', 'tfidf__คลอง', 'tfidf__จอด', 'tfidf__ชำรุด',
       'tfidf__ซ่อม', 'tfidf__ดำเนินการ', 'tfidf__ตรวจสอบ', 'tfidf__ต้นไม้',
       'tfidf__ถนน', 'tfidf__ท', 'tfidf__ทำ', 'tfidf__ทิ้ง', 'tfidf__ท่อ',
       'tfidf__ท่อระบายน้ำ', 'tfidf__น.', 'tfidf__น้ำ', 'tfidf__บ่อ',
       'tfidf__ปัญหา', 'tfidf__ป้าย', 'tfidf__ฝั่ง', 'tfidf__ฝา', 'tfidf__พัง',
       'tfidf__พื้น', 'tfidf