In [1]:
import pandas as pd 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin

import re
from collections import Counter
from pythainlp import word_tokenize
from pythainlp.corpus import thai_stopwords

In [2]:
df = pd.read_csv('../data/processed/footpath_phase1.csv')

In [3]:
# เลือกเฉพาะ record ในกรุงเทพ
df = df[(df['province'] == 'กรุงเทพมหานคร') | (df['province'] == 'จังหวัดกรุงเทพมหานคร')]

# Drop columns ที่ไม่ใช้
df = df.drop(['coords','ticket_id','photo','photo_after','address','star','last_activity','count_reopen','province'], axis=1) 

In [4]:
TOP_N = 50
SKIP = {
    "ปากซอย","ซอย","บริเวณ","หน้า","บ้าน","คน","เขต","จุด",
    "ทางเท้า","ฟุตบาท","ทางเดิน","เจ้าหน้าที่","ประชาชน",
    "#1555","เวลา"
}

stopwords = set(thai_stopwords())

def clean_token(w):
    """Return True for tokens we want to keep; False to drop."""
    if not w or w.strip() == "":
        return False
    if re.fullmatch(r"[\W_]+", w):     # punctuation-only
        return False
    if re.fullmatch(r"\d+", w):        # numbers-only
        return False
    return True

# tokenize comment
df['comment'] = df['comment'].fillna('')
df['tokens'] = df['comment'].apply(word_tokenize)

# clean + ลบ stopword
df['tokens_clean'] = df['tokens'].apply(lambda words: [w for w in words if clean_token(w)])
df['tokens_no_stop'] = df['tokens_clean'].apply(lambda words: [w for w in words if w not in stopwords])

# นับ token
all_tokens = [w for words in df['tokens_no_stop'] for w in words]
tok_counter = Counter(all_tokens)


In [5]:
# TOP50 ยังไม่เอาคำใน skip ออก

# print("Top 50 raw tokens (before SKIP):")
# for word, freq in tok_counter.most_common(50):
#     print(f"{word:15} {freq}")
# print("-" * 40)

In [6]:
# เอาคำที่อยู่ใน skip ออก
candidates = [(w, c) for w, c in tok_counter.most_common() if w not in SKIP]
top_n = candidates[:TOP_N]
important_words = [w for w, _ in top_n]
important_set = set(important_words)

# ใส่กลับเข้า dataframe
def keep_top_keywords(token_list):
    if not token_list:
        return ''
    kept = [w for w in token_list if w in important_set]
    return ' '.join(kept)

df['comment_keywords'] = df['tokens_no_stop'].apply(keep_top_keywords)
df['comment_keywords_list'] = df['tokens_no_stop'].apply(lambda toks: [w for w in toks if w in important_set])

In [7]:
# print("Example rows (comment_keywords):")
# print(df[['comment', 'comment_keywords']].head(20).to_string(index=False))

In [8]:
# def parse_type_field(s):
#     """Convert strings like '{ถนน,ทางเท้า}' into a Python set."""
#     if not s or s == "{}":
#         return set()
#     s = s.strip("{}")          # remove outer braces
#     parts = [p.strip() for p in s.split(",") if p.strip() != ""]
#     return set(parts)

# df['type_set'] = df['type'].apply(parse_type_field)
# df['type_list'] = df['type_set'].apply(list)

# df[['type', 'type_set']].head()

# mlb = MultiLabelBinarizer()
# type_matrix = mlb.fit_transform(df['type_list'])
# type_df = pd.DataFrame(type_matrix, columns=mlb.classes_, index=df.index)
# df = pd.concat([df, type_df], axis=1)


In [9]:

class TypeMultiLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mlb = MultiLabelBinarizer()

    def parse_type(self, s):
        """Convert '{ถนน,ทางเท้า}' → ['ถนน','ทางเท้า']"""
        if not s or s == "{}":
            return []
        s = s.strip("{}")
        parts = [p.strip() for p in s.split(",") if p.strip()]
        return parts

    def fit(self, X, y=None):
        type_lists = [self.parse_type(s) for s in X]
        self.mlb.fit(type_lists)
        return self

    def transform(self, X):
        type_lists = [self.parse_type(s) for s in X]
        return self.mlb.transform(type_lists)

In [10]:
# # แปลง Datetime
# def extract_datetime_features(df):
#     df['timestamp'] = pd.to_datetime(df['timestamp'])
#     return pd.DataFrame({
#         'year': df['timestamp'].dt.year,
#         'month': df['timestamp'].dt.month,
#         'day': df['timestamp'].dt.day,
#         'weekday': df['timestamp'].dt.weekday,
#         'hour': df['timestamp'].dt.hour,
#         'is_weekend': (df['timestamp'].dt.weekday >= 5).astype(int)
#     })



# # Create a pipeline that applies the transformer
# pipeline = Pipeline(steps=[
#     ('extract_datetime', FunctionTransformer(extract_datetime_features, validate=False))
# ])

# # Apply the pipeline to the DataFrame
# X_transformed = pipeline.fit_transform(df)

# # Print the transformed DataFrame
# print(X_transformed)

In [11]:
# -----------------------------
# 1) Date extraction function
# -----------------------------
def extract_datetime(df):
    ts = pd.to_datetime(df['timestamp'])
    return pd.DataFrame({
        'year': ts.dt.year,
        'month': ts.dt.month,
        'day': ts.dt.day,
        'weekday': ts.dt.weekday,
        'hour': ts.dt.hour,
        'is_weekend': (ts.dt.weekday >= 5).astype(int)
    })

# -----------------------------
# 2) Simple MultiLabel transformer
# -----------------------------
class SimpleMLB(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mlb = MultiLabelBinarizer()

    def parse(self, s):
        s = s.strip("{}")
        if not s:
            return []
        return [x.strip() for x in s.split(",") if x.strip()]

    def fit(self, X, y=None):
        lists = [self.parse(x) for x in X]
        self.mlb.fit(lists)
        return self

    def transform(self, X):
        lists = [self.parse(x) for x in X]
        return self.mlb.transform(lists)

# -----------------------------
# 3) ColumnTransformer + Pipeline
# -----------------------------
pre = ColumnTransformer([
    ('dt', FunctionTransformer(extract_datetime, validate=False), ['timestamp']),
    ('type', SimpleMLB(), 'type')
])

pipeline = Pipeline([
    ('preprocess', pre)
])

In [12]:
processed_df = pipeline.fit_transform(df)
processed_df = pd.DataFrame(processed_df)

In [13]:
processed_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,2021,12,23,3,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2022,1,2,6,10,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2022,1,14,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2022,1,16,6,11,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2022,1,17,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
import pandas as pd

def transform_to_df(pipeline, df):
    """
    Fit/transform (or just transform if already fitted) with a pipeline whose first
    step is a ColumnTransformer named 'preprocess', and return a DataFrame with column names.
    """
    # Run fit_transform if pipeline not fitted; otherwise transform
    try:
        X_arr = pipeline.transform(df)
    except Exception:
        X_arr = pipeline.fit_transform(df)

    # access the ColumnTransformer
    ct = pipeline.named_steps['preprocess']

    # 1) datetime names (we know these)
    datetime_cols = ['year', 'month', 'day', 'weekday', 'hour', 'is_weekend']

    # 2) type names from the SimpleMLB transformer
    type_transformer = ct.named_transformers_['type']   # name used in ColumnTransformer
    # after fit, the MultiLabelBinarizer classes are available as type_transformer.mlb.classes_
    type_cols = list(type_transformer.mlb.classes_) if hasattr(type_transformer, 'mlb') else []

    # Compose final column list in the same order as transformers were defined
    col_names = datetime_cols + type_cols

    # Return DataFrame
    return pd.DataFrame(X_arr, columns=col_names, index=df.index)

# Usage
X_df = transform_to_df(pipeline, df)
print(X_df.shape)
X_df.head()


(172833, 30)


Unnamed: 0,year,month,day,weekday,hour,is_weekend,PM2.5,การเดินทาง,กีดขวาง,คนจรจัด,...,ป้ายจราจร,ร้องเรียน,สอบถาม,สะพาน,สัตว์จรจัด,สายไฟ,ห้องน้ำ,เสนอแนะ,เสียงรบกวน,แสงสว่าง
0,2021,12,23,3,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2022,1,2,6,10,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2022,1,14,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2022,1,16,6,11,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2022,1,17,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
X_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 172833 entries, 0 to 172930
Data columns (total 30 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   year         172833 non-null  int64
 1   month        172833 non-null  int64
 2   day          172833 non-null  int64
 3   weekday      172833 non-null  int64
 4   hour         172833 non-null  int64
 5   is_weekend   172833 non-null  int64
 6   PM2.5        172833 non-null  int64
 7   การเดินทาง   172833 non-null  int64
 8   กีดขวาง      172833 non-null  int64
 9   คนจรจัด      172833 non-null  int64
 10  คลอง         172833 non-null  int64
 11  ความปลอดภัย  172833 non-null  int64
 12  ความสะอาด    172833 non-null  int64
 13  จราจร        172833 non-null  int64
 14  ต้นไม้       172833 non-null  int64
 15  ถนน          172833 non-null  int64
 16  ทางเท้า      172833 non-null  int64
 17  ท่อระบายน้ำ  172833 non-null  int64
 18  น้ำท่วม      172833 non-null  int64
 19  ป้าย         172833 non-null

In [16]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# ------------------------------------------------
# 1) DATETIME FEATURE EXTRACTOR
# ------------------------------------------------
def extract_datetime(df):
    ts = pd.to_datetime(df['timestamp'])
    return pd.DataFrame({
        'year': ts.dt.year,
        'month': ts.dt.month,
        'day': ts.dt.day,
        'weekday': ts.dt.weekday,
        'hour': ts.dt.hour,
        'is_weekend': (ts.dt.weekday >= 5).astype(int)
    })


# ------------------------------------------------
# 2) TYPE MULTI-LABEL ENCODER
# ------------------------------------------------
class SimpleMLB(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mlb = MultiLabelBinarizer()

    def parse(self, s):
        s = s.strip("{}")
        if not s:
            return []
        return [x.strip() for x in s.split(",") if x.strip()]

    def fit(self, X, y=None):
        lists = [self.parse(x) for x in X]
        self.mlb.fit(lists)
        return self

    def transform(self, X):
        lists = [self.parse(x) for x in X]
        return self.mlb.transform(lists)

    def get_feature_names_out(self, input_features=None):
        return self.mlb.classes_


# ------------------------------------------------
# 3) COLUMN TRANSFORMER (ALL FEATURES)
# ------------------------------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ('datetime', FunctionTransformer(extract_datetime, validate=False), ['timestamp']),
        ('type', SimpleMLB(), 'type'),
        ('comment', TfidfVectorizer(), 'comment_keywords'),
    ],
    remainder='drop'
)

pipeline = Pipeline([
    ('preprocess', preprocessor)
])

In [17]:
def pipeline_to_df(pipeline, df):
    """
    Transform df with pipeline and return a DataFrame with column names.
    Works with datetime + SimpleMLB + TfidfVectorizer.
    """
    ct = pipeline.named_steps['preprocess']   # ColumnTransformer

    # 1) Datetime feature names (we know them)
    dt_cols = ['year', 'month', 'day', 'weekday', 'hour', 'is_weekend']

    # 2) Type multilabel names (from SimpleMLB)
    type_encoder = ct.named_transformers_['type']
    type_cols = list(type_encoder.mlb.classes_)

    # 3) Comment TF-IDF names
    comment_vectorizer = ct.named_transformers_['comment']
    comment_cols = comment_vectorizer.get_feature_names_out().tolist()

    # Combine all names in correct order
    colnames = dt_cols + type_cols + comment_cols

    # Get transformed matrix
    X = pipeline.transform(df)

    # Return DataFrame
    return pd.DataFrame(X.toarray() if hasattr(X, "toarray") else X,
                        columns=colnames,
                        index=df.index)


In [18]:
pipeline.fit(df)

X_df = pipeline_to_df(pipeline, df)

# print(X_df.shape)
print(X_df.head())


     year  month   day  weekday  hour  is_weekend  PM2.5  การเดินทาง  กีดขวาง  \
0  2021.0   12.0  23.0      3.0  10.0         0.0    0.0         0.0      0.0   
1  2022.0    1.0   2.0      6.0  10.0         1.0    0.0         0.0      0.0   
2  2022.0    1.0  14.0      4.0   1.0         0.0    0.0         0.0      0.0   
3  2022.0    1.0  16.0      6.0  11.0         1.0    0.0         0.0      0.0   
4  2022.0    1.0  17.0      0.0   2.0         0.0    0.0         0.0      0.0   

   คนจรจัด  ...   เข   เด  เทศก  เมตร        เร  เลขท  เหต        แจ  แตก  \
0      0.0  ...  0.0  0.0   0.0   0.0  0.632734   0.0  0.0  0.485235  0.0   
1      0.0  ...  0.0  0.0   0.0   0.0  0.000000   0.0  0.0  0.000000  0.0   
2      0.0  ...  0.0  0.0   0.0   0.0  0.000000   0.0  0.0  0.000000  0.0   
3      0.0  ...  0.0  0.0   0.0   0.0  0.000000   0.0  0.0  0.000000  0.0   
4      0.0  ...  0.0  0.0   0.0   0.0  0.000000   0.0  0.0  0.000000  0.0   

        แยก  
0  0.000000  
1  0.868716  
2  0.000

In [21]:
X_df['กีดขวาง'].value_counts()

กีดขวาง
0.0    150729
1.0     22104
Name: count, dtype: int64