In [31]:
import json
import math
import os
import re
import string
import time
import unicodedata
from typing import List
from typing import List, Optional, Union, Iterable

import pandas as pd
from twikit import Tweet

# Credential placeholder (presumably for an LLM service; not used in the visible cells).
# Read it from the environment so a real key is never saved inside the notebook;
# when the variable is unset the value is '' exactly as before.
LLM_API_KEY = os.environ.get('LLM_API_KEY', '')

def clean_data(
    raw: Optional[Union[pd.Series, Iterable[str]]] = None,
    extra_stopwords: Optional[List[str]] = None
) -> List[Union[str, float]]:
    """
    Clean a batch of raw texts while preserving input order and length.

    Every non-missing entry goes through these steps, in order:
      1. cast to ``str`` and lowercase;
      2. remove emojis, URLs, variation selectors / zero-width joiners,
         ``#hashtags``, ``@mentions`` and standalone numbers;
      3. fold line breaks and other whitespace runs into single spaces;
      4. strip ASCII punctuation (``string.punctuation``);
      5. drop tokens listed in ``extra_stopwords``;
      6. remove any remaining Unicode control, punctuation and symbol characters;
      7. collapse whitespace again and trim both ends.

    Args:
        raw: pandas Series or any iterable of texts. ``None`` is treated as an
            empty input.
        extra_stopwords: words to remove. Matching is case-insensitive because
            both the text and the stopwords are lowercased.

    Returns:
        A list with the same length and order as the input. Entries that were
        ``None`` or float NaN are returned as float NaN (so callers can drop
        them with ``dropna``); every other entry is a cleaned string, which
        may be empty.
    """
    # Convert input to list, preserving order
    if raw is None:
        raw_list: List[Optional[str]] = []
    elif isinstance(raw, pd.Series):
        raw_list = raw.tolist()
    else:
        raw_list = list(raw)

    # Compile regex patterns once per call
    emoji_pattern = re.compile(
        '['
        '\U0001F1E6-\U0001F1FF'  # Flags
        '\U0001F300-\U0001F5FF'  # Symbols & pictographs
        '\U0001F600-\U0001F64F'  # Emoticons
        '\U0001F680-\U0001F6FF'  # Transport & map symbols
        '\U0001F700-\U0001F77F'  # Alchemical symbols
        '\U0001F780-\U0001F7FF'  # Geometric shapes extended
        '\U0001F800-\U0001F8FF'  # Supplemental arrows
        '\U0001F900-\U0001F9FF'  # Supplemental symbols
        '\U0001FA00-\U0001FA6F'  # Chess symbols
        '\U0001FA70-\U0001FAFF'  # Symbols & pictographs extended
        '\u2600-\u26FF'          # Misc symbols
        '\u2700-\u27BF'          # Dingbats
        ']+', flags=re.UNICODE
    )
    vs_pattern = re.compile(r'[\uFE0E\uFE0F\u200D]')
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    hashtag_pattern = re.compile(r'#\w+')
    mention_pattern = re.compile(r'@\w+')
    number_pattern = re.compile(r'\b\d+\b')
    whitespace_pattern = re.compile(r'\s+')

    # Table to strip ASCII punctuation
    punc_table = str.maketrans('', '', string.punctuation)

    # Normalize stopwords
    stopwords = set(w.lower() for w in (extra_stopwords or []))

    cleaned: List[Union[str, float]] = []
    for text in raw_list:
        # Missing values stay missing. math.nan is used instead of np.nan
        # because numpy is not imported here.
        if text is None or (isinstance(text, float) and math.isnan(text)):
            cleaned.append(math.nan)
            continue

        # Lowercase and cast
        s = str(text).lower()

        # Remove emojis and URLs
        s = emoji_pattern.sub('', s)
        s = url_pattern.sub('', s)

        # Remove variation selectors / joiners, hashtags, mentions, numbers
        s = vs_pattern.sub('', s)
        s = hashtag_pattern.sub('', s)
        s = mention_pattern.sub('', s)
        s = number_pattern.sub('', s)

        # Turn newlines/tabs into spaces BEFORE the category filter below;
        # they are control characters and would otherwise be deleted outright,
        # gluing neighbouring words together.
        s = whitespace_pattern.sub(' ', s)

        # Remove ASCII punctuation
        s = s.translate(punc_table)

        # Remove extra stopwords
        if stopwords:
            s = ' '.join(tok for tok in s.split() if tok not in stopwords)

        # Drop remaining Unicode control (C), punctuation (P) and symbol (S) chars
        s = ''.join(
            ch for ch in s
            if unicodedata.category(ch)[0] not in ('C', 'P', 'S')
        )

        # Stripping punctuation/symbols can leave double spaces ("a - b");
        # collapse them and trim the ends in one pass.
        s = ' '.join(s.split())

        cleaned.append(s)

    return cleaned


In [32]:
import os

# Query the working directory (the value is discarded because this is not the
# cell's last expression), then load the CSV relative to it.
os.getcwd()
data = pd.read_csv(filepath_or_buffer='../mlflow/artifact/data/cleaned_data.csv')

In [33]:
# clean_data accepts a Series directly and converts it to a list itself,
# so the column can be passed as-is; output order and length match the input.
data['tweet'] = clean_data(data['tweet'])

In [34]:
# Rename the cleaned 'tweet' column to 'text' for the downstream split.
data = data.rename({'tweet': 'text'}, axis='columns')

In [38]:
# DataFrame.dropna() returns a new frame and leaves `data` untouched, so the
# result must be assigned back; otherwise rows with missing values still flow
# into the train/test split below.
data = data.dropna()
data

Unnamed: 0,text,sentiment
0,new week same goal were dedicated to ensuring ...,POSITIVE
1,exactly because,POSITIVE
2,virginity is a social construct not a value me...,POSITIVE
3,ugh big brother watching my bowel movements no...,NEGATIVE
4,chudwaane ke liye aur kyon self objectificatio...,NEGATIVE
...,...,...
2325,share brings an allinone platform for develope...,POSITIVE
2326,responsible use starts in your classroom explo...,POSITIVE
2327,alexa live demonstration in paris by,NEUTRAL
2328,itw de david guyomarch directeur adjoint digit...,NEUTRAL


In [None]:
# TODO: write a train/test split function for the dataset class that takes one list as input and returns two datasets.
# TODO: check whether entries that come back as null can be dropped when returning.

In [41]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['text'],
    data['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [42]:
# Persist each split to its own CSV, in the same order as before.
for split, destination in (
    (train_texts, '../mlflow/artifact/data/X_train.csv'),
    (train_labels, '../mlflow/artifact/data/y_train.csv'),
    (test_texts, '../mlflow/artifact/data/X_test.csv'),
    (test_labels, '../mlflow/artifact/data/y_test.csv'),
):
    split.to_csv(destination)