In [1]:
#FOR ANALYSIS/CLEANING/COMPUTATION:
import pandas as pd
import numpy as np

#FOR VISUALIZATION:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [None]:
import time
import pandas as pd

# Read the dataset from disk and report how long the read took.
print("Loading data file now, this could take a while depending on file size")
start = time.time()
df = pd.read_csv('DATASET.csv')  # ADD-CSV: point this at the dataset file
end = time.time()
print(f"Loading took {round(end - start, 2)} seconds\n")

# Shape summary; the comma-separated print keeps the original spacing.
print("Number of rows : ", len(df.index), " and the number of columns : ", len(df.columns))

# Data-quality summary: total missing cells and fully duplicated rows.
missing_values = df.isna().sum().sum()
duplicated_values = df.duplicated().sum()
print(f'\nMissing values: {missing_values}')
print(f'Duplicated values: {duplicated_values}')

# Only break missing values down per column when there is something to show.
if missing_values > 0:
    print('\nMissing values by column:')
    print(df.isna().sum())

print("\nUnique Values in Each Column:")
print(df.nunique())

In [3]:
# Drop every row that contains at least one missing value, modifying df in place.
df.dropna(inplace=True)

In [None]:
# Number of rows per distinct value in the 'label' column.
df['label'].value_counts()

# View Data

In [None]:
def plot_categorical_distributions(columns, data=df, palette='muted'):
    """Draw one pie chart per column showing the share of each category.

    Parameters
    ----------
    columns : list of str
        Names of the columns in ``data`` to plot.
    data : pandas.DataFrame
        Frame the columns are read from. The default is the ``df`` object
        that exists at the moment this function is defined.
    palette : str
        Seaborn palette name used for the slice colours.
    """
    # Keep the three-slot row for up to three columns and widen the grid when
    # more are requested; a fixed 1x3 grid raised on the fourth column.
    n_slots = max(3, len(columns))
    plt.figure(figsize=(15, 6))

    for position, column_name in enumerate(columns, start=1):
        plt.subplot(1, n_slots, position)
        value_counts = data[column_name].value_counts()
        # One explode offset per slice. Counting distinct *counts* (nunique)
        # gave a list that was too short whenever two categories tied.
        value_counts.plot.pie(
            autopct='%1.1f%%',
            colors=sns.color_palette(palette),
            startangle=90,
            explode=[0.05] * len(value_counts),
        )
        plt.title(f'Percentage Distribution of {column_name}')
        plt.ylabel('')

    plt.tight_layout()
    plt.show()


# Plot the category shares of the 'label' column.
columns_to_plot = ['label']
plot_categorical_distributions(columns_to_plot)

# Clean Text

In [6]:
import re
import numpy as np
def clean_text(text):
    """Normalise one review string for tokenization.

    Missing values are passed through as ``np.nan``. Otherwise three
    substitutions run in order: 'Ã' followed by bytes in \\x80-\\xBF becomes a
    space, every character outside ASCII letters, the Thai ranges in the
    pattern and whitespace becomes a space, and whitespace runs collapse to
    a single space. The result is stripped and lower-cased.
    """
    if pd.isna(text):
        return np.nan

    substitutions = (
        (r'Ã[\x80-\xBF]+', ' '),
        (r'[^a-zA-Zก-ฮะ-์\s]', ' '),
        (r'\s+', ' '),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip().lower()

In [7]:
# Store the cleaned version of each review in a new 'Clean_text' column.
df['Clean_text'] = df['Review'].apply(clean_text)

In [None]:
# Remove the column-width limit, then print the first review before and after cleaning.
pd.set_option('display.max_colwidth', None)
print(df.iloc[0]['Review'])
print(df.iloc[0]['Clean_text'])

# ทำ TOKENIZATION

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Download the NLTK resources named below (tokenizer data, stopword lists, WordNet).
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [10]:
def tokenize_text(text):
    """Split ``text`` into a list of tokens with NLTK's ``word_tokenize``."""
    return word_tokenize(text)
# Tokenize every cleaned review; each 'Tokens' cell holds a list of tokens.
df['Tokens'] = df['Clean_text'].apply(tokenize_text)

# Normalisation

In [11]:
# Shorthand -> expanded form, looked up per whole token.
# NOTE(review): keys that start with a digit cannot match text that went
# through clean_text, because that step replaces digits with spaces.
norm_dict = {
    '2morrow': 'tomorrow', '2nite': 'tonight', '4ever': 'forever',
    '4get': 'forget', '4give': 'forgive', '4got': 'forgot',
    '4th': 'fourth', '4ward': 'forward', '4warned': 'forewarned',
    '4wrd': 'forward', 'abt': 'about', 'acc': 'account',
    'acct': 'account', 'add': 'address', 'addy': 'address',
    'admin': 'administrator', 'advert': 'advertise', 'advice': 'advise',
    'aftr': 'after', 'agri': 'agriculture', 'aint': 'am not',
    'alot': 'a lot', 'alrite': 'all right', 'alryt': 'all right',
    'alwys': 'always', 'amblnc': 'ambulance', 'amnt': 'amount',
    'amp': 'amplifier', 'aniversary': 'anniversary', 'anniv': 'anniversary',
}


def normalize_text(text):
    """Expand shorthand words in ``text`` using ``norm_dict``.

    Only whole whitespace-delimited words are replaced, and the original
    whitespace is kept as is. The earlier ``str.replace`` approach also
    rewrote substrings, turning "add my address" into
    "address my addressress".

    Parameters
    ----------
    text : str
        Text to normalise.

    Returns
    -------
    str
        ``text`` with every word found in ``norm_dict`` replaced.
    """
    # The capturing group keeps the whitespace runs in the split result, so
    # joining the pieces reproduces the original spacing exactly.
    pieces = re.split(r'(\s+)', text)
    return ''.join(norm_dict.get(piece, piece) for piece in pieces)

In [12]:
def normalize_tokens(tokens):
    """Map each token through ``norm_dict``; tokens without an entry are kept unchanged."""
    normalized = []
    for token in tokens:
        normalized.append(norm_dict[token] if token in norm_dict else token)
    return normalized

# Apply the dictionary-based normalization to every token list.
df['normalize'] = df['Tokens'].apply(normalize_tokens)

In [13]:
# Extra words removed in addition to the NLTK English stopwords.
# The same set is declared again in a later cell.
custom_stopwords = {
    'app', 'music', 'play', 'spotify', 'song',
    'songs', 'listen', 'playing', 'get', 'playlist',
}

In [14]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Load the NLTK English stopword list once and reuse it on later calls."""
    return frozenset(stopwords.words('english'))


def remove_stopwords(tokens):
    """Return ``tokens`` without English or custom stopwords.

    The comparison is case-insensitive (each token is lower-cased before the
    lookup); kept tokens are returned unchanged and in their original order.
    ``custom_stopwords`` is read from the notebook namespace at call time, so
    editing that collection takes effect on the next call.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document.

    Returns
    -------
    list of str
        The tokens that are not stopwords.
    """
    # The English list is cached: this function runs once per row, and loading
    # the corpus on every call repeated the same work for each row.
    stop_words = _english_stopwords()
    if custom_stopwords:
        stop_words = stop_words | set(custom_stopwords)
    return [word for word in tokens if word.lower() not in stop_words]

### เลือก Custom Stopwords

In [15]:
# Extra words removed in addition to the NLTK English stopwords.
# An identical set is already declared in an earlier cell.
custom_stopwords = {
    'app', 'music', 'play', 'spotify', 'song',
    'songs', 'listen', 'playing', 'get', 'playlist',
}

In [16]:
# Filter stopwords out of each token list.
# NOTE(review): this reads df['Tokens'], not the df['normalize'] column built
# earlier, so the normalization step does not feed the rest of the pipeline.
# Confirm that this is intended.
df['Filtered_Tokens'] = df['Tokens'].apply(remove_stopwords)

วนลบ Stop Words โดยใช้ Counter

In [None]:
from collections import Counter 
# Flatten the per-row token lists into one long list.
all_filtered_tokens = []
for tokens in df['Filtered_Tokens']:
    all_filtered_tokens.extend(tokens)

# Frequency table of the remaining words, then the 20 most frequent entries.
word_counts_after_removal = Counter(all_filtered_tokens)
most_common_words_after_removal = word_counts_after_removal.most_common(20)

print(most_common_words_after_removal)


# แก้ LEMMATIZATION พวก V-ing 

In [18]:
from nltk.stem import WordNetLemmatizer
# Shared WordNet lemmatizer instance, used by lemmatize_tokens.
lemmatizer = WordNetLemmatizer()

In [19]:
def lemmatize_tokens(tokens):
    """Lemmatize every token as a verb with the shared ``lemmatizer``.

    Author's note on the ``pos`` choice: 'v' (verb) is used for better
    accuracy, e.g. running -> run; 'a' would be the tag for cases such as
    better -> good.
    """
    lemmas = []
    for word in tokens:
        lemmas.append(lemmatizer.lemmatize(word, pos='v'))
    return lemmas

In [20]:
# Lemmatize the stopword-filtered tokens of every row.
df['Lemmatized_Tokens'] = df['Filtered_Tokens'].apply(lemmatize_tokens)

In [None]:
# Preview the first ten rows.
df.head(10)

# การทำเป็น Input ให้โมเดล
### แบ่งได้ 3 วิธี 
1. ทำ TF-IDF
2. ทำ One-Hot Encoding
3. ทำ Word Embeddings

## 1. ใช้ sklearn text ทำ Tfidf_matrix และ Feature_names
ง่ายเร็ว คำมีความเกี่ยวข้องกัน ดี กับ ดีมาก มีค่าใกล้กัน

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

def apply_tfidf(df, max_features=10000):
    """Fit a TF-IDF vectorizer on the lemmatized tokens of ``df``.

    Side effect: adds a 'TFIDF_Tokens' column to ``df`` holding each row's
    lemmatized tokens joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Lemmatized_Tokens' column of token lists.
    max_features : int
        Passed to ``TfidfVectorizer`` as ``max_features``.

    Returns
    -------
    tuple
        ``(tfidf_matrix, feature_names)`` as returned by ``fit_transform`` and
        ``get_feature_names_out``.
    """
    df['TFIDF_Tokens'] = df['Lemmatized_Tokens'].apply(' '.join)

    vectorizer_options = dict(
        stop_words='english',
        max_features=max_features,
        encoding='utf-8',
        decode_error='replace',
        lowercase=False,
    )
    tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['TFIDF_Tokens'])
    return tfidf_matrix, tfidf_vectorizer.get_feature_names_out()


In [None]:
# Build the TF-IDF matrix; apply_tfidf also adds a 'TFIDF_Tokens' column to df.
tfidf_matrix, feature_names = apply_tfidf(df)
df.head(2)

In [79]:
# Print the sparse TF-IDF rows of the first two reviews, numbered from 1.
np.set_printoptions(precision=2)
for review_no, tfidf_row in enumerate(tfidf_matrix[:2], start=1):
    print(f"Review {review_no} : {tfidf_row}")

Review 0 : <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10 stored elements and shape (1, 10000)>
  Coords	Values
  (0, 3121)	0.19645623338718804
  (0, 8265)	0.2699783522857679
  (0, 519)	0.3252689617050176
  (0, 3551)	0.38189959165106974
  (0, 7620)	0.28049351556628804
  (0, 2116)	0.2644618482301777
  (0, 9543)	0.17685364605762688
  (0, 7632)	0.4456317394459237
  (0, 2700)	0.38212337351102066
  (0, 8860)	0.3352157943109041
Review 1 : <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 7 stored elements and shape (1, 10000)>
  Coords	Values
  (0, 3121)	0.20186968304578454
  (0, 3871)	0.49057624581594506
  (0, 7463)	0.39818900683462105
  (0, 6723)	0.48138994952960623
  (0, 7682)	0.3694340736085449
  (0, 8852)	0.34968223570709517
  (0, 8698)	0.2636922614032367


In [24]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

def generate_wordcloud(df, sentence_column, emotion_column, condition_value, figsize=(15, 6)):
    """Render a word cloud for the rows of ``df`` that match one label value.

    Parameters:
        df (DataFrame): pandas DataFrame holding the data.
        sentence_column (str): name of the column that contains the text.
        emotion_column (str): name of the column used to filter the rows.
        condition_value (str): value ``emotion_column`` must equal for a row
            to be included.
        figsize (tuple): size of the matplotlib figure (default: (15, 6)).

    Raises:
        ValueError: if no text is left after filtering, for example when
            ``condition_value`` does not occur in ``emotion_column``.
    """
    matching_rows = df[df[emotion_column] == condition_value]
    filtered_text = " ".join(matching_rows[sentence_column])

    # Fail early with a message that names the filter; otherwise the error
    # only surfaces inside WordCloud without saying which filter was empty.
    if not filtered_text.strip():
        raise ValueError(
            f"No text found in '{sentence_column}' where "
            f"{emotion_column} == {condition_value!r}; cannot build a word cloud."
        )

    # Named so it does not hide the NLTK `stopwords` module imported in the notebook.
    cloud_stopwords = set(STOPWORDS)
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        stopwords=cloud_stopwords,
        collocations=True,
        max_words=2000
    ).generate(filtered_text)

    # Show the word cloud without axes.
    plt.figure(figsize=figsize)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"WordCloud for {emotion_column} = {condition_value}", fontsize=16)
    plt.show()

def plot_emotion_wordclouds(df, sentence_column, emotion_column, figsize=(15, 6)):
    """Render one word cloud per distinct value of ``emotion_column``.

    Parameters:
        df (DataFrame): DataFrame holding the labels and the text.
        sentence_column (str): name of the column that contains the text.
        emotion_column (str): name of the column that contains the label.
        figsize (tuple): size of each figure (default: (15, 6)).

    Returns:
        None
    """
    # Distinct labels in sorted order. Missing labels are skipped: they can
    # never match the equality filter below, and mixing them with strings
    # breaks sorting.
    unique_emotions = sorted(df[emotion_column].dropna().unique())

    # The stopword set does not change between labels, so build it once.
    cloud_stopwords = set(STOPWORDS)

    for emotion in unique_emotions:
        # Join all text that belongs to this label.
        emotion_review = " ".join(df[df[emotion_column] == emotion][sentence_column])

        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            stopwords=cloud_stopwords,
            collocations=True,
            max_words=2000
        ).generate(emotion_review)

        # Open the figure only after generation succeeded, so a failure does
        # not leave an empty figure behind.
        plt.figure(figsize=figsize)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(f"WordCloud for {emotion_column} = {emotion}", fontsize=16)
        plt.show()


In [None]:
# Word clouds for the two named labels, followed by one per distinct label.
# NOTE(review): if 'POSITIVE' and 'NEGATIVE' are values of df['label'], the last
# call renders them a second time. Confirm whether both styles are wanted.
generate_wordcloud(df, sentence_column='TFIDF_Tokens', emotion_column='label', condition_value='POSITIVE')
generate_wordcloud(df, sentence_column='TFIDF_Tokens', emotion_column='label', condition_value='NEGATIVE')
plot_emotion_wordclouds(df,sentence_column ='TFIDF_Tokens',emotion_column ='label')

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Features (TF-IDF matrix) and target labels for the classifiers.
# NOTE(review): tfidf_matrix was fitted on every row of df before this split,
# so the vocabulary and IDF weights have already seen the test rows. Fitting
# the vectorizer on the training rows only would avoid that leakage.
X = tfidf_matrix
y = df['label']

# A fixed seed keeps the split, and every score computed from it, identical
# between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default-configured logistic regression and score it on the held-out split.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
real_accuracy = 100 * accuracy  # same score expressed as a percentage
print(f"Logistic Regression Accuracy: {real_accuracy:.4f} %")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Predict the held-out rows with the fitted model and print per-class metrics.
y_pred_class = model.predict(X_test)
print("Classification Accuracy:", accuracy_score(y_test, y_pred_class))
print("Classification Report:\n", classification_report(y_test, y_pred_class))

# Take the label order from the fitted model so the matrix rows/columns and the
# tick labels always agree; a hard-coded label list breaks or mislabels the
# plot when the dataset's labels differ from it.
cm = confusion_matrix(y_test, y_pred_class, labels=model.classes_)

# Plot confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix for sentiment Classification")
plt.show()

# One-Hot Encoding
ง่ายเร็วแต่คำทุกคำจะไม่เกี่ยวข้องหรือสื่อถึงกันได้ เช่น ดี กับ ดีมาก model จะมองว่า 2 คำนี้ต่างกันแบบสิ้นเชิง

In [None]:
result = df['Lemmatized_Tokens'].tolist()

# Show only the first few token lists; printing every row of the dataset
# floods the notebook output.
for row in result[:5]:
    print(row)

In [30]:
# Vocabulary: every distinct token across all rows, skipping blank tokens.
unique_words = {word for row in result for word in row if word.strip()}

In [None]:
# Peek at nine vocabulary entries (positions 1-9 of the set's arbitrary order).
list(unique_words)[1:10]

In [None]:
total_word = len(unique_words)
# Number the words in sorted order. Iterating the set directly gives an order
# that can change from one interpreter session to the next, which would give
# the same word a different id on every run.
w2ids = {w: idx for idx, w in enumerate(sorted(unique_words))}
print(w2ids)

In [33]:
import numpy as np
def One_Hot_Encode(x, n_class):
    """One-hot encode an index, or an array of indices, over ``n_class`` classes.

    Parameters
    ----------
    x : int or array-like of int
        Class index or indices. Negative values count from the end, as with
        normal NumPy indexing.
    n_class : int
        Total number of classes (length of each encoded vector).

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``np.shape(x) + (n_class,)`` with a single 1.0
        per vector.

    Raises
    ------
    IndexError
        If an index is out of range or not an integer.
    """
    indices = np.asarray(x)
    # Allocate only the vectors that are needed. Indexing into np.eye(n_class)
    # built an n_class x n_class matrix on every call, which does not scale to
    # a vocabulary-sized n_class.
    encoded = np.zeros(indices.shape + (n_class,), dtype=float)
    np.put_along_axis(encoded, indices[..., None], 1.0, axis=-1)
    return encoded

In [None]:
# Print the one-hot vector of the first five vocabulary entries.
count = 0
for count, (w, index) in enumerate(w2ids.items(), start=1):
    print(f"{w:<10}", '\t', One_Hot_Encode(index, total_word))
    if count >= 5:
        break

In [None]:
all_encodings = []
# Ids and one-hot vectors for the tokens of the first row.
tokens_ids = [w2ids[tk] for tk in result[0]]
tokens_encode = [One_Hot_Encode(token_id, total_word) for token_id in tokens_ids]

print(f"{'word':<5}\t{'id':<5}\t{'encoding':<100}")
# The loop variable is `token_id`, not `id`: a top-level loop variable named
# `id` would replace the builtin `id()` for the rest of the session.
for tk, token_id, en in zip(result[0], tokens_ids, tokens_encode):
    print(f"{tk:<5}\t{token_id:<5}\t{str(en):<100}")

# Summing the one-hot vectors gives per-word counts for the whole row.
print(result[0], ':', sum(tokens_encode))


In [None]:
# Small frame holding the first five token lists in a 'text' column.
new_df = pd.DataFrame({'text': result[0:5]})
new_df

In [None]:
# Expand each token list into one value per token, build indicator columns for
# the tokens, then sum the indicators back per original row (level 0).
pd.get_dummies(new_df.text.apply(pd.Series).stack()).groupby(level=0).sum()

## ใช้ sklearn

In [38]:
from sklearn.preprocessing import MultiLabelBinarizer
# scikit-learn MultiLabelBinarizer instance, fitted in the next cell.
mlb = MultiLabelBinarizer()

In [None]:
# Fit the binarizer on all token lists.
mlb.fit(result)

In [None]:
# Inspect the classes learned by the fitted binarizer.
mlb.classes_

# Count Vectorizer
เหมือน One-Hot Encoding แต่จะเพิ่มความถี่ของคำเข้าไปได้ คำไม่เกี่ยวข้องกันเหมือนเดิม

In [61]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [74]:
# Token lists from the dataset, plus a tiny hand-written corpus.
result = df['Lemmatized_Tokens'].tolist()
texts = ["cat cat fish", "cat fish", "fish bird", "bird"]

In [75]:
# Join tokens back into strings
joined_result = [' '.join(tokens) for tokens in result]

cv = CountVectorizer()
CV_fit = cv.fit_transform(texts)
word_list = cv.get_feature_names_out()
count_list =  np.asarray(CV_fit.sum(axis=0))[0]

In [76]:
# Word -> total count over the corpus the vectorizer was fitted on.
print(dict(zip(word_list, count_list)))

{'bird': np.int64(2), 'cat': np.int64(3), 'fish': np.int64(3)}


In [77]:
# Dense count matrix: one row per text, one column per entry of word_list.
CV_fit.toarray()

array([[0, 2, 1],
       [0, 1, 1],
       [1, 0, 1],
       [1, 0, 0]])

# Word Embedding
ไม่มีใครใช้แล้ว