In [1]:
import nltk
from nltk.stem.porter import PorterStemmer
import pandas as pd
from stop_words import stop_words
import re
from sklearn.preprocessing import LabelEncoder

In [2]:
# nltk.download("punkt") 

## PREPROCESSING

In [3]:
stemmer = PorterStemmer()
label_encoder = LabelEncoder()

#### 1. TOKENISASI
#### 2. STEMMING
#### 3. STOP WORDS REMOVAL

In [4]:
def steaming(word):
    stemmer = PorterStemmer()
    stemmed_word = stemmer.stem(word)
    return stemmed_word

In [5]:
def preprocessing(text):
    # Menghilangkan tanda baca
    text = re.sub(r'[^\w\s-]', ' ', text)

    # Memisahkan teks menjadi kata-kata
    words = text.split()
    print(words)
    
    # Menghilangkan stopwords
    filtered_words = [word for word in words if word.lower() not in stop_words]

    # Menghilangkan singkatan sederhana
    filtered_words = [re.sub(r'\.', '', word) for word in filtered_words]

    # Menghilangkan angka
    filtered_words = [re.sub(r'\d', '', word) for word in filtered_words]
    
    filtered_words = set(filtered_words)
    
    # STEAMING
    filtered_words = [steaming(i) for i in filtered_words]

    return ' '.join(sorted(filtered_words))


In [6]:
df = pd.read_csv("./dataset/dataset.csv")

In [50]:
patterns = df['patterns'].apply(preprocessing).tolist()
encoded_labels = label_encoder.fit_transform(df['tag'])

['Hi', 'Hey', 'How', 'are', 'you', 'Is', 'anyone', 'there', 'Hello', 'Good', 'day']
['Bye', 'See', 'you', 'later', 'Goodbye']
['Thanks', 'Thank', 'you', 'That', 's', 'helpful', 'Thank', 's', 'a', 'lot']
['Which', 'items', 'do', 'you', 'have', 'What', 'kinds', 'of', 'items', 'are', 'there', 'What', 'do', 'you', 'sell']
['Do', 'you', 'take', 'credit', 'cards', 'Do', 'you', 'accept', 'Mastercard', 'Can', 'I', 'pay', 'with', 'Paypal', 'Are', 'you', 'cash', 'only']
['How', 'long', 'does', 'delivery', 'take', 'How', 'long', 'does', 'shipping', 'take', 'When', 'do', 'I', 'get', 'my', 'delivery']
['Tell', 'me', 'a', 'joke', 'Tell', 'me', 'something', 'funny', 'Do', 'you', 'know', 'a', 'joke']


In [51]:
patterns = [''.join(i) for i in patterns]

In [55]:
patterns

['day good hey',
 'bye goodby',
 'help lot',
 'item kind sell',
 'accept card cash credit mastercard pay paypal',
 'deliveri long ship',
 'funni joke']

In [8]:
encoded_labels

array([3, 2, 6, 4, 5, 0, 1])

In [33]:
all_words = ' '.join(patterns).split()
len(all_words)

'day good hey bye goodby help lot item kind sell accept card cash credit mastercard pay paypal deliveri long ship funni joke'

## BAG OF WORDS

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [54]:
# create bag of words
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(patterns)

# print bag of words
print(X.toarray())

[[0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0]
 [1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0]]


## TF-IDF

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [60]:
# create tf-idf
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(patterns)

# print idf
print(vectorizer.idf_)

[2.38629436 2.38629436 2.38629436 2.38629436 2.38629436 2.38629436
 2.38629436 2.38629436 2.38629436 2.38629436 2.38629436 2.38629436
 2.38629436 2.38629436 2.38629436 2.38629436 2.38629436 2.38629436
 2.38629436 2.38629436 2.38629436 2.38629436]


In [56]:
# Mendapatkan fitur (kata-kata)
features = vectorizer.get_feature_names_out()

# Mendapatkan nilai IDF
idf_values = vectorizer.idf_

# Membuat DataFrame untuk menampilkan hasil
idf_df = pd.DataFrame(list(zip(features, idf_values)), columns=['Feature', 'IDF'])
print(idf_df)

AttributeError: 'CountVectorizer' object has no attribute 'idf_'

## DATA PREPARATION

In [42]:
from sklearn.model_selection import train_test_split

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, encoded_labels, test_size=0.10, random_state=42)
display(X_train, y_train)

<6x22 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

array([2, 0, 6, 5, 4, 1])

## TRAINING MODEL

In [63]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

# Inisialisasi model Sequential
model = Sequential()

# Lapisan Konvolusi Pertama
model.add(Conv2D(32, (3, 3), input_shape=(64, 64, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Lapisan Konvolusi Kedua
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Lapisan Konvolusi Ketiga
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Lapisan Flatten
model.add(Flatten())

# Lapisan Fully Connected (Dense)
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Untuk klasifikasi biner

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Tampilkan ringkasan model
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 62, 62, 32)        896       
                                                                 
 max_pooling2d (MaxPooling2  (None, 31, 31, 32)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 29, 29, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 14, 14, 64)        0         
 g2D)                                                            
                                                                 
 conv2d_2 (Conv2D)           (None, 12, 12, 128)       73856     
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 6, 6, 128)         0

In [67]:
history = model.fit(X_train, y_train, epochs=100, batch_size=256, validation_split=0.1)

InvalidArgumentError: {{function_node __wrapped__SerializeManySparse_device_/job:localhost/replica:0/task:0/device:CPU:0}} indices[1] = [0,1] is out of order. Many sparse ops require sorted indices.
    Use `tf.sparse.reorder` to create a correctly ordered copy.

 [Op:SerializeManySparse] name: 