In [75]:
import nltk
from nltk.stem.porter import PorterStemmer
import pandas as pd
from stop_words import stop_words
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [76]:
# nltk.download("punkt") 

## PREPROCESSING

#### 1. TOKENISASI
#### 2. STEMMING
#### 3. STOP WORDS REMOVAL

In [125]:
stemmer = PorterStemmer()
label_encoder = LabelEncoder()

In [78]:
def steaming(word):
    stemmer = PorterStemmer()
    stemmed_word = stemmer.stem(word)
    return stemmed_word

In [79]:
def preprocessing(text):
    # Menghilangkan tanda baca
    text = re.sub(r'[^\w\s-]', ' ', text)

    # Memisahkan teks menjadi kata-kata
    words = text.split()
    print(words)
    
    # Menghilangkan stopwords
    filtered_words = [word for word in words if word.lower() not in stop_words]

    # Menghilangkan singkatan sederhana
    filtered_words = [re.sub(r'\.', '', word) for word in filtered_words]

    # Menghilangkan angka
    filtered_words = [re.sub(r'\d', '', word) for word in filtered_words]
    
    filtered_words = set(filtered_words)
    
    # STEAMING
    filtered_words = [steaming(i) for i in filtered_words]

    return ' '.join(sorted(filtered_words))


In [216]:
json_data = pd.read_json("./dataset/intents.json")
df = pd.json_normalize(json_data['intents'])
df = df.drop('context_set', axis=1)

In [234]:
df['tag']

0           greeting
1            goodbye
2            creator
3               name
4              hours
5             number
6             course
7               fees
8           location
9             hostel
10             event
11          document
12            floors
13          syllabus
14           library
15    infrastructure
16           canteen
17              menu
18         placement
19             ithod
20       computerhod
21           extchod
22         principal
23               sem
24         admission
25       scholarship
26        facilities
27    college intake
28           uniform
29         committee
30            random
31             swear
32          vacation
33            sports
34         salutaion
35              task
36           ragging
37               hod
Name: tag, dtype: object

In [186]:
df['patterns'] = df['patterns'].apply(lambda x : ' '.join(x))

In [212]:
df.loc[0:5,['tag','patterns']]

Unnamed: 0,tag,patterns
0,greeting,Hi How are you? Is anyone there? Hello Good da...
1,goodbye,cya see you bye bye See you later Goodbye I am...
2,creator,what is the name of your developers what is th...
3,name,name your name do you have a name what are you...
4,hours,timing of college what is college timing worki...
5,number,more info contact info how to contact college ...


In [189]:
patterns = df['patterns'].apply(preprocessing)
encoded_labels = label_encoder.fit_transform(df['tag'])

['Hi', 'How', 'are', 'you', 'Is', 'anyone', 'there', 'Hello', 'Good', 'day', 'What', 's', 'up', 'how', 'are', 'ya', 'heyy', 'whatsup']
['cya', 'see', 'you', 'bye', 'bye', 'See', 'you', 'later', 'Goodbye', 'I', 'am', 'Leaving', 'Bye', 'Have', 'a', 'Good', 'day', 'talk', 'to', 'you', 'later', 'ttyl', 'i', 'got', 'to', 'go', 'gtg']
['what', 'is', 'the', 'name', 'of', 'your', 'developers', 'what', 'is', 'the', 'name', 'of', 'your', 'creators', 'what', 'is', 'the', 'name', 'of', 'the', 'developers', 'what', 'is', 'the', 'name', 'of', 'the', 'creators', 'who', 'created', 'you', 'your', 'developers', 'your', 'creators', 'who', 'are', 'your', 'developers', 'developers', 'you', 'are', 'made', 'by', 'you', 'are', 'made', 'by', 'whom', 'who', 'created', 'you', 'who', 'create', 'you', 'creators', 'who', 'made', 'you', 'who', 'designed', 'you']
['name', 'your', 'name', 'do', 'you', 'have', 'a', 'name', 'what', 'are', 'you', 'called', 'what', 'is', 'your', 'name', 'what', 'should', 'I', 'call', 'you

In [190]:
patterns

0                              day good heyi whatsup ya
1        bye bye cya day good goodby gtg leav talk ttyl
2                    creat creat creator design develop
3                                             call chat
4     attend colleg day guy hour open oper saturday ...
5     colleg colleg contact contact info number phon...
6     ai branch chemic civil colleg comput comput co...
7     boy colleg fee fee girl hostel non-ac non-ac r...
8                      address colleg locat locat reach
9     address big capac colleg distanc facil facil f...
10    colleg conduct event event event function held...
11    admis admiss bring document document need requ...
12    build campu colleg colleg engin floor size tal...
13               lectur syllabu syllabu technolog timet
14    book book colleg facil facil librari librari l...
15                                 colleg infrastructur
16    cafetaria cafetaria canteen colleg facil facil...
17            canteen colleg eat food food menu 

## TF-IDF

In [140]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

In [191]:
# create tf-idf
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(patterns)

# print idf
print(vectorizer.idf_)

[3.97041447 3.56494936 3.56494936 3.97041447 3.56494936 3.56494936
 3.97041447 3.97041447 3.97041447 3.97041447 3.97041447 3.97041447
 3.97041447 3.97041447 3.97041447 3.97041447 3.97041447 3.56494936
 3.97041447 3.97041447 3.97041447 3.97041447 3.97041447 3.56494936
 3.97041447 3.97041447 3.97041447 3.97041447 3.56494936 3.56494936
 3.97041447 1.61903921 3.97041447 3.97041447 3.97041447 3.97041447
 3.27726729 3.97041447 3.97041447 3.97041447 3.97041447 3.97041447
 3.97041447 3.97041447 3.27726729 3.97041447 3.97041447 3.97041447
 3.97041447 3.97041447 3.97041447 3.97041447 3.97041447 3.27726729
 3.97041447 3.97041447 3.97041447 2.7176515  3.56494936 3.97041447
 3.56494936 3.97041447 3.97041447 3.97041447 3.97041447 3.97041447
 3.97041447 3.27726729 3.97041447 3.97041447 3.56494936 3.97041447
 3.97041447 3.97041447 3.97041447 3.05412373 3.97041447 3.56494936
 3.97041447 3.97041447 3.97041447 3.97041447 3.56494936 3.97041447
 3.97041447 3.97041447 3.97041447 3.97041447 3.97041447 3.0541

In [196]:
df_tfidf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

In [221]:
df_tfidf.shape

(38, 156)

### DATA PREPARATION

In [198]:
X_train, X_test, y_train, y_test = train_test_split(df_tfidf, encoded_labels, test_size=0.10, random_state=42)
display(X_train, y_train)

Unnamed: 0,ac,activ,address,admis,admiss,ai,allot,antirag,ass,asshol,...,univrs,vacat,varieti,visit,wear,whatsup,whatv,work,ya,year
30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.225829,0.0,0.0,0.0,0.0,...,0.251515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27,0.0,0.0,0.0,0.0,0.0,0.0,0.291375,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24,0.0,0.0,0.0,0.0,0.639312,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.37275,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.393851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


array([28, 10,  5,  2,  0, 18, 22, 21,  1, 12, 19, 16, 37, 14, 30, 24,  7,
       13, 15,  9,  6,  3, 35, 23, 29, 31, 33,  8, 26, 25,  4, 11, 20, 36])

In [199]:
# NORMALISASI
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [200]:
norm_X_train = scaler.fit_transform(X_train)
norm_X_test = scaler.fit_transform(X_test)

In [201]:
display(norm_X_train, norm_X_test)

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.88124999],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

array([[0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

## TRAINING MODEL

In [223]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

In [237]:
# Membuat model NN
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(len(set(encoded_labels)), activation='softmax'))

# Kompilasi model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [238]:
# Melatih model
history = model.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [228]:
loss, accuracy = model.evaluate(norm_X_test, y_test)



In [270]:
# Langkah 1: Preprocessing Data Baru
new_data = [
        'Hi there! How can I help you?',
        'What courses are available at the university?',
        'How can I apply for admission?',
        'Tell me about the campus facilities.',
        'Are there any upcoming events?',
        'How can I contact the university?',"is there any canteen",   
        "Who is the head of the Information Technology department?",
    "Who is the head of the Computer Science department?",
  
    ]

# Lakukan TF-IDF pada data baru
new_data_vectorized = vectorizer.transform(new_data)

# Langkah 3: Prediksi
predictions = model.predict(new_data_vectorized)



In [271]:
predictions = model.predict(new_data_vectorized)

# Ambil label sebelum diencoded
predicted_labels = [df['tag'].iloc[prediction.argmax()] for prediction in predictions]

# Tampilkan hasil prediksi
for i, data in enumerate(new_data):
    print(f"Data: {data}, Predicted Label: {df['responses'].loc[df['tag'] == predicted_labels[i]].tolist()[0][0]}") 

Data: Hi there! How can I help you?, Predicted Label: To know about placement visit <a target="_blank" href="PLACEMENT INFORMATION LINK FROM YOUR UNIVERSITY WEBSITE IF THEY HAVE">here</a>
Data: What courses are available at the university?, Predicted Label: To know about placement visit <a target="_blank" href="PLACEMENT INFORMATION LINK FROM YOUR UNIVERSITY WEBSITE IF THEY HAVE">here</a>
Data: How can I apply for admission?, Predicted Label: To know about placement visit <a target="_blank" href="PLACEMENT INFORMATION LINK FROM YOUR UNIVERSITY WEBSITE IF THEY HAVE">here</a>
Data: Tell me about the campus facilities., Predicted Label: To know about placement visit <a target="_blank" href="PLACEMENT INFORMATION LINK FROM YOUR UNIVERSITY WEBSITE IF THEY HAVE">here</a>
Data: Are there any upcoming events?, Predicted Label: To know about placement visit <a target="_blank" href="PLACEMENT INFORMATION LINK FROM YOUR UNIVERSITY WEBSITE IF THEY HAVE">here</a>
Data: How can I contact the univers