In [29]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# Load cleaned dataset
df = pd.read_csv('./Dataset_Prosedur_Akademik_labeled.csv', encoding='utf-8-sig')
dfs = pd.read_csv('./Dataset_Prosedur_Akademik_clean.csv', encoding='utf-8-sig')

print(df.tail())

questions = df['Question'].tolist()  
classification = df['category'].tolist()

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(questions, classification, test_size=0.3, random_state=42)

# Vectorize
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train model
clf = LogisticRegression(class_weight='balanced', max_iter=1000)
clf.fit(X_train_vec, y_train)

# Predict
y_pred = clf.predict(X_test_vec)

# Vectorize the full dataset
X_all_vec = vectorizer.transform(questions)

# Predict on the full dataset
y_all_pred = clf.predict(X_all_vec)

# Add predicted labels as a new column
df['predicted_category'] = y_all_pred


pd.set_option('display.max_colwidth', None)  # Show full content of each column
pd.set_option('display.max_columns', None)   # Show all columns
pd.set_option('display.width', 1000)  
print(df[['Question', 'category', 'predicted_category']].head(20))  # Show only first 20 rows


# Evaluation
print("🔍 Accuracy:", accuracy_score(y_test, y_pred))
# print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
# print("📉 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Save model and vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(clf, 'faq_classifier_model.pkl')


                                                                    Question                                                                                                                                                                                                                                                                                                                                                                Answer                                                         category
177                        Bagaimana prosedur Pembuatan Surat Peringatan 3?"  1. Jurusan mengajukan usulan ke Wadir I setelah 2 surat peringatan sebelumnya. 2. Wadir I setujui dalam 1 hari kerja. 3. Bagian Akademik membuat draft (10 menit). 4. Paraf 3 level: Koordinator Akademik, Wadir I, Direktur (masing-masing 5 menit). 5. Surat dikirim ke mahasiswa dan orang tua via email+pos tercatat. Proses sesuai POS 769/PL3.AOT.01.02/2018."  Standar Operasional Prosedur (SOP) Pembuatan Surat Peringatan 3


['faq_classifier_model.pkl']

In [26]:
# FAQ CLASSIFIER AND COSINE SIMILIARITY

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
import joblib

# Load cleaned dataset
df = pd.read_csv('./Dataset_Prosedur_Akademik_labeled.csv', encoding='utf-8-sig')
dfs = pd.read_csv('./Dataset_Prosedur_Akademik_clean.csv', encoding='utf-8-sig')  # Chat history or user questions

questions = df['Question'].tolist()  
classification = df['category'].tolist()

# Split dataset for training/testing
X_train, X_test, y_train, y_test = train_test_split(questions, classification, test_size=0.3, random_state=42)

# Vectorize
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train Logistic Regression
clf = LogisticRegression(class_weight='balanced', max_iter=1000)
clf.fit(X_train_vec, y_train)

# Predict test set for evaluation
y_pred = clf.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Now, classify all chat questions (the input from users)
chat_questions = dfs['Question'].tolist()
chat_vecs = vectorizer.transform(chat_questions)

# Predict categories for chat questions
chat_predicted_categories = clf.predict(chat_vecs)

# Add prediction back to chat df
dfs['predicted_category'] = chat_predicted_categories

# For each unique predicted category, find FAQs in that category and do cosine similarity
top_n = 3  # number of top FAQ matches you want per question
results = []

for i, chat_q in enumerate(chat_questions):
    pred_cat = dfs.loc[i, 'predicted_category']
    
    # Filter FAQ questions only from predicted category
    faq_subset = df[df['category'] == pred_cat]
    if faq_subset.empty:
        continue
    
    faq_vecs = vectorizer.transform(faq_subset['Question'].tolist())
    
    # Vector for current chat question
    chat_vec = vectorizer.transform([chat_q])
    
    # Cosine similarity
    sims = cosine_similarity(chat_vec, faq_vecs).flatten()
    
    # Get top N indices
    top_indices = sims.argsort()[::-1][:top_n]
    
    for idx in top_indices:
        results.append({
            'chat_question': chat_q,
            'predicted_category': pred_cat,
            'matched_faq_question': faq_subset.iloc[idx]['Question'],
            'matched_faq_answer': faq_subset.iloc[idx]['Answer'] if 'Answer' in faq_subset.columns else 'N/A',
            'similarity_score': sims[idx]
        })

# Show results
result_df = pd.DataFrame(results)
pd.set_option('display.max_colwidth', None)
print(result_df.head(10))

# Save model and vectorizer (optional)
# joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
# joblib.dump(clf, 'faq_classifier_model.pkl')


ValueError: Input y contains NaN.

In [26]:
# Multinominal Naive Bayes

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB  # ✅ NEW MODEL
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# Load cleaned dataset
df = pd.read_csv('./Dataset_Prosedur_Akademik_labeled.csv', encoding='utf-8-sig')
dfs = pd.read_csv('./Dataset_Prosedur_Akademik_clean.csv', encoding='utf-8-sig')

questions = df['Question'].tolist()  
classification = df['category'].tolist()

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(questions, classification, test_size=0.3, random_state=42)

# Vectorize
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# ✅ Train Naive Bayes model
clf = MultinomialNB()
clf.fit(X_train_vec, y_train)

# Predict
y_pred = clf.predict(X_test_vec)

# Vectorize the full dataset
X_all_vec = vectorizer.transform(questions)

# Predict on the full dataset
y_all_pred = clf.predict(X_all_vec)

# Add predicted labels as a new column
df['predicted_category'] = y_all_pred

pd.set_option('display.max_colwidth', None)  # Show full content of each column
pd.set_option('display.max_columns', None)   # Show all columns
pd.set_option('display.width', 1000)  
print(df[['Question', 'category', 'predicted_category']].head(20))

# Evaluation
print("🔍 Accuracy:", accuracy_score(y_test, y_pred))
# print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
# print("📉 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Save model and vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(clf, 'faq_classifier_model_nb.pkl')  # Save as different name


                                                                              Question                                                                                category                                                                      predicted_category
0                        Apa langkah pertama dalam pengajuan surat keterangan online?"                    Standar Operasional Prosedur (SOP) Pembuatan Surat Keterangan online                    Standar Operasional Prosedur (SOP) Pembuatan Surat Keterangan online
1                 Bagaimana jika data mahasiswa tidak ditemukan saat pengajuan surat?"                    Standar Operasional Prosedur (SOP) Pembuatan Surat Keterangan online                             Standar Operasional Prosedur (SOP) Perubahan Data Mahasiswa
2                    Apa yang harus dilakukan setelah memilih jenis surat keterangan?"                    Standar Operasional Prosedur (SOP) Pembuatan Surat Keterangan online                             Standar 

['faq_classifier_model_nb.pkl']

In [2]:
# SVM linear

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# Load cleaned dataset
df = pd.read_csv('./Dataset_Prosedur_Akademik_labeled.csv', encoding='utf-8-sig')
questions = df['Question'].tolist()
classification = df['category'].tolist()

print (df.tail())

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(questions, classification, test_size=0.3, random_state=42)

# Vectorize
vectorizer = TfidfVectorizer(ngram_range=(1, 3)) #adding n-grams
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train model using SVM
clf = SVC(kernel='rbf', class_weight='balanced')  # tried with rbf but poor performance
clf.fit(X_train_vec, y_train)

# Predict
y_pred = clf.predict(X_test_vec)

# Vectorize full dataset and predict
X_all_vec = vectorizer.transform(questions)
y_all_pred = clf.predict(X_all_vec)
df['predicted_category'] = y_all_pred

# Display predictions
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
print(df[['Question', 'category', 'predicted_category']].head(20))

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))

# Save model and vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(clf, 'faq_classifier_svm_model.pkl')


                                                                                                                                                                                                                                                                                                                                                                                                                                                            Question  Answer  category
182                                                                                                                                                                                                                Apa itu PNJ?","Politeknik Negeri Jakarta (disingkat sebagai PNJ) adalah salah satu perguruan tinggi negeri politeknik berstatus Badan Layanan Umum (BLU) yang terdapat di areal kampus Universitas Indonesia Depok Jawa Barat Indonesia.""", Umum     NaN       NaN
183                                                       

ValueError: Input y contains NaN.

In [27]:
# CLEANING THE DATASET SINCE THE ORIGINAL ONE COULDN'T READ THE ANSWER COLUNN

import csv
import pandas as pd

data = []


with open('./Dataset_Prosedur_Akademik.csv', 'r', encoding='utf-8-sig') as f:
    next(f)  # skip header line
    for line in f:
        line = line.strip()

        # Remove outer quotes if present
        if line.startswith('"') and line.endswith('"'):
            line = line[1:-1]

        # Fix the separator between columns
        line = line.replace(',""', '","')

        # Do NOT replace all double double-quotes inside the text to avoid breaking content

        # Parse with csv.reader
        row = next(csv.reader([line], delimiter=',', quotechar='"'))

        data.append(row)

# Filter out rows that don't have exactly 2 columns
clean_data = [row for row in data if len(row) == 2]

# Optional: Print any bad rows for debugging
for i, row in enumerate(data):
    if len(row) != 2:
        print(f"Skipping bad row {i}: {row}")

# Create DataFrame
df = pd.DataFrame(clean_data, columns=['Question', 'Answer'])

print(df.tail())

df.to_csv('./Dataset_Prosedur_Akademik_clean.csv', index=False, encoding='utf-8-sig')



                                                                    Question                                                                                                                                                                                                                                                                                                                                                                Answer
177                        Bagaimana prosedur Pembuatan Surat Peringatan 3?"  1. Jurusan mengajukan usulan ke Wadir I setelah 2 surat peringatan sebelumnya. 2. Wadir I setujui dalam 1 hari kerja. 3. Bagian Akademik membuat draft (10 menit). 4. Paraf 3 level: Koordinator Akademik, Wadir I, Direktur (masing-masing 5 menit). 5. Surat dikirim ke mahasiswa dan orang tua via email+pos tercatat. Proses sesuai POS 769/PL3.AOT.01.02/2018."
178                 Mengapa diterbitan Surat Peringatan 3 kepada mahasiswa?"                                                      

In [28]:
# MERGING THE LABELS INTO THE 

import pandas as pd

# Ini dataset clean
df_main = pd.read_csv('Dataset_Prosedur_Akademik_clean.csv', encoding='utf-8-sig')

# print("this is cleaned.csv")
# print(df_main.tail(10))

# Ini kolom label
df_labels = pd.read_csv('labels.csv', encoding='utf-8-sig')

# print("this is labels.csv")
# print(df_labels.tail(10))

# Check dulu label csv beneran 1 kolom apa gimana
if df_labels.shape[1] > 1:
    raise ValueError("labels.csv should have only one column of classifications.")

# Check sama length datanya
assert len(df_main) == len(df_labels), f"Length mismatch: main={len(df_main)}, labels={len(df_labels)}"

# Merge labelnya ke satu file
df_main['category'] = df_labels.iloc[:, 0]

# Save to new file
df_main.to_csv('Dataset_Prosedur_Akademik_labeled.csv', index=False, encoding='utf-8-sig')

print("this is labeled.csv")    
print(df_main.tail(10))

this is labeled.csv
                                                                    Question                                                                                                                                                                                                                                                                                                                                                                Answer                                                                            category
172  Bagaimana prosedur pengambilan Surat Pernah Kuliah dalam bentuk fisik?"                                                                                                                                                                         1. Cetak surat dari sistem. 2. Bawa ke Bagian Kemahasiswaan dengan menunjukkan KTP asli. 3. Staf akan membubuhkan stempel basah dan paraf koordinator. Proses pengambilan maksimal 10 menit."  Standar Operasional Prose

In [56]:
%pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Load the dataset
df = pd.read_csv('./COVID19_FAQ.csv')
df['category'] = [
    "transmission", "symptoms", "symptoms", "transmission", "health_info", "health_info",
    "risk", "risk", "risk", "symptoms", "risk", "risk", "health_info", "health_info",
    "health_info", "health_info", "health_info", "health_info", "transmission", "transmission",
    "transmission", "testing", "health_info", "health_info", "transmission", "transmission",
    "transmission", "policy", "health_info", "prevention", "health_info", "policy", "prevention",
    "policy", "policy", "policy", "policy", "policy", "policy", "transmission", "health_info",
    "health_info", "transmission", "health_info", "transmission", "transmission", "transmission",
    "general_info", "health_info", "transmission", "health_info", "testing", "prevention",
    "testing", "testing", "transmission", "transmission", "transmission", "transmission",
    "transmission", "transmission", "prevention", "health_info", "transmission", "transmission",
    "transmission", "transmission"
]

questions = df['questions'].tolist()
categories = df['category'].tolist()

# TF-IDF Vectorizer (used for both model + cosine)
vectorizer = TfidfVectorizer()
X_all_vec = vectorizer.fit_transform(questions)

# =======================
# ✅ Classifier Approach
# =======================
X_train, X_test, y_train, y_test = train_test_split(questions, categories, test_size=0.3, random_state=42)
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = LogisticRegression(class_weight='balanced', max_iter=1000)
clf.fit(X_train_vec, y_train)

y_pred = clf.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# =======================
# ✅ Cosine Similarity Search
# =======================
def search_similar_question(user_input, questions, question_vectors, threshold=0.3):
    input_vec = vectorizer.transform([user_input])
    cosine_similarities = cosine_similarity(input_vec, question_vectors).flatten()
    top_idx = cosine_similarities.argmax()
    top_score = cosine_similarities[top_idx]

    if top_score >= threshold:
        matched_question = questions[top_idx]
        matched_category = df.iloc[top_idx]['category']
        return {
            'matched_question': matched_question,
            'category': matched_category,
            'similarity': top_score
        }
    else:
        return {
            'matched_question': None,
            'category': None,
            'similarity': top_score
        }

# =======================
# 🚀 Example Usage
# =======================
while True:
    query = input("\n❓ Enter your question (or type 'exit'): ")
    if query.lower() == 'exit':
        break

    result = search_similar_question(query, questions, X_all_vec, threshold=0.3)

    if result['matched_question']:
        print(f"\n✅ Closest match: {result['matched_question']}")
        print(f"📂 Category: {result['category']}")
        print(f"📈 Similarity score: {result['similarity']:.2f}")
    else:
        print("❌ No good match found. Try rephrasing your question.")
