# **Text Preprocessing**

In [None]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from transformers import pipeline
from IPython.display import clear_output
import re
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nltk.tokenize import sent_tokenize, word_tokenize
from googletrans import Translator
# SQLAlchemy engine for the PostgreSQL database "DIY" on localhost:5432.
# NOTE(review): 'Your_Database_Password' looks like a placeholder - replace it
# with the real password before running.
engine = create_engine('postgresql://postgres:Your_Database_Password@localhost:5432/DIY')

In [None]:
# Competitor to process; the name of the SQL table to read is derived from it.
conf_competitor_name = "Grand Lucky"
conf_sql_name = f"Review_Maps_{conf_competitor_name}"

In [None]:
# Load the raw review table named by conf_sql_name.
data = pd.read_sql(conf_sql_name, con=engine)
# Drop rows whose 'date' text contains 'tahun'. `na=True` makes missing dates
# count as a match so they are dropped too, exactly as the former
# `== False` comparison did. `.copy()` yields an independent frame so the
# column assignments below do not raise SettingWithCopyWarning.
df = data[~data['date'].str.contains('tahun', na=True)].copy()
# Remove the words 'bintang' / 'star' from the rating text, then cast to int.
df['rating'] = df['rating'].replace(to_replace=['bintang', 'star'], value="", regex=True)
df['rating'] = df['rating'].astype(int)
# The second value printed is the set of raw (unfiltered) rating values.
print("Total Data : ", df.shape, "\n", data.rating.unique())

In [None]:
# Print the number of distinct stores, then preview the first rows of df.
print(df['store'].nunique())
df.head()

In [None]:
# Create the Sastrawi stemmer instance that stemming() calls.
factory_stemmer = StemmerFactory()
stemmer = factory_stemmer.create_stemmer()

# Extra stop words (informal Indonesian words, abbreviations and chat fillers)
# that are appended to Sastrawi's built-in stop-word list further below.
stop_words_extend = ['abis','tempat', 'ada', 'adalah', 'agak', 'aja', 'amat', 'andaikata', 'aneh', 'anw', 'apa','klo', 
                     'apalagi', 'apapun', 'atas', 'atau', 'awalnya', 'ayo', 'bagai', 'bagaimana', 'bagaimanapun', 
                     'bahkan', 'bakal', 'balik', 'banyak', 'baru', 'bawah', 'beberapa', 'begitu', 'belum', 'biasa', 
                     'biasanya', 'bikin', 'bisakah', 'boleh', 'bro', 'bsok', 'btw', 'bu', 'buat', 'bukan', 'bukannya', 
                     'cmn', 'coba', 'cukup', 'dah', 'dahulu', 'dalam', 'dan', 'dari', 'deh', 'dekat', 'dengan', 'di', 
                     'dia', 'dikit', 'dimana', 'diri', 'dirinya', 'dl', 'dlm', 'doang', 'dong', 'dr', 'dua', 'eh', 
                     'engga', 'ga', 'gak', 'gimana', 'gini', 'gitu', 'gmn', 'gt', 'gua', 'haha', 'hai', 'halo', 'hampir', 
                     'hanya', 'hari', 'hehe', 'hihi', 'hingga', 'ini', 'itu', 'iya', 'jadi', 'jadinya', 'jalan', 'jangan', 
                     'jauh', 'jd', 'jelas', 'jg', 'juga', 'kalau', 'kalian', 'kalo', 'kami', 'kan', 'kapan', 'karena', 'kau', 
                     'kayak', 'kayaknya', 'ke', 'kecil', 'keluar', 'kemana', 'kembali', 'kemudian', 'kenapa', 'kini', 'kita', 
                     'kl', 'kmrn', 'kok', 'kondisi', 'krn', 'kurang', 'lagi', 'lah', 'lain', 'lalu', 'lbh', 'lmao', 'loh', 'lol', 
                     'makanya', 'makasih', 'mana', 'mas', 'masih', 'mbak', 'melalui', 'memang', 'mereka', 'meski', 'mesti', 'minta', 
                     'misalnya', 'moga', 'mohon', 'msh', 'mudah', 'mulai', 'nah', 'namun', 'nanti', 'ngapain', 'ngerti', 'nggak', 
                     'nih', 'nya', 'nyata', 'oke', 'oleh', 'pada', 'padanya', 'pak', 'paling', 'pastinya', 'per', 'pernah', 'plg', 
                     'plis', 'pls', 'pula', 'pun', 'rupanya', 'saat', 'saja', 'saling', 'sama', 'sampai', 'sampe', 'sangat', 'satu', 
                     'saya', 'sdh', 'sebab', 'sebagian', 'sebaiknya', 'sebelum', 'sebelumnya', 'seberapa', 'sebuah', 'sedangkan', 
                     'sedikit', 'segera', 'seharusnya', 'sehingga', 'sekali', 'selain', 'selalu', 'selama', 'seluruh', 'semacam', 
                     'semakin', 'sementara', 'semua', 'sendiri', 'seperti', 'sering', 'serta', 'sesuai', 'setelah', 'setiap', 'siap', 
                     'siapa', 'sini', 'sis', 'situ', 'skrg', 'sm', 'smg', 'soalnya', 'sudah', 'supaya', 'sy', 'tadi', 'tanpa', 'tau', 
                     'tdk', 'tentang', 'tentu', 'terkadang', 'terlalu', 'termasuk', 'terus', 'tetap', 'thx', 'tidak', 'toh', 'tp', 'tq', 
                     'trs', 'tuh', 'udah', 'umumnya', 'untuk', 'walau', 'walaupun', 'wkwk', 'ya', 'yaitu', 'yakin', 'yang', 'yuk']

# StopWordRemoverFactory.create_stop_word_remover() rebuilds its dictionary
# from get_stop_words() on every call, so assigning `factory.stop_words` has
# no effect and the custom words were silently ignored. Build the remover
# directly from the merged list instead.
from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary
from Sastrawi.StopWordRemover.StopWordRemover import StopWordRemover

factory_stopword = StopWordRemoverFactory()
stop_words = factory_stopword.get_stop_words()
stop_words.extend(stop_words_extend)
# Drop duplicates (the two lists overlap) while keeping the original order.
stop_words = list(dict.fromkeys(stop_words))
# Kept so code that reads this attribute keeps working; Sastrawi ignores it.
factory_stopword.stop_words = stop_words
stopword_remover = StopWordRemover(ArrayDictionary(stop_words))

In [None]:
inc = 0  # running row counter; text_preprocessing_process increments and prints it


def casefolding(text):
  """Lower-case *text* and strip URLs, numbers and punctuation.

  Punctuation (underscore included) is replaced by a space rather than
  deleted, so words separated only by punctuation ("bagus,murah") stay
  separate tokens. Whitespace runs are collapsed to single spaces.

  Args:
    text: the raw review string.

  Returns:
    The cleaned, lower-cased string with single spaces between tokens.
  """
  text = text.lower()
  text = re.sub(r'https?://\S+|www\.\S+', '', text)  # remove URLs
  text = re.sub(r'[-+]?[0-9]+', '', text)            # remove (signed) numbers
  # `\w` matches '_' as a word character, so it has to be listed explicitly.
  text = re.sub(r'[^\w\s]|_', ' ', text)
  return ' '.join(text.split())

from functools import lru_cache

KEY_NORM_URL = 'https://raw.githubusercontent.com/ksnugroho/klasifikasi-spam-sms/master/data/key_norm.csv'


@lru_cache(maxsize=1)
def _load_key_norm():
  """Download the normalisation table once and return it as a dict.

  The CSV maps its 'singkat' column to its 'hasil' column. Caching avoids
  re-downloading the file for every review.
  """
  table = pd.read_csv(KEY_NORM_URL)
  # Keep the first row per key, matching the former `.values[0]` lookup.
  table = table.drop_duplicates(subset='singkat', keep='first')
  return dict(zip(table['singkat'], table['hasil']))


def text_normalize(text, key_norm=None):
  """Replace every word found in the normalisation table and lower-case.

  Args:
    text: whitespace-separated text to normalise.
    key_norm: optional mapping {word: replacement}. When omitted, the table
      at KEY_NORM_URL is downloaded on first use and reused afterwards.

  Returns:
    The normalised, lower-cased text joined with single spaces.
  """
  mapping = _load_key_norm() if key_norm is None else key_norm
  words = []
  for word in text.split():
    replacement = mapping.get(word, word)
    # Ignore non-string replacements (e.g. an empty CSV cell read as NaN).
    words.append(replacement if isinstance(replacement, str) else word)
  return ' '.join(words).lower()

def remove_stop_words(text):
  """Return *text* after passing it through the module-level ``stopword_remover``."""
  return stopword_remover.remove(text)

def stemming(text):
  """Return *text* stemmed by the module-level Sastrawi ``stemmer``."""
  return stemmer.stem(text)

def text_preprocessing_process(text):
  """Run the full cleaning pipeline on one review.

  Steps: casefolding -> text_normalize -> remove_stop_words -> stemming.
  The global counter ``inc`` is incremented and printed as progress output.

  Args:
    text: the raw review text.

  Returns:
    The cleaned text, or "" when any step fails (the failure is logged).
  """
  import logging

  global inc
  inc += 1
  try:
    print(inc)
    cleaned = casefolding(text)
    cleaned = text_normalize(cleaned)
    cleaned = remove_stop_words(cleaned)
    cleaned = stemming(cleaned)
    clear_output()
    return cleaned
  except Exception as exc:
    # `Exception` (not a bare except) lets KeyboardInterrupt stop a long run,
    # and the log line makes failed rows visible instead of silently empty.
    logging.getLogger(__name__).warning("Preprocessing failed for row %s: %r", inc, exc)
    return ""
# Clean every review into 'clean_text' (author's note: a run took 14m 48s).
df['clean_text'] = df['review'].apply(text_preprocessing_process)

In [None]:
# Keep only the listed columns, in this order, then preview three rows.
new_order_col = [
    'competitor', 'store', 'Province', 'Region',
    'review', 'clean_text', 'reviewer_name', 'date', 'rating',
]
df = df.loc[:, new_order_col]
df.head(3)

# **Sentiment Generate**

In [None]:
from transformers import pipeline

# Hugging Face model used for sentiment classification; the same checkpoint
# name is passed for both the model and its tokenizer.
pretrained_name = "w11wo/indonesian-roberta-base-sentiment-classifier"

# Sentiment-analysis pipeline called by sentiment_results().
predictor = pipeline(
    "sentiment-analysis",
    model=pretrained_name,
    tokenizer=pretrained_name
)
counter = 0  # running row counter, printed as progress output


def sentiment_results(text_review, neutral_threshold=0.89):
    """Classify the sentiment of one review with the global ``predictor``.

    Args:
        text_review: the raw review text.
        neutral_threshold: predictions whose score is at or below this value
            are reported as 'neutral' instead of the model's label.

    Returns:
        A ``(label, score)`` tuple, or ``("", 0)`` when the input is not a
        string or the prediction fails.
    """
    import logging

    global counter
    counter += 1
    if not isinstance(text_review, str):
        # Missing reviews (None / NaN) cannot be classified.
        return "", 0
    try:
        print(counter)
        text = text_review.replace("\n", " ")
        # Pass the newline-free text (the original passed the raw review and
        # left `text` unused). truncation=True keeps over-long reviews within
        # the model's maximum input length instead of failing on them.
        results = predictor(text, truncation=True)
        score = results[0]['score']
        labels = 'neutral' if score <= neutral_threshold else results[0]['label']
        clear_output()
        return labels, score
    except Exception as exc:
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the run.
        logging.getLogger(__name__).warning("Sentiment failed for row %s: %r", counter, exc)
        return "", 0
# Run sentiment_results on every raw review; each (label, score) pair is
# expanded through pd.Series into the 'sentiment' and 'Conf' columns.
df[['sentiment','Conf']] = df['review'].apply(lambda x : pd.Series(sentiment_results(x)))
print("Done generated sentiment",df.shape)

# **Save File**

In [None]:
from datetime import datetime
# Timestamp of this run, formatted so it can be used inside a file name.
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Append the competitor name and the date/time to the file name, then write
# the DataFrame to that CSV without the index column.
filename_version = f"Sentiment_{conf_competitor_name}_{current_time}.csv"
df.to_csv(filename_version, index=False)

# **Text Generation**

- token="<YOUR_HF_TOKEN>" (do not commit a real Hugging Face access token; revoke the token that was previously pasted here)
- **untuk model CohereForAI/aya-23-8B dan meta-llama/Meta-Llama-3-8B**

### **google/flan-t5-large**

In [None]:
from transformers import pipeline
from googletrans import Translator
# googletrans client used by topic_predictor().
translator = Translator()
model_name_4 = "google/flan-t5-large" # best-performing model so far
# Text-to-text generation pipeline; note this rebinds the name `predictor`.
predictor = pipeline('text2text-generation', model=model_name_4)

def topic_predictor(text_review):
    """Translate a review from Indonesian to English and ask the model for one topic.

    Uses the module-level ``translator`` and ``predictor``. The translated
    text and the generated answer are printed, and the answer is returned.
    """
    translation = translator.translate(text_review, src='id', dest='en')
    text_review = translation.text
    prompt = f"""
    Based on the text: "{text_review}".
    Specify only one of the topics : [No Issue, Service Issue, Payment Issue, Store Environment Issue, Price Issue, Product Issue]
    """
    generations = predictor(prompt, num_return_sequences=1)
    topic = generations[0]['generated_text']
    print(text_review, '\n', topic)
    return topic
# Predict (and print) a topic for every review without storing the result.
# To keep the labels, assign them instead:
# df['Labels'] = df['review'].apply(lambda x : topic_predictor(x))
df['review'].apply(topic_predictor)

### **deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B**
- Deepseek : https://dev.to/nodeshiftcloud/a-step-by-step-guide-to-install-deepseek-r1-locally-with-ollama-vllm-or-transformers-44a1

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Load the model and tokenizer
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
# Text-generation pipeline built from the objects above; rebinds `predictor`.
predictor = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
# Example prompt
prompt = """Classify Positive / Negative from this sentences : 
'buruk pesan online jam pagi jam sore konfirmasi boro kirim dicall'
"""
# Generate text (at most 50 new tokens, without echoing the prompt)
results = predictor(prompt, max_new_tokens=50, return_full_text=False)
# Show the result
print(results[0]['generated_text'])


### **Qwen/Qwen2.5-1.5B-Instruct**
from alibaba : https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct

# **Zero-Shot Classification**
**Model References**


<img src="https://raw.githubusercontent.com/MoritzLaurer/zeroshot-classifier/main/v2_synthetic_data/results/zeroshot-v2.0-aggreg.png" width="500" />


https://huggingface.co/facebook

**Select Model**

In [None]:
# Load the test workbook from a local path and reset the index to 0..n-1.
df = pd.read_excel(r"S:\Web Scrap\Sentiment Analysist\Testing_Grand_Lucky.xlsx").reset_index(drop=True)
# Uncomment to work on a reproducible 10-row sample instead of the full file.
# df = df.sample(10,random_state=3).reset_index(drop=True)
print(df.shape)
df.head(2)

In [None]:
# Two zero-shot classifiers: one fed the English translation of a review, one
# fed the original Indonesian text.
classifier_english = pipeline("zero-shot-classification", model='MoritzLaurer/deberta-v3-large-zeroshot-v2.0',device=0)# Single-language model (English only)
classifier_indo = pipeline("zero-shot-classification", model='MoritzLaurer/bge-m3-zeroshot-v2.0',device_map="auto")# Multilingual model (includes Indonesian)
# Base candidate labels.
list_labels = ["Good Sentiment", "Bad Staff Services", "Payment Issue", "Bad Store Environment", "Price Issue", "Bad Product"]


In [None]:
# Experimental, finer-grained candidate labels for zero-shot classification.
list_labels_try = [
    "Good Sentiment",
    "Good Staff Services",
    "Bad Staff Services",
    "Bad Payment Procedure",
    "Good Payment Procedure",
    "Bad Store Environment",
    "Good Store Environment",
    "Low Price",
    "High Price",
    "Bad Product Quality",
    "Good Product Quality",
]

In [None]:
counter = 0  # number of reviews passed to generate_text_classification so far


def _resolve_label(label_en, conf_en, label_id, conf_id, review_rating, threshold):
    """Combine the English and Indonesian top predictions into one label.

    Always returns a string: a label, a "Check_<label>" marker, or "".
    """
    # Pick the more confident prediction; ties go to the English result.
    if conf_en >= conf_id:
        best_label, best_conf = label_en, conf_en
    else:
        best_label, best_conf = label_id, conf_id
    if best_conf >= threshold:
        return best_label

    # Neither model is confident enough.
    print("review_rating ", review_rating)
    if label_en != label_id:
        # The models disagree: flag the stronger guess for manual checking.
        return f"Check_{best_label}"
    # The models agree: accept only when the star rating points the same way.
    is_good = label_en == "Good Sentiment"
    if (review_rating <= 3 and not is_good) or (review_rating >= 4 and is_good):
        return label_en
    return ""


def generate_text_classification(classifier_indo, classifier_english, list_labels, review_text, review_rating,
                                 translator=None, threshold=0.7):
    """Label one review using an Indonesian and an English zero-shot classifier.

    The review is translated to English for ``classifier_english`` and passed
    unchanged to ``classifier_indo``. Both are also asked for a
    Positive/Negative/Neutral sentiment, which is only printed.

    Args:
        classifier_indo: callable ``(text, candidate_labels=...)`` returning a
            dict with 'labels' and 'scores', applied to the original text.
        classifier_english: same interface, applied to the translated text.
        list_labels: candidate topic labels.
        review_text: the review text in Indonesian.
        review_rating: numeric star rating, used when confidence is low.
        translator: optional object exposing ``translate(text, src=, dest=)``
            whose result has a ``.text`` attribute; a new ``Translator()`` is
            created when omitted.
        threshold: minimum score for a prediction to be accepted outright.

    Returns:
        The chosen label, ``"Check_<label>"`` when the models disagree with
        low confidence, or ``""`` when no label can be decided or any step
        fails.
    """
    global counter
    counter += 1
    sentiment_labels = ["Positive", "Negative", "Neutral"]
    try:
        if translator is None:
            translator = Translator()
        review_translation = translator.translate(review_text, src="id", dest="en").text

        result_en = classifier_english(review_translation, candidate_labels=list_labels)
        top_label_en = result_en['labels'][0]
        top_conf_en = result_en['scores'][0]

        result_id = classifier_indo(review_text, candidate_labels=list_labels)
        top_label_id = result_id['labels'][0]
        top_conf_id = result_id['scores'][0]

        sentiment_id = classifier_indo(review_text, candidate_labels=sentiment_labels)
        top_sentiment_id = sentiment_id['labels'][0]
        top_conf_sentiment_id = sentiment_id['scores'][0]

        sentiment_en = classifier_english(review_translation, candidate_labels=sentiment_labels)
        top_sentiment_en = sentiment_en['labels'][0]
        top_conf_sentiment_en = sentiment_en['scores'][0]

        print(review_text)
        print("en ", top_label_en, ":", top_conf_en)
        print("en ", top_sentiment_en, ":", top_conf_sentiment_en)
        print("id ", top_label_id, ":", top_conf_id)
        print("id ", top_sentiment_id, ":", top_conf_sentiment_id)

        return _resolve_label(top_label_en, top_conf_en, top_label_id, top_conf_id,
                              review_rating, threshold)
    except Exception as exc:
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the run;
        # the message makes the failed row visible.
        print(f"generate_text_classification failed on review #{counter}: {exc!r}")
        return ""

# Label every row with the zero-shot classifiers, using the extended label list.
df['Labels'] = df.apply(
    lambda row: generate_text_classification(
        classifier_indo, classifier_english, list_labels_try, row['review'], row['rating']
    ),
    axis=1,
)
# df.to_csv(f"TESTING_{conf_competitor_name}.csv", index=False)
df

# **Test Hugging Face API**
**Token dapat dibuat di Hugging Face**

In [None]:
# Competitor to process; the name of the SQL table to read is derived from it.
conf_competitor_name = "Grand Lucky"
conf_sql_name = f"Review_Maps_{conf_competitor_name}"

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from transformers import pipeline
from IPython.display import clear_output
import requests
import re

import os
from urllib.parse import quote_plus

# Real credentials must not live in the notebook: read the password from the
# DIY_DB_PASSWORD environment variable, falling back to a placeholder.
# quote_plus keeps special characters from breaking the connection URL.
db_password = quote_plus(os.environ.get("DIY_DB_PASSWORD", "Your_Database_Password"))
engine = create_engine(f'postgresql://postgres:{db_password}@localhost:5432/DIY')

data = pd.read_sql(conf_sql_name, con=engine)
# Drop rows whose 'date' text contains 'tahun'; na=True also drops rows with
# a missing date, as the former `== False` comparison did.
not_tahun = ~data['date'].str.contains('tahun', na=True)
# Keep reviews of more than four words; missing reviews count as too short
# instead of raising inside a lambda.
long_enough = data['review'].str.split().str.len() > 4
candidates = data[not_tahun & long_enough]
# Sample at most 7 rows so a small table does not raise in sample().
df = candidates.sample(min(7, len(candidates)), random_state=19).reset_index(drop=True)

print("Total Data : ", df.shape, "\n", data.rating.unique())

In [None]:
# Hosted models to query; only `model` is used to build API_URL below.
model= "mistralai/Mixtral-8x7B-Instruct-v0.1"
model_2 = "meta-llama/Meta-Llama-3-8B-Instruct"
# NOTE(review): "api_token" looks like a placeholder - supply a real token.
API_TOKEN = "api_token" 
API_URL = f"https://api-inference.huggingface.co/models/{model}"
# Bearer-token header sent with every request.
headers = {"Authorization": f"Bearer {API_TOKEN}"}
# data = {"inputs": prompt}


def pipeline_labeling_by_api(text_review, timeout=60):
    """Ask the hosted model at ``API_URL`` to label one review.

    Args:
        text_review: the review text inserted into the prompt.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The first known label found in the model's answer (as written by the
        model), or "Error" when the request fails, the response is not the
        expected JSON, or no known label appears in the answer.
    """
    # Alternative label set kept for reference:
    # pattern = r'\b(no issue|service issue|payment issue|store environment issue|price issue|product issue)\b'
    # prompt = f"""Berdasarkan text : "{text_review}" .Tentukan hanya SATU dari sentiment : [No Issue, Service Issue, Payment Issue, Store Environment Issue, Price Issue, Product Issue]. Hanya tulis label saja:"""

    pattern = r'\b(good sentiment|bad service|payment issue|bad store environment|price issue|bad product)\b'
    prompt = f"""Berdasarkan text : "{text_review}" .Tentukan hanya SATU dari sentiment : [good sentiment, bad service, payment issue, bad store environment, price issue, bad product]. Hanya tulis label saja:"""
    payload = {"inputs": prompt}

    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error: request failed - {exc}")
        return "Error"

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return "Error"

    try:
        generated = response.json()[0]["generated_text"]
    except (ValueError, KeyError, IndexError, TypeError) as exc:
        print(f"Error: unexpected response format - {exc!r}")
        return "Error"

    # The prompt ends with "label saja:", so the answer is whatever follows it.
    answer = generated.split("label saja:")[-1].strip()
    match = re.search(pattern, answer, re.IGNORECASE)
    if match is None:
        # re.search returns None when the answer names no known label; the
        # original called .group() on it and crashed.
        print(f"Error: no known label in model output: {answer!r}")
        return "Error"
    return match.group()

# Smoke test: label a single hand-written review through the API.
text_review = "Tokonya panas sekali tidak nyaman"
pipeline_labeling_by_api(text_review)

# Uncomment to label every sampled review and show the DataFrame.
# df['Labels'] = df['review'].apply( lambda revw : pipeline_labeling_by_api(revw))
# df

