<a href="https://colab.research.google.com/github/MaiKhoa0101/MachineLearning/blob/main/synonym.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q flask flask-ngrok sentence-transformers scikit-learn

In [None]:
pip install pyngrok



In [None]:
pip install flask-cors



In [None]:
import json
from flask import Flask, request, jsonify
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Read the vocabulary records from disk.
with open("data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Group URLs under their normalized (lower-cased, stripped) "gross" label so
# that exact matches can be answered with a plain dictionary lookup.
gross_to_url = {}
for item in data:
    gross = item["gross"].lower().strip()
    gross_to_url.setdefault(gross, []).append(item["url"])

# Distinct labels in first-seen order; these are the strings that get embedded.
texts = list(gross_to_url)

# Multilingual sentence-embedding model.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Embed the whole vocabulary once, up front.
print("üîÑ ƒêang encode embeddings...")
embeddings = model.encode(texts, batch_size=32, show_progress_bar=True)
print(f"‚úÖ ƒê√£ load {len(texts)} t·ª´ v·ª±ng")

app = Flask(__name__)

# Configuration
# Minimum cosine similarity (0.80, i.e. 80%) for a fuzzy match to be accepted.
SIMILARITY_THRESHOLD = 0.80
# TODO: video-length metadata still has to be added to the JSON data before
# the shortest video can actually be selected.
def find_shortest_video(urls):
    """Pick the single URL to return for a matched word.

    Placeholder implementation: no duration metadata is available yet, so the
    first URL is returned rather than the shortest video.

    Args:
        urls: Sequence of candidate video URLs.

    Returns:
        The first URL, or None when ``urls`` is empty.
    """
    if not urls:
        return None
    return urls[0]

def find_synonym(word, threshold=SIMILARITY_THRESHOLD):
    """Find the vocabulary entry that best matches ``word``.

    An exact match (after lower-casing and stripping) is preferred. Otherwise
    the entry with the highest cosine similarity is used, provided that the
    similarity reaches ``threshold``.

    Args:
        word: Word or phrase to look up.
        threshold: Minimum cosine similarity required for a fuzzy match.

    Returns:
        A tuple ``(synonym, accuracy, urls)`` where ``accuracy`` is a
        percentage rounded to two decimals (100.0 for an exact match), or
        None when nothing matches, when ``word`` is not a non-blank string,
        or when the vocabulary is empty.
    """
    # Request payloads are not type-checked everywhere; treat anything that
    # is not a string as "no match" instead of failing on .lower().
    if not isinstance(word, str):
        return None

    word = word.lower().strip()
    if not word:
        return None

    # Exact match first: no need to run the model.
    if word in gross_to_url:
        return (word, 100.0, gross_to_url[word])

    # With an empty vocabulary there is nothing to compare against.
    if not texts:
        return None

    word_embedding = model.encode([word])
    similarities = cosine_similarity(word_embedding, embeddings)[0]

    best_idx = int(np.argmax(similarities))
    best_score = similarities[best_idx]

    if best_score >= threshold:
        best_match = texts[best_idx]
        return (best_match, round(float(best_score * 100), 2), gross_to_url[best_match])

    return None

@app.route("/translate", methods=["POST"])
def translate():
    """Normalize a sentence word by word using the synonym vocabulary.

    Request JSON: ``{"sentence": "Tao xơi cơm rất ngon"}``.

    Response JSON: the original sentence, the normalized sentence, per-word
    details (original, synonym, accuracy, urls) for every word that matched,
    the words that were left unchanged, and the corresponding counts.

    Responds with HTTP 400 when the body is not a JSON object, has no
    ``sentence`` key, or ``sentence`` is not a string.
    """
    body = request.get_json()

    # The body may be any JSON value; only an object can carry "sentence".
    if not isinstance(body, dict) or "sentence" not in body:
        return jsonify({"error": "Missing 'sentence' in request body"}), 400

    sentence = body["sentence"]
    if not isinstance(sentence, str):
        return jsonify({"error": "'sentence' must be a string"}), 400

    words = sentence.split()

    result_words = []
    word_details = []
    skipped_words = []

    for word in words:
        synonym_result = find_synonym(word)

        if not synonym_result:
            # No match: keep the original word in the output sentence.
            result_words.append(word)
            skipped_words.append(word)
            continue

        synonym, accuracy, urls = synonym_result
        result_words.append(synonym)
        word_details.append({
            "original": word,
            "synonym": synonym,
            "accuracy": accuracy,
            "urls": urls
        })

    return jsonify({
        "original_sentence": sentence,
        "translated_sentence": " ".join(result_words),
        "word_details": word_details,
        "skipped_words": skipped_words,
        "total_words": len(words),
        "translated_words": len(word_details),
        "skipped_count": len(skipped_words)
    })

@app.route("/search", methods=["POST"])
def search():
    """Look up the best synonym for each query word.

    Request JSON: ``{"queries": ["tao", "xơi", "cơm"]}``. A single string is
    also accepted and treated as a one-element list.

    Response JSON: ``total_queries``, ``threshold`` (as a percentage) and
    ``results``, a mapping from each query to either
    ``{"found": True, "synonym", "accuracy", "url"}`` (a single URL per
    query) or ``{"found": False, "message"}``.

    Responds with HTTP 400 when the body is not a JSON object, ``queries`` is
    missing or empty, or any query is not a string.
    """
    body = request.get_json()

    # The body may be any JSON value (null, list, ...); only an object has
    # a "queries" field to read.
    if not isinstance(body, dict):
        return jsonify({"error": "'queries' must be a non-empty array"}), 400

    queries = body.get("queries", [])

    if isinstance(queries, str):
        queries = [queries]

    if not isinstance(queries, list) or not queries:
        return jsonify({"error": "'queries' must be a non-empty array"}), 400

    # Queries are used both as lookup text and as result keys, so they must
    # all be strings.
    if not all(isinstance(query, str) for query in queries):
        return jsonify({"error": "'queries' must contain only strings"}), 400

    results = {}

    for query in queries:
        synonym_result = find_synonym(query)

        if synonym_result:
            synonym, accuracy, urls = synonym_result

            # Only one URL is returned per query; the full URL list is
            # intentionally left out of the response.
            results[query] = {
                "found": True,
                "synonym": synonym,
                "accuracy": accuracy,
                "url": find_shortest_video(urls)
            }
        else:
            results[query] = {
                "found": False,
                "message": f"No synonym found with accuracy >= {SIMILARITY_THRESHOLD * 100}%"
            }

    return jsonify({
        "total_queries": len(queries),
        "threshold": SIMILARITY_THRESHOLD * 100,
        "results": results
    })

@app.route("/health", methods=["GET"])
def health():
    """Report service status, model name, vocabulary size and threshold (%)."""
    status = {
        "status": "healthy",
        "model": "paraphrase-multilingual-MiniLM-L12-v2",
        "vocabulary_size": len(texts),
        "threshold": SIMILARITY_THRESHOLD * 100,
    }
    return jsonify(status)

if __name__ == '__main__':
    import os

    from pyngrok import ngrok

    # Expose the local server through a public ngrok URL.
    # NOTE(review): the fallback token below is a credential committed to
    # source control. It should be revoked and supplied only through the
    # NGROK_AUTHTOKEN environment variable; the fallback is kept solely so
    # the notebook keeps running unchanged.
    ngrok.set_auth_token(
        os.environ.get("NGROK_AUTHTOKEN")
        or "372B0cBmcE6XQkk7Y0JSSGdlw2P_3GqNZtELf3VTxdGDvaPMV"
    )

    # ngrok.connect() returns an NgrokTunnel object; formatting that object
    # directly prints 'NgrokTunnel: "..." -> "..."' and yields unusable
    # endpoint URLs, so take the plain URL string instead.
    public_url = ngrok.connect(5500).public_url

    print(f"\n{'='*60}")
    print(f"üöÄ Flask server running at: {public_url}")
    print(f"{'='*60}\n")
    print(f"üìä Vocabulary size: {len(texts)}")
    print(f"üéØ Similarity threshold: {SIMILARITY_THRESHOLD * 100}%")
    print(f"\nüìç Endpoints:")
    print(f"  POST {public_url}/translate - D·ªãch c√¢u")
    print(f"  POST {public_url}/search    - T√¨m t·ª´ ƒë·ªìng nghƒ©a")
    print(f"  GET  {public_url}/health    - Health check")
    print(f"\nüí° Example request:")
    print(f'  curl -X POST {public_url}/translate \\')
    print(f'    -H "Content-Type: application/json" \\')
    print(f'    -d \'{{"sentence": "Tao x∆°i c∆°m r·∫•t ngon"}}\'')
    print(f"\n{'='*60}\n")

    app.run(host='0.0.0.0', port=5500, debug=False)

üîÑ ƒêang encode embeddings...


Batches:   0%|          | 0/104 [00:00<?, ?it/s]

‚úÖ ƒê√£ load 3318 t·ª´ v·ª±ng

üöÄ Flask server running at: NgrokTunnel: "https://demoded-lourie-unpoulticed.ngrok-free.dev" -> "http://localhost:5500"

üìä Vocabulary size: 3318
üéØ Similarity threshold: 80.0%

üìç Endpoints:
  POST NgrokTunnel: "https://demoded-lourie-unpoulticed.ngrok-free.dev" -> "http://localhost:5500"/translate - D·ªãch c√¢u
  POST NgrokTunnel: "https://demoded-lourie-unpoulticed.ngrok-free.dev" -> "http://localhost:5500"/search    - T√¨m t·ª´ ƒë·ªìng nghƒ©a
  GET  NgrokTunnel: "https://demoded-lourie-unpoulticed.ngrok-free.dev" -> "http://localhost:5500"/health    - Health check

üí° Example request:
  curl -X POST NgrokTunnel: "https://demoded-lourie-unpoulticed.ngrok-free.dev" -> "http://localhost:5500"/translate \
    -H "Content-Type: application/json" \
    -d '{"sentence": "Tao x∆°i c∆°m r·∫•t ngon"}'


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5500
 * Running on http://172.28.0.12:5500
INFO:werkzeug:[33mPress CTRL+C to quit[0m


**Model chuyên tiếng Việt**

In [None]:
!pip install -q sentence-transformers transformers torch scikit-learn

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Vietnamese-specific sentence-embedding model.
# NOTE(review): this model's card appears to recommend word-segmented input
# (e.g. via pyvi) before encoding — confirm whether raw text is acceptable.
model = SentenceTransformer("VoVanPhuc/sup-SimCSE-VietNamese-phobert-base")


In [None]:
# One label per record, in the same order as `data`.
texts = [record["gross"] for record in data]

# Embed every label with the currently loaded model.
embeddings = model.encode(texts, show_progress_bar=True)

print("‚úÖ ƒê√£ t·∫°o embedding ti·∫øng Vi·ªát")


In [None]:
def vietnamese_semantic_search(
    query,
    data,
    embeddings,
    threshold=0.8
):
    """Return every record whose label is similar enough to ``query``.

    Args:
        query: Text to search for.
        data: Records with ``"gross"`` and ``"url"`` keys; paired with the
            rows of ``embeddings`` by position.
        embeddings: Embeddings to compare the query against, one per record.
        threshold: Minimum cosine similarity for a record to be included.

    Returns:
        A list of ``{"gross", "url", "accuracy"}`` dicts sorted by accuracy,
        highest first. ``accuracy`` is a plain Python float percentage
        rounded to two decimals.
    """
    query_embedding = model.encode([query])

    similarities = cosine_similarity(
        query_embedding,
        embeddings
    )[0]

    results = [
        {
            "gross": item["gross"],
            "url": item["url"],
            # Convert the NumPy scalar to a built-in float so the value is
            # JSON-serializable and matches the other search helpers.
            "accuracy": round(float(score * 100), 2)
        }
        for item, score in zip(data, similarities)
        if score >= threshold
    ]

    results.sort(key=lambda x: x["accuracy"], reverse=True)
    return results


In [None]:
# Demo lookup against the Vietnamese model.
query = "phuÃ£ huynh"

results = vietnamese_semantic_search(query, data, embeddings)

if results:
    # Matches are listed as "<accuracy>% -> <label> -> <url>".
    print(f"K·∫øt qu·∫£ cho t·ª´ '{query}':\n")
    for r in results:
        print(f"{r['accuracy']}% ‚Üí {r['gross']} ‚Üí {r['url']}")
else:
    print("‚ùå Kh√¥ng c√≥ k·∫øt qu·∫£ ‚â• 80%")


In [None]:
import json
from flask import Flask, request, jsonify
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Read the records that will be searched.
with open("data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# One label per record, in the same order as `data`.
texts = [record["gross"] for record in data]

# Vietnamese-specific sentence-embedding model.
model = SentenceTransformer("VoVanPhuc/sup-SimCSE-VietNamese-phobert-base")

# Embed every label once at start-up.
embeddings = model.encode(texts)

app = Flask(__name__)

@app.route("/search", methods=["POST"])
def search():
    """Return the single best match for a query, if it is similar enough.

    Request JSON: ``{"query": "<text>"}``.

    Response JSON: ``query`` and ``result``, a list holding one
    ``{"gross", "url", "accuracy"}`` entry when the best cosine similarity is
    at least 0.85, otherwise an empty list plus a ``message``.

    Responds with HTTP 400 when the body is not a JSON object, has no
    ``query`` key, or ``query`` is not a string.
    """
    body = request.get_json()

    # The body may be any JSON value; only an object can carry "query".
    if not isinstance(body, dict) or "query" not in body:
        return jsonify({
            "error": "Missing 'query' in request body"
        }), 400

    query = body["query"]
    if not isinstance(query, str):
        return jsonify({
            "error": "'query' must be a string"
        }), 400

    query_embedding = model.encode([query])

    similarities = cosine_similarity(
        query_embedding,
        embeddings
    )[0]

    # Index and score of the most similar record.
    best_match_index = int(similarities.argmax())
    best_score = similarities[best_match_index]

    # Only return the best match when its similarity is at least 85%.
    if best_score >= 0.85:
        best_match = data[best_match_index]
        result = {
            "gross": best_match["gross"],
            "url": best_match["url"],
            "accuracy": round(float(best_score * 100), 2)
        }

        return jsonify({
            "query": query,
            "result": [result]
        })

    return jsonify({
        "query": query,
        "result": [],
        "message": "No match found with accuracy >= 85%"
    })

if __name__ == '__main__':
    import os

    from pyngrok import ngrok

    # Expose the local server through a public ngrok URL.
    # NOTE(review): the fallback token below is a credential committed to
    # source control. It should be revoked and supplied only through the
    # NGROK_AUTHTOKEN environment variable; the fallback is kept solely so
    # the notebook keeps running unchanged.
    ngrok.set_auth_token(
        os.environ.get("NGROK_AUTHTOKEN")
        or "372B0cBmcE6XQkk7Y0JSSGdlw2P_3GqNZtELf3VTxdGDvaPMV"
    )

    # ngrok.connect() returns an NgrokTunnel object; print its plain URL
    # string rather than the object's representation.
    public_url = ngrok.connect(5500).public_url
    print(f"Flask server running at: {public_url}")

    app.run(host='0.0.0.0', port=5500, debug=False)

# Mục mới

In [None]:
import json
from flask import Flask, request, jsonify
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Read the records that will be searched.
with open("data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# One label per record, in the same order as `data`.
texts = [record["gross"] for record in data]

# Multilingual sentence-embedding model.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Embed every label once at start-up.
print("Encoding data embeddings...")
embeddings = model.encode(texts, batch_size=32, show_progress_bar=True)
print(f"Loaded {len(data)} items")

app = Flask(__name__)

# Configuration
SIMILARITY_THRESHOLD = 0.80  # Minimum similarity threshold
MAX_RESULTS = 100  # Intended maximum number of results to return
BATCH_SIZE = 32  # Batch size for encoding

@app.route("/search", methods=["POST"])
def search():
    """Return the closest records for one or more queries.

    Request JSON:
        query: A string or a non-empty list of strings.
        max_results_per_query: Optional positive integer (default 1); values
            above ``MAX_RESULTS`` are capped to ``MAX_RESULTS``.

    Response JSON: ``total_queries`` (number of distinct queries),
    ``max_results_per_query`` (the effective limit) and ``results``, a
    mapping from each distinct query to a list of
    ``{"gross", "url", "accuracy"}`` entries ordered from most to least
    similar. Only entries with similarity >= ``SIMILARITY_THRESHOLD`` are
    included.

    Responds with HTTP 400 when the body is not a JSON object, ``query`` is
    missing, empty or contains non-strings, or ``max_results_per_query`` is
    not a positive integer.
    """
    body = request.get_json()

    # The body may be any JSON value (null, list, ...); only an object has
    # fields to read.
    if not isinstance(body, dict):
        return jsonify({"error": "'query' must be a non-empty array"}), 400

    queries = body.get("query", [])

    if isinstance(queries, str):
        queries = [queries]

    if not isinstance(queries, list) or not queries:
        return jsonify({"error": "'query' must be a non-empty array"}), 400

    if not all(isinstance(query, str) for query in queries):
        return jsonify({"error": "'query' must contain only strings"}), 400

    max_per_query = body.get("max_results_per_query", 1)  # Defaults to 1
    # bool is a subclass of int, so it has to be rejected explicitly.
    if (
        isinstance(max_per_query, bool)
        or not isinstance(max_per_query, int)
        or max_per_query < 1
    ):
        return jsonify(
            {"error": "'max_results_per_query' must be a positive integer"}
        ), 400
    max_per_query = min(max_per_query, MAX_RESULTS)

    # Drop duplicates while keeping first-seen order.
    unique_queries = list(dict.fromkeys(queries))

    # Encode all queries in one batch and compare against every record.
    query_embeddings = model.encode(unique_queries, batch_size=BATCH_SIZE)
    similarities = cosine_similarity(query_embeddings, embeddings)

    results_by_query = {}

    for query, scores in zip(unique_queries, similarities):
        query_results = []

        # Record indices ordered from highest to lowest similarity.
        for item_idx in np.argsort(scores)[::-1]:
            score = scores[item_idx]

            # Scores are visited in descending order, so nothing after the
            # first one below the threshold can qualify.
            if score < SIMILARITY_THRESHOLD:
                break

            query_results.append({
                "gross": data[item_idx]["gross"],
                "url": data[item_idx]["url"],
                "accuracy": round(float(score * 100), 2)
            })

            # Stop once the per-query limit is reached.
            if len(query_results) >= max_per_query:
                break

        results_by_query[query] = query_results

    return jsonify({
        "total_queries": len(unique_queries),
        "max_results_per_query": max_per_query,
        "results": results_by_query
    })


@app.route("/health", methods=["GET"])
def health():
    """Report service status, model name, record count and configured limits."""
    status = {
        "status": "healthy",
        "model": "paraphrase-multilingual-MiniLM-L12-v2",
        "data_items": len(data),
        "threshold": SIMILARITY_THRESHOLD,
        "max_results": MAX_RESULTS,
    }
    return jsonify(status)


@app.route("/config", methods=["GET"])
def get_config():
    """Return the module-level configuration constants and the record count."""
    config = {
        "similarity_threshold": SIMILARITY_THRESHOLD,
        "max_results": MAX_RESULTS,
        "batch_size": BATCH_SIZE,
        "total_data_items": len(data),
    }
    return jsonify(config)


if __name__ == '__main__':
    import os

    from pyngrok import ngrok

    # Expose the local server through a public ngrok URL.
    # NOTE(review): the fallback token below is a credential committed to
    # source control. It should be revoked and supplied only through the
    # NGROK_AUTHTOKEN environment variable; the fallback is kept solely so
    # the notebook keeps running unchanged.
    ngrok.set_auth_token(
        os.environ.get("NGROK_AUTHTOKEN")
        or "372B0cBmcE6XQkk7Y0JSSGdlw2P_3GqNZtELf3VTxdGDvaPMV"
    )

    # ngrok.connect() returns an NgrokTunnel object; formatting that object
    # directly yields unusable endpoint URLs, so take the plain URL string.
    public_url = ngrok.connect(5500).public_url

    print(f"\n{'='*60}")
    print(f"üöÄ Flask server running at: {public_url}")
    print(f"{'='*60}\n")
    print(f"üìä Loaded {len(data)} items")
    print(f"üéØ Similarity threshold: {SIMILARITY_THRESHOLD * 100}%")
    print(f"üìã Max results: {MAX_RESULTS}")
    print(f"\nEndpoints:")
    print(f"  POST {public_url}/search - Search for synonyms")
    print(f"  GET  {public_url}/health - Health check")
    print(f"  GET  {public_url}/config - View configuration")
    print(f"\n{'='*60}\n")

    app.run(host='0.0.0.0', port=5500, debug=False)

In [None]:
import json

# Read the records from disk and report how many were loaded.
with open("data.json", mode="r", encoding="utf-8") as f:
    data = json.load(f)

print(f"‚úÖ ƒê√£ load {len(data)} b·∫£n ghi")


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Multilingual sentence-embedding model used by the cells below.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")


In [None]:
# One label per record, in the same order as `data`.
texts = [record["gross"] for record in data]

# Embed every label with the currently loaded model.
embeddings = model.encode(texts, show_progress_bar=True)

print("‚úÖ ƒê√£ t·∫°o embedding")


In [None]:
def semantic_search_80_percent(
    query,
    data,
    embeddings,
    threshold=0.8
):
    """Return every record whose label is at least 80% similar to ``query``.

    The cut-off used to be hard-coded as 0.9, which contradicted the function
    name, its own comment and the caller's ">= 80%" message. It is now a
    parameter whose default matches the documented 80%.

    Args:
        query: Text to search for.
        data: Records with ``"gross"`` and ``"url"`` keys; paired with the
            rows of ``embeddings`` by position.
        embeddings: Embeddings to compare the query against, one per record.
        threshold: Minimum cosine similarity for a record to be included.

    Returns:
        A list of ``{"gross", "url", "accuracy"}`` dicts sorted by accuracy,
        highest first. ``accuracy`` is a percentage rounded to two decimals.
    """
    query_embedding = model.encode([query])

    similarities = cosine_similarity(
        query_embedding,
        embeddings
    )[0]

    # Keep only the records that reach the similarity threshold.
    results = [
        {
            "gross": item["gross"],
            "url": item["url"],
            "accuracy": round(float(score * 100), 2)
        }
        for item, score in zip(data, similarities)
        if score >= threshold
    ]

    results.sort(key=lambda x: x["accuracy"], reverse=True)
    return results


In [None]:
# Demo lookup against the multilingual model.
query = "ba meÃ£"

results = semantic_search_80_percent(query, data, embeddings)

if results:
    # Matches are listed as "<accuracy>% -> <label> -> <url>".
    print(f"K·∫øt qu·∫£ cho t·ª´ '{query}':\n")
    for r in results:
        print(f"{r['accuracy']}% ‚Üí {r['gross']} ‚Üí {r['url']}")
else:
    print("‚ùå Kh√¥ng c√≥ k·∫øt qu·∫£ >= 80%")
