# 1. Validate

## 1.1. Check type of information

In [None]:
## Check phone number 

import re

def is_vietnamese_phone_number(number):
    """Validates if a given string is a Vietnamese phone number.

    Args:
        number (str): The phone number to validate.

    Returns:
        bool: True if the number is valid, False otherwise.
    """

    # Remove non-numeric characters
    cleaned_number = re.sub(r"\D", "", number)

    # Check for valid lengths and prefixes
    return bool(re.match(r"^(0|\+84|84|0084)([3-9]{1}|[1][2-9])\d{8}$", cleaned_number))


def convert_to_10_digits(number):
    """Converts an 11-digit Vietnamese phone number to 10 digits based on the provided rules.

    Args:
        number (str): The 11-digit phone number.

    Returns:
        str: The converted 10-digit phone number or the original number if not applicable.
    """

    cleaned_number = re.sub(r"\D", "", number)
    if len(cleaned_number) != 11:
        return number

    # Convert old format of VNese phone number into 10 digit format
    prefix = cleaned_number[:3]
    if prefix in ["0120", "0121", "0122", "0126", "0128"]:
        return "07" + cleaned_number[3:]
    elif prefix in ["0123", "0124", "0125", "0127", "0129"]:
        return "08" + cleaned_number[3:]
    elif prefix in ["0162", "0163", "0164", "0165", "0166", "0167", "0168", "0169"]:
        return "03" + cleaned_number[3:]
    elif prefix in ["0186", "0188"]:
        return "05" + cleaned_number[3:]
    elif prefix == "0199":
        return "059" + cleaned_number[4:]
    else:
        return number

# Example usage:
phone_number = "+84987654321"
if is_vietnamese_phone_number(phone_number):
    converted_number = convert_to_10_digits(phone_number)
    print(converted_number)  # Output: 987654321

+84987654321


In [None]:
import re

class DataValidator:
    def __init__(self):
        pass

    def validate_phone_number(self, phone_number):
        # A basic phone number validation, you might want to refine based on specific formats
        phone_pattern = r"^\d{10,11}$"  # Adjust pattern as needed
        return bool(re.match(phone_pattern, phone_number))

    def validate_email(self, email):
        email_pattern = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
        return bool(re.match(email_pattern, email))

    def validate_name(self, name):
        # A basic check for non-empty strings
        return bool(name.strip())

    def validate_and_clean(self, data):
        """Validates and cleans the given data.

        Args:
            data (dict): A dictionary containing customer information.

        Returns:
            dict: A validated and cleaned dictionary.
        """

        cleaned_data = {}
        for key, value in data.items():
            if key == 'phone_number':
                if self.validate_phone_number(value):
                    cleaned_data[key] = value
                else:
                    cleaned_data[key] = None
            elif key == 'email':
                if self.validate_email(value):
                    cleaned_data[key] = value
                else:
                    cleaned_data[key] = None
            else:
                cleaned_data[key] = value.strip()  # Basic cleaning for other fields

        return cleaned_data

# Example usage:
customer_data = {
    'id': '0001',
    'phone_number': '0987654321',
    'name': 'Nguyễn Văn A',
    'email': 'nguyenvanA@gmail.com'
}

validated_data = DataValidator().validate_and_clean(customer_data)
print(validated_data)

# 3. Match 

## 3.1. Compare 2 separate texts

### 3.1.1. Cosine Similarity

In [28]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_text_similarity(text1, text2):
    """Tính toán độ tương đồng giữa hai văn bản.

    Args:
        text1 (str): Văn bản thứ nhất.
        text2 (str): Văn bản thứ hai.

    Returns:
        float: Độ tương đồng giữa hai văn bản, trong khoảng từ 0 đến 1.
    """

    # Tiền xử lý văn bản: Loại bỏ stop words, stemming
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def preprocess(text):
        words = nltk.word_tokenize(text)
        words = [stemmer.stem(word) for word in words if word not in stop_words]
        return ' '.join(words)

    text1 = preprocess(text1)
    text2 = preprocess(text2)

    # Tạo vector đặc trưng cho từng văn bản
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text1, text2])

    # Tính toán độ tương đồng cosine
    cosine_sim = cosine_similarity(tfidf_matrix)[0][1]

    return cosine_sim

In [31]:
text1 = "This is a sample sentence"
text2 = "This sentence is an example"
similarity = calculate_text_similarity(text1, text2)
print(similarity)

0.5031026124151314


In [32]:
text_1 = 'Bùi Văn Hải'
text_2 = 'Bùi Vân Hải'
calculate_text_similarity(text_1, text_2)

0.5031026124151314

### 3.1.2. All methods

In [72]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.util import ngrams
from Levenshtein import distance
from gensim.models import Word2Vec
from ot import emd2  # Assuming ot is installed


def calculate_similarities(text1, text2):
    """Tính toán độ tương đồng giữa hai văn bản bằng nhiều phương pháp.

    Args:
        text1 (str): Văn bản thứ nhất.
        text2 (str): Văn bản thứ hai.

    Returns:
        dict: Một dictionary chứa các phương pháp và giá trị độ tương đồng tương ứng.
    """

    # Tiền xử lý văn bản
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    def preprocess(text):
        words = nltk.word_tokenize(text)
        words = [stemmer.stem(word) for word in words if word not in stop_words]
        return words

    tokens1 = preprocess(text1)
    tokens2 = preprocess(text2)

    # Tính toán các loại độ tương đồng
    similarities = {}

    # Jaccard similarity
    set1 = set(tokens1)
    set2 = set(tokens2)
    similarities['Jaccard'] = len(set1.intersection(set2)) / len(set1.union(set2))

    # Levenshtein distance
    similarities['Levenshtein'] = 1 - (distance(text1, text2) / max(len(text1), len(text2)))

    # Cosine similarity
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    similarities['Cosine'] = cosine_similarity(tfidf_matrix)[0][1]

    # N-gram overlap
    bigrams1 = list(ngrams(tokens1, 2))
    bigrams2 = list(ngrams(tokens2, 2))
    similarities['Bigram overlap'] = len(set(bigrams1).intersection(set(bigrams2))) / len(set(bigrams1).union(bigrams2))

    # ... Thêm các phương pháp khác như WMD, embedding-based similarity ...
    # Word Mover's Distance (WMD)
    model = Word2Vec([tokens1, tokens2], min_count=1)
    similarities['WMD'] = model.wv.wmdistance(tokens1, tokens2)

    # Embedding-based similarity (e.g., using Word2Vec)
    word_vectors = model.wv
    vector1 = sum(word_vectors[word] for word in tokens1) / len(tokens1)
    vector2 = sum(word_vectors[word] for word in tokens2) / len(tokens2)
    similarities['Embedding'] = cosine_similarity([vector1], [vector2])[0][0]

    return similarities

In [73]:
text1 = "This is a sample sentence"
text2 = "This sentence is an example"
similarity = calculate_similarities(text1, text2)
print(similarity)
print(sum(similarity.values())/len(similarity))

{'Jaccard': 0.5, 'Levenshtein': 0.33333333333333337, 'Cosine': 0.5101490193104813, 'Bigram overlap': 0.0, 'WMD': 0.47457931422164573, 'Embedding': 0.6445959}
0.4104429313175069


In [74]:
text1 = 'Bùi Văn Hải'
text2 = 'Bùi Vân Hải'
similarity = calculate_similarities(text1, text2)
print(similarity)
print(sum(similarity.values())/len(similarity))

{'Jaccard': 0.5, 'Levenshtein': 0.9090909090909091, 'Cosine': 0.5031026124151314, 'Bigram overlap': 0.0, 'WMD': 0.47457931422164573, 'Embedding': 0.6445959}
0.5052281261278779


## 3.2. Compare 2 records (as 2 vectors)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import pandas as pd

def compare_records(record1, record2):
    """Compare similarity between 2 records syntaxtically, not semantically.

    Args:
        record1 (tuple): First record Ex: (id, phone_no, name, email).
        record2 (tuple): Second record Ex: (id, phone_no, name, email).

    Returns:
        float: Cosine similarity between 2 records
    """

    # Prepare the input data
    def preprocess_text(text):
        # Preprocess text depending on the type it needed
        text = re.sub(r'\D', '', text)
        # ... Other preprocessing if needed for each type of data
        # (Ex: stemming, lemmatization)
        return text

    # Create vector for each records
    vector_data = []
    for record in [record1, record2]:
        vector_data.append(' '.join([
            preprocess_text(record[1]),
            preprocess_text(record[2]),
            record[3]
        ]))
        print(record)

    print(vector_data)

    # Tính toán TF-IDF và cosine similarity
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(vector_data)
    cosine_sim = cosine_similarity(tfidf_matrix)[0][1]

    print(tfidf_matrix)

    # Get the feature names (words)
    words = vectorizer.get_feature_names_out()

    # Get the TF-IDF values for each document
    tfidf_values = tfidf_matrix.toarray()

    # Create a DataFrame to visualize the results
    df = pd.DataFrame(tfidf_values, columns=words)
    print(df)

    return cosine_sim

In [50]:
record1 = ('0001', "123456789", "John smith", "john@gmail.com")
record2 = ('0001', "0123456789", "john sMITH", "john@gmail.com")

similarity = compare_records(record1, record2)
print(similarity)

('0001', '123456789', 'John smith', 'john@gmail.com')
('0001', '0123456789', 'john sMITH', 'john@gmail.com')
['123456789  john@gmail.com', '0123456789  john@gmail.com']
  (0, 2)	0.44832087319911734
  (0, 3)	0.44832087319911734
  (0, 4)	0.44832087319911734
  (0, 1)	0.6300993445179441
  (1, 0)	0.6300993445179441
  (1, 2)	0.44832087319911734
  (1, 3)	0.44832087319911734
  (1, 4)	0.44832087319911734
   0123456789  123456789       com     gmail      john
0    0.000000   0.630099  0.448321  0.448321  0.448321
1    0.630099   0.000000  0.448321  0.448321  0.448321
0.6029748160380572
