# Bài 1: Cài đặt giải thuật Viterbi với framcode:  

In [4]:
def viterbi(obs, states, start_p, trans_p, emit_p):
    T = len(obs)
    # N = len(states)
    T1 = [{} for _ in range(T)]
    T2 = [{} for _ in range(T)]

    # Initialization
    for state in states:
        T1[0][state] = start_p[state] * emit_p[state][obs[0]]
        T2[0][state] = None

    # Forward algorithm
    for t in range(1, T):
        for curr_state in states:
            max_prob = max(T1[t - 1][prev_state] * trans_p[prev_state][curr_state] * emit_p[curr_state][obs[t]] for prev_state in states)
            T1[t][curr_state] = max_prob
            T2[t][curr_state] = max((T1[t - 1][prev_state] * trans_p[prev_state][curr_state], prev_state) for prev_state in states)[1]

    # Backtracking
    best_path = []
    max_prob = max(T1[T - 1].values())
    previous_state = None

    for state, prob in T1[T - 1].items():
        if prob == max_prob:
            best_path.append(state)
            previous_state = state
            break

    for t in range(T - 2, -1, -1):
        best_path.insert(0, T2[t + 1][previous_state])
        previous_state = T2[t + 1][previous_state]

    return best_path

obs = ('normal', 'cold', 'dizzy')
states = ('Healthy', 'Fever')
start_p = {'Healthy': 0.6, 'Fever': 0.4}
trans_p = {
    'Healthy': {'Healthy': 0.7, 'Fever': 0.3},
    'Fever': {'Healthy': 0.4, 'Fever': 0.6}
}
emit_p = {
    'Healthy': {'normal': 0.5, 'cold': 0.4, 'dizzy': 0.1},
    'Fever': {'normal': 0.1, 'cold': 0.3, 'dizzy': 0.6}
}

result = viterbi(obs, states, start_p, trans_p, emit_p)
print("The most likely state sequence is:", result)


The most likely state sequence is: ['Healthy', 'Healthy', 'Fever']


# Bài 2: Cài đặt giải thuật BFR và ứng dụng giải thuật này cho 1 bộ dữ liệu  


In [5]:
import numpy as np

class BFR:
    def __init__(self, data, k, threshold):
        self.data = data
        self.k = k
        self.threshold = threshold
        self.clusters = []

    def run(self):
        self.clusters.append(self.data)
        while len(self.clusters) < self.k:
            largest_cluster = max(self.clusters, key=len)
            if len(largest_cluster) <= self.threshold:
                break
            new_clusters = self.split_cluster(largest_cluster)
            self.clusters.remove(largest_cluster)
            self.clusters.extend(new_clusters)

    def split_cluster(self, cluster):
        centroid = np.mean(cluster, axis=0)
        distances = np.linalg.norm(cluster - centroid, axis=1)
        furthest_point_idx = np.argmax(distances)
        furthest_point = cluster[furthest_point_idx]
        split_point = np.random.uniform(furthest_point - centroid, furthest_point + centroid)
        cluster1 = []
        cluster2 = []
        for point in cluster:
            if np.linalg.norm(point - split_point) < np.linalg.norm(point - centroid):
                cluster1.append(point)
            else:
                cluster2.append(point)
        return cluster1, cluster2

# Example usage:
data = np.random.randn(100, 2)  # Sample data with 100 points in 2 dimensions
bfr = BFR(data, k=3, threshold=10)
bfr.run()
print("Final clusters:")
for i, cluster in enumerate(bfr.clusters):
    print(f"Cluster {i+1}: {len(cluster)} points")


Final clusters:
Cluster 1: 7 points
Cluster 2: 6 points
Cluster 3: 87 points


# Bài 3: Cài đặt giải thuật CURE và ứng dụng giải thuật này cho 1 bộ dữ liệu  

In [6]:
import numpy as np

class CURE:
    def __init__(self, k, num_representatives):
        self.k = k
        self.num_representatives = num_representatives

    def fit_predict(self, X):
        # Chọn các điểm đại diện ban đầu
        representatives = self.select_representatives(X)

        # Phân cụm dữ liệu ban đầu
        clusters = self.cluster(X, representatives)

        # Tối ưu hóa các cụm
        self.optimize_clusters(clusters)

        return clusters

    def select_representatives(self, X):
        # Lựa chọn ngẫu nhiên các điểm làm đại diện ban đầu
        indices = np.random.choice(X.shape[0], self.num_representatives, replace=False)
        return X[indices]

    def cluster(self, X, representatives):
        clusters = [[] for _ in range(self.num_representatives)]

        # Phân mỗi điểm dữ liệu vào cụm gần nhất
        for point in X:
            min_dist = float('inf')
            min_rep_index = None
            for i, rep in enumerate(representatives):
                dist = np.linalg.norm(point - rep)
                if dist < min_dist:
                    min_dist = dist
                    min_rep_index = i
            clusters[min_rep_index].append(point)

        return clusters

    def optimize_clusters(self, clusters):
        # Phương pháp tối ưu hóa cụm ở đây
        pass

# Sử dụng giải thuật CURE trên một tập dữ liệu giả định
if __name__ == "__main__":
    # Tạo dữ liệu giả định
    np.random.seed(0)
    X = np.random.rand(100, 2)  # 100 điểm dữ liệu 2 chiều

    # Sử dụng giải thuật CURE
    k = 3  # số cụm cần phân
    num_representatives = 5  # số điểm đại diện ban đầu
    cure = CURE(k, num_representatives)
    clusters = cure.fit_predict(X)

    # In ra các cụm
    for i, cluster in enumerate(clusters):
        print(f"Cluster {i + 1}: {cluster}")


Cluster 1: [array([0.65632959, 0.13818295]), array([0.56660145, 0.26538949]), array([0.52324805, 0.09394051]), array([0.58651293, 0.02010755]), array([0.52103661, 0.05433799]), array([0.57722859, 0.23789282])]
Cluster 2: [array([0.07103606, 0.0871293 ]), array([0.52184832, 0.41466194]), array([0.21038256, 0.1289263 ]), array([0.31542835, 0.36371077]), array([0.20887676, 0.16130952]), array([0.46631077, 0.24442559]), array([0.15896958, 0.11037514]), array([0.19658236, 0.36872517]), array([0.28280696, 0.12019656]), array([0.2961402 , 0.11872772]), array([0.31798318, 0.41426299]), array([0.28940609, 0.18319136]), array([0.0191932 , 0.30157482]), array([0.13547406, 0.29828233]), array([0.4071833, 0.069167 ]), array([0.01171408, 0.35997806]), array([0.19999652, 0.01852179]), array([0.39822106, 0.20984375]), array([0.22741463, 0.25435648])]
Cluster 3: [array([0.5488135 , 0.71518937]), array([0.60276338, 0.54488318]), array([0.4236548 , 0.64589411]), array([0.0202184 , 0.83261985]), array([0.

# END

In [5]:
def calculate_mean(data):
    return sum(data) / len(data)

def calculate_median(data):
    n = len(data)
    if n % 2 == 0:
        return (data[n//2 - 1] + data[n//2]) / 2
    else:
        return data[n//2]

def calculate_midrange(data):
    return (max(data) + min(data)) / 2

def calculate_quartiles(data):
    n = len(data)
    q2 = calculate_median(data)
    if n % 2 == 0:
        lower_half = data[:n//2]
        upper_half = data[n//2:]
    else:
        lower_half = data[:n//2]
        upper_half = data[n//2 + 1:]
    q1 = calculate_median(lower_half)
    q3 = calculate_median(upper_half)
    return q1, q2, q3

def calculate_iqr(data):
    q1, q2, q3 = calculate_quartiles(data)
    return q3 - q1

# Given sorted data
data = [6, 7, 15, 36, 39, 40, 41, 42, 43, 47, 49]

# Calculating statistics
mean = calculate_mean(data)
median = calculate_median(data)
midrange = calculate_midrange(data)
q1, q2, q3 = calculate_quartiles(data)
iqr = calculate_iqr(data)

# Printing the results
print("Data:", data)
print("Mean =", mean)
print("Median =", median)
print("Midrange =", midrange)
print("Q1 =", q1)
print("Q2 =", q2)
print("Q3 =", q3)
print("IQR =", iqr)


Data: [6, 7, 15, 36, 39, 40, 41, 42, 43, 47, 49]
Mean = 33.18181818181818
Median = 40
Midrange = 27.5
Q1 = 15
Q2 = 40
Q3 = 43
IQR = 28


- Bấm máy thì ra được Mean
- Bấm máy ra được median
- Bấm máy sẽ ra được min với max : (max +  min)/2 = Midrange
- Bấm máy ra được Q1
- Bấm máy ra được Q3
- IQR : Lấy Q3 - Q1 = IQR
- Q2 = Median

In [12]:
import numpy as np

# Given sorted data
data = [4, 8, 9, 15, 21,21, 21, 24, 25, 26, 28, 29, 34]

# Step 1: Partition into Equal-Width Bins
equal_width_bins = np.array_split(data, 4)
print(equal_width_bins)
# Step 2: Partition into Equal-Depth Bins
equal_depth_bins = []
bin_size = len(data) // 4
print(bin_size)
for i in range(4):
    bin_start = i * bin_size
    bin_end = bin_start + bin_size
    equal_depth_bins.append(data[bin_start:bin_end])

# Step 3: Smoothing by Bin Median
bin_median_smoothed_data = [np.median(bin) for bin in equal_depth_bins]

# Step 4: Smoothing by Bin Boundaries
bin_boundary_smoothed_data = [bin_data[0] if i % 2 == 0 else bin_data[-1] for i, bin_data in enumerate(equal_depth_bins)]

# Comparing results
print("Equal-Width Bins:")
for i, bin_data in enumerate(equal_width_bins):
    print(f"Bin {i+1}: {bin_data}")

print("\nEqual-Depth Bins:")
for i, bin_data in enumerate(equal_depth_bins):
    print(f"Bin {i+1}: {bin_data}")

print("\nSmoothing by Bin Median:")
print(bin_median_smoothed_data)

print("\nSmoothing by Bin Boundaries:")
print(bin_boundary_smoothed_data)


[array([ 4,  8,  9, 15]), array([21, 21, 21]), array([24, 25, 26]), array([28, 29, 34])]
3
Equal-Width Bins:
Bin 1: [ 4  8  9 15]
Bin 2: [21 21 21]
Bin 3: [24 25 26]
Bin 4: [28 29 34]

Equal-Depth Bins:
Bin 1: [4, 8, 9]
Bin 2: [15, 21, 21]
Bin 3: [21, 24, 25]
Bin 4: [26, 28, 29]

Smoothing by Bin Median:
[8.0, 21.0, 24.0, 28.0]

Smoothing by Bin Boundaries:
[4, 21, 21, 29]


: 

- Equal-width :
    - Bin 1: [4, 8, 9]
    - Bin 2: [15, 21, 21]
    - Bin 3: [24, 25, 26]
    - Bin 4: [28, 29, 34]

- equal-depth