In [None]:
#!pip install pandas
#!pip install numpy

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from tabulate import tabulate
import pandas as pd
import random as rd
import re
import math
import string  

In [None]:
# Load the raw tweet file. It is '|'-separated and has no header row, so
# pandas labels the columns with integers (0, 1, 2, ...).
data = pd.read_csv(
    "https://raw.githubusercontent.com/Hlopez490/ML01/main/usnewshealth.txt",
    header=None,
    sep='|',
)

In [None]:
# Discard the columns labelled 0 and 1; column 2 (renamed to 'Tweets' in the
# next cell) is the one the rest of the notebook works with.
data = data.drop(columns=[0, 1])

In [None]:
# Give the text column a readable name. Rebinding the result (instead of
# inplace=True) keeps the step a plain expression and avoids mutating a frame
# another cell may still reference.
data = data.rename(columns={2: 'Tweets'})

In [None]:
def preProcessing(df):
    """Normalise one raw tweet string.

    Despite its name, ``df`` is a single string (it is handed directly to
    ``re.sub``), not a DataFrame.

    Steps, in order:
      1. delete every ``@`` followed by a run of non-whitespace (mentions),
      2. delete every ``http`` followed by a run of non-whitespace (links),
      3. replace each non-word character with a single space,
      4. lower-case the result.

    Whitespace is neither collapsed nor stripped, so the returned string can
    contain leading, trailing or repeated spaces.

    Parameters
    ----------
    df : str
        The raw tweet text.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Raw-string patterns: in an ordinary literal '\s' and '\W' are invalid
    # escape sequences, which Python reports with a warning at compile time.
    df = re.sub(r'@[^\s]+', '', df)
    df = re.sub(r'http[^\s]+', '', df)
    df = re.sub(r'\W', ' ', df)
    return df.lower()

In [None]:
# Replace each raw tweet with its cleaned form (see preProcessing).
data['Tweets'] = data['Tweets'].map(preProcessing)

In [None]:
def k_means(tweets, num_clusters, max_iterations=100):
    """Cluster tweets around representative tweets chosen from the data.

    Initial centroids are ``num_clusters`` tweets drawn at random from
    distinct positions. The loop then alternates between assigning every
    tweet to a centroid and recomputing the centroids, until the centroids
    stop changing or ``max_iterations`` is reached.

    Parameters
    ----------
    tweets : iterable
        The tweets to cluster. The iterable is copied into a list, so
        positions are used rather than any index the container carries
        (e.g. the labels of a pandas Series).
    num_clusters : int
        Number of initial centroids; must be between 1 and ``len(tweets)``.
    max_iterations : int, optional
        Upper bound on assign/update rounds (default 100).

    Returns
    -------
    tuple
        ``(clusters, sse)``: the mapping produced by
        ``assign_clusters_to_tweets`` and the value of ``find_SSE`` for it.

    Raises
    ------
    ValueError
        If ``num_clusters`` is smaller than 1 or larger than the number of
        tweets. The previous rejection-sampling loop would never finish in
        the latter case.
    """
    tweets = list(tweets)

    if not 1 <= num_clusters <= len(tweets):
        raise ValueError(
            "num_clusters must be between 1 and the number of tweets "
            "({}), got {}".format(len(tweets), num_clusters)
        )

    # Draw distinct positions in one call instead of retrying random picks.
    seed_positions = rd.sample(range(len(tweets)), num_clusters)
    centroids = [tweets[position] for position in seed_positions]

    iteration_count = 0
    previous_centroids = []
    clusters = None

    while (has_not_converged(previous_centroids, centroids)) and (iteration_count < max_iterations):
        clusters = assign_clusters_to_tweets(tweets, centroids)
        previous_centroids = centroids
        centroids = update_centroids(clusters)

        iteration_count = iteration_count + 1

    if (iteration_count == max_iterations):
        print("!! Maximum iterations reached !!")

    # The loop body never ran (e.g. max_iterations == 0): still return an
    # assignment against the initial centroids rather than failing on an
    # unbound variable.
    if clusters is None:
        clusters = assign_clusters_to_tweets(tweets, centroids)

    sse = find_SSE(clusters)
    return clusters, sse


def has_not_converged(prev, new):
    """Tell whether the centroid list changed between two rounds.

    Returns True when the two lists differ in length, or when any pair of
    centroids at the same position differs after each is joined with single
    spaces. Returns False otherwise.
    """
    if len(prev) != len(new):
        return True

    return any(
        " ".join(current) != " ".join(earlier)
        for current, earlier in zip(new, prev)
    )


def assign_clusters_to_tweets(tweets, centroids):
    """Assign every tweet to its nearest centroid.

    Parameters
    ----------
    tweets : iterable
        The tweets to assign; they are visited in iteration order.
    centroids : list
        The current centroids; a centroid's position is its cluster id.

    Returns
    -------
    dict
        Maps each cluster id to a list of ``[tweet, distance]`` entries,
        where ``distance`` is the tweet's distance to the centroid that was
        closest. Every id in ``range(len(centroids))`` is present, with an
        empty list when no tweet was assigned to it.

    Notes
    -----
    * A tweet equal to a centroid goes to the first such centroid with
      distance 0, without computing any further distances.
    * Ties are resolved in favour of the lowest cluster id.
    * A tweet whose smallest distance is exactly 1 is placed in a randomly
      drawn cluster; the recorded distance stays 1.
    """
    # Pre-create every cluster so a centroid that attracts no tweet still has
    # an (empty) entry and ids stay contiguous for code that indexes by id.
    clusters = {index: [] for index in range(len(centroids))}

    for tweet in tweets:
        minimum_distance = math.inf
        cluster_index = -1

        for index, centroid in enumerate(centroids):
            # Check for an exact match first: it settles the assignment and
            # makes the distance computation unnecessary.
            if centroid == tweet:
                cluster_index = index
                minimum_distance = 0
                break

            distance = find_distance(centroid, tweet)
            if distance < minimum_distance:
                cluster_index = index
                minimum_distance = distance

        if minimum_distance == 1:
            cluster_index = rd.randint(0, len(centroids) - 1)

        clusters.setdefault(cluster_index, []).append([tweet, minimum_distance])

    return clusters


def update_centroids(clusters):
    """Pick a new centroid for every non-empty cluster.

    For each cluster the new centroid is the member whose summed distance to
    all other members is smallest (the first such member on ties).

    Parameters
    ----------
    clusters : dict
        Maps cluster ids to lists of ``[tweet, distance]`` entries; only the
        tweet of each entry is used here.

    Returns
    -------
    list
        The chosen tweets, ordered by ascending cluster id. Clusters without
        members contribute nothing, so the result can be shorter than
        ``len(clusters)``.
    """
    centroids = []

    # Iterate over the ids that exist instead of range(len(clusters)), which
    # raises KeyError as soon as an id is missing.
    for cluster_id in sorted(clusters):
        members = [entry[0] for entry in clusters[cluster_id]]
        if not members:
            # No member can serve as a representative.
            continue

        # The distance is symmetric, so compute each pair once and credit it
        # to both members.
        distance_sums = [0] * len(members)
        for first in range(len(members)):
            for second in range(first + 1, len(members)):
                distance = find_distance(members[first], members[second])
                distance_sums[first] += distance
                distance_sums[second] += distance

        # min() returns the first minimum, matching a strict '<' scan.
        best = min(range(len(members)), key=distance_sums.__getitem__)
        centroids.append(members[best])

    return centroids


def find_distance(tweet1, tweet2):
    """Jaccard distance between two tweets, compared as sets of tokens.

    Parameters
    ----------
    tweet1, tweet2 : str or iterable
        A string is split on whitespace into words first; any other iterable
        is used as its own collection of tokens. Without the split, a string
        would be compared character by character.

    Returns
    -------
    float
        ``1 - |intersection| / |union|``: 0 for identical token sets, 1 for
        disjoint ones. Two inputs without any token are treated as identical
        (distance 0) instead of dividing by zero.
    """
    tokens1 = set(tweet1.split() if isinstance(tweet1, str) else tweet1)
    tokens2 = set(tweet2.split() if isinstance(tweet2, str) else tweet2)

    union = tokens1 | tokens2
    if not union:
        return 0.0

    intersection = tokens1 & tokens2
    return 1 - (len(intersection) / len(union))


def find_SSE(clusters):
    """Sum of squared distances over all cluster members.

    Parameters
    ----------
    clusters : dict
        Maps cluster ids to lists of entries whose element at index 1 is the
        member's distance to its centroid.

    Returns
    -------
    number
        The sum of ``distance * distance`` over every entry of every
        cluster; 0 when there are no entries.
    """
    sse = 0

    # Visit the ids that exist (in ascending order) rather than assuming
    # they are exactly 0..len(clusters)-1, which raises KeyError on a gap.
    for cluster_id in sorted(clusters):
        for entry in clusters[cluster_id]:
            sse = sse + (entry[1] * entry[1])

    return sse

In [None]:
def format_print(results):
    """Print ``results`` as a 'fancy_grid' table with three fixed headers.

    ``results`` is passed to ``tabulate`` unchanged; the column headers are
    'Value of K', 'SSE' and 'Size of each cluster'.
    """
    column_titles = ['Value of K', 'SSE', 'Size of each cluster']
    table = tabulate(results, headers=column_titles, tablefmt='fancy_grid')
    print(table)

In [None]:
# Alias (not a copy) of the pre-processed frame.
practice_data = data

# Cluster counts to compare.
k_values = [10, 25, 50, 75, 100]

# One [k, sse, cluster-size summary] row per value of k.
results = []

# k_means draws its initial centroids at random, so the numbers differ from
# run to run.
for k in k_values:
    clusters, sse = k_means(practice_data['Tweets'], k)
    # Walk the cluster ids that are actually present: indexing with
    # range(len(clusters)) raises KeyError if an id is missing. Labels stay
    # 1-based (id + 1) and every line keeps its trailing " \n".
    cluster_size = "".join(
        "{}: {} tweets \n".format(cluster_id + 1, len(clusters[cluster_id]))
        for cluster_id in sorted(clusters)
    )
    results.append([k, sse, cluster_size])

In [None]:
# Render the collected [k, SSE, cluster sizes] rows as a table.
format_print(results)

╒══════════════╤═════════╤════════════════════════╕
│   Value of K │     SSE │ Size of each cluster   │
╞══════════════╪═════════╪════════════════════════╡
│           10 │ 49.6222 │ 1: 205 tweets          │
│              │         │ 2: 95 tweets           │
│              │         │ 3: 114 tweets          │
│              │         │ 4: 162 tweets          │
│              │         │ 5: 87 tweets           │
│              │         │ 6: 102 tweets          │
│              │         │ 7: 14 tweets           │
│              │         │ 8: 550 tweets          │
│              │         │ 9: 34 tweets           │
│              │         │ 10: 32 tweets          │
├──────────────┼─────────┼────────────────────────┤
│           25 │ 39.5172 │ 1: 5 tweets            │
│              │         │ 2: 21 tweets           │
│              │         │ 3: 8 tweets            │
│              │         │ 4: 23 tweets           │
│              │         │ 5: 6 tweets            │
│           