In [1]:
import string
import nltk
from nltk.corpus import stopwords
#nltk.download() # only needs to be run once.
import json
from pprint import pprint

In [2]:
def clean_up_tweet(tweet, stop_words=None):
    """Reduce a tweet's text to its set of distinct, non-stop words.

    Punctuation (every character in ``string.punctuation``) is stripped,
    the text is split on any run of whitespace, and stop words are dropped.
    The stop-word check is case-insensitive; surviving words keep their
    original case.

    Args:
        tweet: raw tweet text (str).
        stop_words: optional collection of lowercase words to discard.
            When omitted, NLTK's English stop-word list is used.

    Returns:
        A set of the remaining words.
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))

    # remove punctuation in a single pass.
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))

    # split() without an argument also splits on tabs/newlines and never
    # yields the empty string; split(' ') would emit '' for consecutive
    # spaces, and '' would then count as a word shared between tweets.
    return {w for w in tweet.split() if w.lower() not in stop_words}


In [3]:
# find the size of the intersection of two tweets

def intersection(tweet1, tweet2):
    """Return the number of distinct words that occur in both tweets.

    Args:
        tweet1: iterable of words (typically a set).
        tweet2: iterable of words (typically a set).

    Returns:
        The size of the set intersection, as an int.
    """
    # Converting to sets makes the count correct even if a caller passes a
    # list containing repeated words, and avoids comparing every pair.
    return len(set(tweet1) & set(tweet2))

In [4]:
def union(tweet1, tweet2):
    """Return the number of distinct words that occur in either tweet.

    Args:
        tweet1: iterable of words (typically a set).
        tweet2: iterable of words (typically a set).

    Returns:
        The size of the set union, as an int.
    """
    # Computed directly as a set union so the result is also correct for
    # inputs that contain repeated words.
    return len(set(tweet1) | set(tweet2))

In [5]:
def jaccard_distance(tweet1, tweet2):
    """Return the Jaccard distance between two collections of words.

    The distance is ``1 - |A & B| / |A | B|``: 0 for identical word sets
    and 1 for word sets with nothing in common.

    Args:
        tweet1: iterable of words (typically a set).
        tweet2: iterable of words (typically a set).

    Returns:
        A number between 0 and 1 inclusive.
    """
    words1, words2 = set(tweet1), set(tweet2)
    union_size = len(words1 | words2)

    # Two empty word sets (e.g. tweets made only of stop words) are
    # identical; report distance 0 instead of dividing by zero.
    if union_size == 0:
        return 0.0

    return 1 - len(words1 & words2) / union_size

In [6]:
def dump_tweets_to_dict(filename):
    """Load a file of one-JSON-object-per-line tweets into a dictionary.

    Args:
        filename: path to the file; each non-blank line must be a JSON
            object.

    Returns:
        A dict mapping each object's 'id' value to its 'text' value,
        e.g. 323906397609791488 -> "RT @ItsJennaMarbles: Re...".
        A missing key is read as None (dict.get semantics).
    """
    tweet_dict = {}

    # The with-block closes the file on exit, so no explicit close is needed.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            json_data = json.loads(line)
            tweet_dict[json_data.get('id')] = json_data.get('text')

    return tweet_dict
    
        

In [7]:
def make_initial_centroid_ids():
    """Return a fresh list of the hard-coded tweet ids used as initial centroids."""
    return [
        323906397735641088,
        323906483584655360,
        323906657333682176,
        323907258301939713,
        323909308188344320,
        323913403460636673,
        324067437886713856,
        324117950774775809,
        324138055772561408,
        324219503401644033,
        324320247018573824,
        324346553835868161,
        324372750330363904,
        324408472441585664,
        324422817565257728,
        324448013999304704,
        324785120085176320,
        325059351209443329,
        325060324992643072,
        325162944931438592,
        325253327048822784,
        325337623910559745,
        325409910642835456,
        325701934273134594,
        325946633986641920,
    ]


In [8]:
# tweets_dict is a dictionary: tweet_id -> tweet text.
# centroids is a list of tweet ids chosen as centroids.
# returns a dictionary: centroid_id -> [tweet_id1, tweet_id2, ..., tweet_idn]
def clustering(centroids, tweets_dict):
    """Assign every tweet to its nearest centroid by Jaccard distance.

    Args:
        centroids: list of tweet ids acting as cluster centroids; each id
            is looked up in ``tweets_dict``.
        tweets_dict: dict mapping tweet id -> tweet text.

    Returns:
        A dict mapping each centroid id to the list of tweet ids assigned
        to it (possibly empty). When several centroids are equally close,
        the one listed first in ``centroids`` wins.

    Raises:
        ValueError: if there are tweets to assign but ``centroids`` is empty.
    """
    # every centroid gets an entry, even if no tweet ends up assigned to it.
    cluster_dict = {centroid_id: [] for centroid_id in centroids}
    if not tweets_dict:
        return cluster_dict

    # Clean each centroid's text once up front rather than once per tweet;
    # the cleaned words do not change while tweets are being assigned.
    centroid_words = {
        centroid_id: clean_up_tweet(tweets_dict.get(centroid_id))
        for centroid_id in centroids
    }

    for tweet_id, tweet_text in tweets_dict.items():
        tweet = clean_up_tweet(tweet_text)

        # min() returns the first minimal key in iteration order, i.e. the
        # earliest centroid in `centroids` among those at the smallest distance.
        target_centroid = min(
            centroid_words,
            key=lambda centroid_id: jaccard_distance(tweet, centroid_words[centroid_id]),
        )
        cluster_dict[target_centroid].append(tweet_id)

    return cluster_dict

#print(clustering(make_initial_centroid_ids(),dump_tweets_to_dict('Tweets.json')))

In [9]:
# cluster_dict is a dictionary: centroid_id -> [tweet_id1, tweet_id2, ..., tweet_idn]
# tweets_dict is a dictionary: tweet_id -> tweet text
#                              e.g. id:text -> 323906397609791488: "RT @ItsJennaMarbles: Re..."
# returns a list of new centroid ids.

def update_centroids(cluster_dict, tweets_dict):
    """Pick a new centroid for every cluster.

    For each cluster the new centroid is the member tweet whose summed
    Jaccard distance to all members of that cluster is smallest; on a tie
    the member listed first in the cluster wins.

    Args:
        cluster_dict: dict mapping centroid id -> list of member tweet ids.
        tweets_dict: dict mapping tweet id -> tweet text.

    Returns:
        A list with one centroid id per entry of ``cluster_dict``, in the
        same order.
    """
    updated_centroid_id_list = []
    for centroid_id, cluster in cluster_dict.items():

        if not cluster:
            # A centroid can end up with no members (e.g. another centroid
            # with identical words won every tie). There is nothing to pick
            # from, so keep the current centroid instead of crashing on
            # min() of an empty list.
            updated_centroid_id_list.append(centroid_id)
            continue

        # clean each member once instead of once per pairwise comparison.
        cluster_words = [clean_up_tweet(tweets_dict.get(tweet_id)) for tweet_id in cluster]

        # total distance from each candidate to every member of the cluster.
        dist_lst = [
            sum(jaccard_distance(candidate, other) for other in cluster_words)
            for candidate in cluster_words
        ]
        shortest_dist_index = dist_lst.index(min(dist_lst))

        updated_centroid_id_list.append(cluster[shortest_dist_index])
    return updated_centroid_id_list

# print(update_centroids(clustering(make_initial_centroid_ids(),dump_tweets_to_dict('Tweets.json')),
#                  dump_tweets_to_dict('Tweets.json')))    

In [None]:
def compare_list(list1, list2):
    """Return True when both lists are non-empty and hold the same items.

    Order is ignored: the lists are compared after sorting. If either
    argument is empty or None the result is False.
    """
    both_present = bool(list1) and bool(list2)
    return both_present and sorted(list1) == sorted(list2)

In [None]:
# Driver: repeat assignment and centroid update until the centroids stop changing.

# Load the tweets (id -> text) and the starting centroids, then do the first assignment.
tweets_dict = dump_tweets_to_dict('Tweets.json')
centroids_list = make_initial_centroid_ids()
cluster_dict = clustering(centroids_list,tweets_dict)

# Recompute centroids from the current clusters; as long as they differ from
# the centroids that produced those clusters, adopt them and re-cluster.
# (The variable name below is misspelled in the original; it is kept because
# it is module-level state.)
updated_cetnroids_list = update_centroids(cluster_dict, tweets_dict)
while not compare_list(centroids_list,updated_cetnroids_list):
    centroids_list = updated_cetnroids_list
    cluster_dict = clustering(centroids_list,tweets_dict)
    updated_cetnroids_list = update_centroids(cluster_dict, tweets_dict)


print(clustering(updated_cetnroids_list,tweets_dict))
