In [36]:
import string
import nltk
from nltk.corpus import stopwords
#nltk.download() # only need to download for once.
import json
from pprint import pprint

In [37]:
def clean_up_tweet(tweet):
    
    # remove punctuations.
    for p in string.punctuation:
        tweet = tweet.replace(p,'')
    
    tweet = tweet.split(' ')
    
    # remove stop words.
    stopWords = set(stopwords.words('english'))
    filtered = []
    for w in tweet:
        if w not in stopWords:
            filtered.append(w)
     
    # remove redundance.
    tweet_set = set(filtered)

    return tweet_set


In [38]:
# find the intersection of two tweets

def intersection(tweet1, tweet2):

    counter = 0
    for word1 in tweet1:
        for word2 in tweet2:
            if(word1 == word2):
                counter += 1
            
    return counter

In [39]:
def union(tweet1, tweet2):
    return len(tweet1) + len(tweet2) - intersection(tweet1,tweet2)

In [60]:
def jaccard_distance(tweet1, tweet2):
    return 1 - (intersection(tweet1, tweet2)/union(tweet1, tweet2))

In [98]:
def dump_tweets_to_dict(filename):
    
    # make a new dictionary to store tweets: 
    # id:text -> 323906397609791488: "RT @ItsJennaMarbles: Re..."
    tweet_dict = {}
    
    # open json file to read lines.
    with open(filename,'r') as f:
        for line in f:
            json_data = json.loads(line)
            tweet_dict[json_data.get('id')] = json_data.get('text')
   
    return tweet_dict    
    f.closed
    
        

In [99]:
def make_initial_centroid_ids():
    initial_centroids = [323906397735641088, 323906483584655360, 323906657333682176, 323907258301939713, 323909308188344320, 323913403460636673, 324067437886713856, 324117950774775809, 324138055772561408, 324219503401644033, 324320247018573824, 324346553835868161, 324372750330363904, 324408472441585664, 324422817565257728, 324448013999304704, 324785120085176320, 325059351209443329, 325060324992643072, 325162944931438592, 325253327048822784, 325337623910559745, 325409910642835456, 325701934273134594, 325946633986641920 ] 
    return initial_centroids


In [157]:
# tweets is a dictionary of tweets.
# centroids is a list of tweet ids chosen as centroids.
def init_clustering(centroids, tweets):
    
    # cluster_list is a dict: centroid_id -> cluster tweets
    cluster_list = {}

    # we first associate each tweet to a cluster centroid.
    tweet_centroid_pair = {}
    for tweet_id, tweet_text in tweets.items():

        # a dictionary to store distance and centroid key.
        distances = {}
        for centroid_id in centroids:
            
            tweet = clean_up_tweet(tweet_text)
            centroid = clean_up_tweet(tweets.get(centroid_id))
            
            dist = jaccard_distance(tweet,centroid)
            
            # add distance to dictionary.
            distances[centroid_id] = dist

        # find the smallest distance between current tweet to all centroids.
        min_dist = min(distances.values())
        
        target_centroid = None
        for key, val in distances.items():
            if(val == min_dist):
                target_centroid = key
                break;
        
        tweet_centroid_pair[tweet_id] = target_centroid

    
    for centroid_id in centroids:
        cluster = []
        for key, val in tweet_centroid_pair.items():
            if(val == centroid_id):
                cluster.append(key)
        cluster_list[centroid_id] = cluster
        
    return cluster_list

#print(init_clustering(make_initial_centroid_ids(),dump_tweets_to_dict('Tweets.json')))

{323906397735641088: [323906397609791488, 323906397618196483, 323906397735641088, 323906397853073410, 323906397962121216, 323906398012461057, 323906398230544385, 323906398314438656, 323906398352195585, 323906398826164225, 323906398993932289, 323906399149109248, 323906399295926273, 323906399300100096, 323906656318676993, 323907087551836160, 323907771256938496, 323908455545049088, 323908795254312962, 323908795396943872], 323906483584655360: [323906483584655360, 323906485249789952, 323911610236293120, 323915000567697409, 323916051614138368, 323920146454425600, 323921510282702848, 323923559799996417, 323925264352546816], 323906657333682176: [323906650987692034, 323906651209994241, 323906653651079168, 323906657333682176], 323907258301939713: [323906398176030720, 323906567294562306, 323907258301939713, 323910330315075584, 323910330457669633, 323955716392112128, 323963901769297921, 324226045052071936, 323932094190439874], 323909308188344320: [323909308188344320, 324229792834674689], 323913403

In [71]:
t1 = 'it is weather is scary is is.'
t2 = 'the weather is nice is is!'

tweet1 = clean_up_tweet(t1)
tweet2 = clean_up_tweet(t2)
