# URL Utils

## URL Recognition
In original training set

- `"://"` : tweets: 121330, users len: 7340
- `"http"` : tweets: 121327, users len: 7339
- `"http://"` : tweets: 121236, users len: 7338
- `"https://"` : tweets: 47, users len: 35

## Might related

1. `:/` as expression in sentence
```
    counter: 276
    userMap len: 192
    dict_keys([1, 3, 2, 7, 4, 5])
    userMap[7] = [5393, 5073]
    userMap[5] = [7152, 6128]
```
2. @handle num per twitts

## Preprocessing

Remove the `@handle` and `rt`.

In [1]:
remove_words = ["@handle", "rt"]

def pre_process(sentence):
    sentence = sentence.split()
    target_remove = set()
    for token in sentence:
        for target in remove_words:
            if target in token:
                target_remove.add(token)
                
    for target in target_remove:
        sentence.remove(target)
    
    sentence = ' '.join(sentence)
    return sentence

In [4]:
import csv
import collections
import time
import matplotlib.pyplot as plt
time_start=time.time()

train_file_path = "data/train_tweets.txt"
train_dict = collections.defaultdict(list)

empty_sentence_counter = 0
with open(train_file_path, encoding='utf-8') as tsvfile:
    for i, row in enumerate(tsvfile.readlines()):
        row = row.strip().split('\t')
        id = int(row[0])
        oldSentence = row[1].lower()
        instance = pre_process(oldSentence)
        if not instance == "":
            train_dict[id].append(instance)
        else:
            print("-----------------------------------")
            print("Empty sentence row:{0}".format(i+1))
            print("id: {0}".format(id))
            print("sentence: {0}".format(oldSentence))
            empty_sentence_counter += 1
    print("Total rows: %d" % i)
print("Total ids: %d" % len(train_dict))

time_end=time.time()
print("Empty sentence counter: {0}".format(empty_sentence_counter))
print("Time spent: {0:.2f}ms".format((time_end-time_start)*1000))

-----------------------------------
Empty sentence row:4049
id: 8573
sentence: @handle
-----------------------------------
Empty sentence row:5481
id: 9151
sentence: @handle
-----------------------------------
Empty sentence row:6224
id: 1416
sentence: @handle
-----------------------------------
Empty sentence row:6935
id: 2868
sentence: @handle
-----------------------------------
Empty sentence row:7334
id: 4330
sentence: @handle
-----------------------------------
Empty sentence row:12343
id: 130
sentence: @handle
-----------------------------------
Empty sentence row:14434
id: 4133
sentence: @handle
-----------------------------------
Empty sentence row:14437
id: 4133
sentence: @handle
-----------------------------------
Empty sentence row:14466
id: 4133
sentence: @handle
-----------------------------------
Empty sentence row:14803
id: 446
sentence: @handle "worthless"
-----------------------------------
Empty sentence row:14812
id: 446
sentence: @handle
----------------------------

-----------------------------------
Empty sentence row:177422
id: 9844
sentence: @handle dirty!
-----------------------------------
Empty sentence row:179299
id: 263
sentence: alefbaybee@handle.com
-----------------------------------
Empty sentence row:179565
id: 1474
sentence: http://bit.ly/rtcpe
-----------------------------------
Empty sentence row:180993
id: 2662
sentence: ??[:@handle
-----------------------------------
Empty sentence row:182486
id: 4185
sentence: @handle
-----------------------------------
Empty sentence row:193740
id: 6501
sentence: http://cli.gs/gqrta
-----------------------------------
Empty sentence row:194199
id: 2102
sentence: @handle models@handle.com
-----------------------------------
Empty sentence row:196628
id: 9193
sentence: @handle
-----------------------------------
Empty sentence row:196636
id: 9193
sentence: @handle upnextis@handle.com
-----------------------------------
Empty sentence row:199886
id: 5464
sentence: grrrrrr@handle-mobile
----------

In [None]:
def containSubstr(row, id, sentence, quiet=False):
    #if ("://" in sentence) and ("gn://" not in sentence) and ("http://" not in sentence):
    #if (":/" in sentence):
    #if ("https://" in sentence) and ("http://" in sentence):
    if ("@handle" in sentence):
        if not quiet:
            print("--------------------------------------------")
            print("Row: {0}".format(row+1))
            print("id: {0}".format(id))
            print("Sentence: {0}".format(sentence))
            print("--------------------------------------------")
        return True
    return False

In [127]:
def extractURL(row, id, sentence, quiet=False):
    target = "http://"
    subStr = sentence
    urlList = []
    while (target in subStr):
        index = subStr.index(target)
        subStr = subStr[index:]
        tempURL = subStr.split()[0]
        subStr = subStr[len(tempURL):]
        tempURL = tempURL[len(target):]
        if len(tempURL) <= 5:
#             print("--------------------------------------------")
#             print("Row: {0}".format(row+1))
#             print("Sentence: {0}".format(sentence))
#             print(tempURL)
            continue
        if "http://" in tempURL:
            newURLList = tempURL.split("http://")
            if '' in newURLList:
                newURLList.remove('')
            for newURLListItem in newURLList:
                if '/' in newURLListItem:
                    newURLListItem = newURLListItem.split('/')[0]
                    part = newURLListItem.split('.')
                    urlList.append("{0}.{1}".format(part[-2], part[-1]))
            continue
        elif "http:/" in tempURL:
            tempURL = tempURL.replace("http:/", '')
            if len(tempURL) < 5:
                continue
        if '/' in tempURL:
            tempURL = tempURL.split('/')[0]
        part = tempURL.split('.')
        while '' in part:
            part.remove('')
        if len(part) < 2:
            continue
        tempURL = "{0}.{1}".format(part[-2], part[-1])
        if '?' in tempURL:
            part = tempURL.split("?")
            while '' in part:
                part.remove('')
            tempURL = part[0]
        urlList.append(tempURL)
    if (0 != len(urlList)) and (not quiet):
        print("--------------------------------------------")
        print("Row: {0}".format(row+1))
        print("id: {0}".format(id))
        print("Sentence: {0}".format(sentence))
        print("--------------------------------------------")
    return urlList

In [None]:
counter = 0
userMap = {}
urlMap = {}
with open(train_file_path, encoding='utf-8') as tsvfile:
    lines = tsvfile.readlines()
    for i, row in enumerate(lines):
        row = row.strip().split("\t")
        id = int(row[0])
        instance = row[1].lower()
#         if containSubstr(i, id, instance, quiet=True):
#         if containSubstr(i, id, instance, quiet=True):
#            if id in userMap.keys():
#                userMap[id] += 1
#            else:
#                userMap[id] = 1
        urlList = extractURL(i, id, instance, quiet=True)
        len_urlList = len(urlList)
        if (0 != len_urlList):
            if id in userMap.keys():
                userMap[id] += len_urlList
            else:
                userMap[id] = len_urlList
            if id in urlMap.keys():
                urlMap[id] += urlList
            else:
                urlMap[id] = urlList
        counter += 1

In [None]:
counterMap = {}
for key in userMap.keys():
    tempNum = userMap[key]
    if tempNum in counterMap.keys():
        counterMap[tempNum].append(key)
    else:
        counterMap[tempNum] = [key]

x = list(counterMap.keys()).copy()
x.sort()
x = x[50:]
y = []
for key in x:
    y.append(len(counterMap[key]))

%matplotlib inline
plt.plot(x, y)
plt.xlabel("Tweets contains key words pre user")
plt.ylabel("num of users")
plt.show()
print("tweets: %d, users len: %d" % (counter, len(userMap.keys())))
#print(counterMap.keys())
#print(sum(y[:20]))

In [125]:
def sortedDictValues(adict): 
    keys = list(adict.keys())
    keys.sort()
    return [(key, adict[key]) for key in keys]

urlUserMap = {}
for user in urlMap.keys():
    for url in urlMap[user]:
        if url in urlUserMap.keys():
            urlUser = urlUserMap[url]
            if user in urlUser.keys():
                urlUser[user] += 1
            else:
                urlUser[user] = 1
        else:
            urlUserMap[url] = {user : 1}

# sort user
for urlUser in urlUserMap.keys():
    urlUserMap[urlUser] = sortedDictValues(urlUserMap[urlUser])

urlUserListSorted = sortedDictValues(urlUserMap)
# output sorted url
with open("data/sortedURL.csv", 'w') as file:
    for (url, item) in urlUserListSorted:
        for (user, frequency) in item:
            file.write("{0},{1},{2}\n".format(url,user,frequency))