#  Web and Social Information Extraction - A.Y. 2017/2018 

# MULTI-DOMAIN SOCIAL RECOMMENDER SYSTEM

## Author: Kadir Mert Ozcan - Matricola: 1780512



#### Important: All the files need to exist in the same directory as this ipynb file.

In [73]:
import numpy as np
import sklearn

In [74]:
import time
#start_time = time.time()

# Part 1 - Reading the Data and Categorizing the Interests

In [75]:
#Ignore the categories that include these in their names
ignore = {"living_people", "deaths", "births"}

First, the IDs and names of the interests are read from the info files below

In [76]:
#Dictionary to store <interestId, interestName> pairs
interestDict = {}
with open('friend-based_interest_info.tsv') as tsvfile:
  reader = csv.reader(tsvfile, delimiter="\t", quotechar='"')
  for row in reader:
    interestID = row[0]
    interestName = row[1][8:]
    interestDict[interestID] = interestName
with open('message-based_interest_info.tsv', encoding="utf8") as tsvfile:
  reader = csv.reader(tsvfile, delimiter="\t", quotechar='"')
  for row in reader:
    interestID = row[0]
    interestName = row[1][8:]
    interestDict[interestID] = interestName

After that, another dictionary is created that is the inversed version of interestDict

This will make it easier to access interests when fetching the categories (and will cost only a small amount of memory)

In [77]:
#Dictionary to store <interestName, interestId> pairs
inversedInterestDict = {}
for interest in interestDict:
    interestName = interestDict[interest]
    if interestName in inversedInterestDict:
        inversedInterestDict[interestName].add(interest)
    else:
        inversedInterestDict[interestName] = set()
        inversedInterestDict[interestName].add(interest)

Then the file called 'article_categories_en.nt' is read to get the categories, 
#### downloaded from http://downloads.dbpedia.org/3.9/en/article_categories_en.nt.bz2

In [None]:
#Dictionary to store <interestName, categorySet> pairs
interestCategoryDict = {}
with open('article_categories_en.nt') as ntfile:
  reader = csv.reader(ntfile, delimiter=" ", quotechar='"')
  for row in reader:
    interestResult = re.search('<http://dbpedia.org/resource/(.*)>', row[0])
    categoryResult = re.search('<http://dbpedia.org/resource/Category:(.*)>', row[2])
    if interestResult != None and categoryResult != None:
        interest = interestResult.group(1)
        category = categoryResult.group(1)
        if any(x in category.lower() for x in ignore):
            continue
        else:
            if interest in inversedInterestDict:
                if interest in interestCategoryDict:
                    interestCategoryDict[interest].add(category)
                else:
                    interestCategoryDict[interest] = set()
                    interestCategoryDict[interest].add(category)

Every user and their interests are read from the 2 datasets below.

InterestIds are held as strings instead of set or list to be more memory efficient.

In [None]:
#Dictionary to store <userId, interestIds> pairs.
userInterestDict = {}
with open('friend-based_dataset.tsv') as tsvfile:
    reader = csv.reader(tsvfile, delimiter="\t", quotechar='"')
    for row in reader:
        if row[0] in userInterestDict:
            userInterestDict[row[0]] = userInterestDict[row[0]] + "," + row[1]
        else:
            userInterestDict[row[0]] = row[1]
with open('message-based_dataset.tsv') as tsvfile:
    reader = csv.reader(tsvfile, delimiter="\t", quotechar='"')
    for row in reader:
        if row[0] in userInterestDict:
            userInterestDict[row[0]] = userInterestDict[row[0]] + "," + row[2]
        else:
            userInterestDict[row[0]] = row[2]

Top 4 most frequent categories are gathered for each user.

In [None]:
#List to store top categories for each user. Every entry is top n categories for a user.
categories = []
#Number of categories to get for each user
numberOfCats = 3
for user, interests in userInterestDict.items():
    #Dictionary to store category frequencies of all interest for a user
    localCategoryFreqDict = {}
    interestArr = interests.split(',')
    for interest in interestArr:
        if interestDict[interest] in interestCategoryDict:
            for category in interestCategoryDict[interestDict[interest]]:
                if category in localCategoryFreqDict:
                    localCategoryFreqDict[category] = localCategoryFreqDict[category] + 1
                else:
                    localCategoryFreqDict[category] = 1
    categories.append(" ".join(sorted(localCategoryFreqDict, key=localCategoryFreqDict.get, reverse=True)[:numberOfCats]))
#print("--- %s seconds ---" % (time.time() - start_time))

# Part 2 - Clustering the Users

This custom tokenizer is used so that every category in the list "category" is treated as a token.

In [35]:
def my_tokenizer(s):
    return s.split(' ')

Categories are vectorized using TfidfVectorizer.

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
vectorizer = TfidfVectorizer(tokenizer = my_tokenizer)
#X is a sparse term-document matrix of user-category pairs
X = vectorizer.fit_transform(categories)

Dimentionality is reduced using TruncatedSVD to prevent overlapping clusters

In [44]:
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix
svd = TruncatedSVD(n_components=8, n_iter=10, random_state=42)
reducedX = svd.fit_transform(X)  

Reduced matrix is fitted to k-means clusterin algorithm.

In [56]:
kmeans = KMeans(n_clusters=30, init='k-means++',n_init=10, verbose=0)
kmeans.fit(reducedX)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=30, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

# Part 3 - Evaluating the Clusters

Clusters are evaluated with silhouette score.

In [57]:
from sklearn import metrics
metrics.silhouette_score(reducedX, kmeans.labels_, metric='euclidean', sample_size = 1000)

0.79683111782414362

# Part 4 - Clustering Twitter Users

Special keys and tokens are required to fetch data using twitter api

In [15]:
from twitter import *

CONSUMER_KEY = '9HqYMGHtCok5QINP6n4YokR9U'
CONSUMER_SECRET = 'jfOG4iLpX8pKgZ4ZGbjdPzCxZjlI6VNUfY1RKxrYSgPoDHzRzF'
ACCESS_TOKEN = '973238402580664325-WiXwkPnQ5YzGrwTv0nr9omsjspfxyhT'
ACCESS_TOKEN_SECRET = '4z4q0zoJ7Pfqv03IiQeGRQDmX2BM5o1FsbkeXipCyaLKA'

twitter = Twitter(auth = OAuth(ACCESS_TOKEN,
                  ACCESS_TOKEN_SECRET,
                  CONSUMER_KEY,
                  CONSUMER_SECRET))

UserId's are read from S21.tsv and requested from twitter. Due to the rate limit, the loop is staggered using time.sleep(60)


## NOTE: Twitter gives Time Limit error after 15 fetches per 15 minutes. With 1500 users, it takes 1500 minutes (almost 24 hours) to fetch for all the users. For this reason, the last clustering wasn't done for complete list of users. This is why there is an error for interrupting below the code block. However, if one has time to wait for all friendship data to be fetched, all users can be clustered.

In [16]:
#start_time = time.time()
#Dictionary to store <userId, friends> pair
twitterUserList = {}
with open('S21.tsv') as tsvfile:
  reader = csv.reader(tsvfile, quotechar='"')
  for row in reader:
    twitterUserList[row[0]] = ""
    try:
        friends = twitter.friends.ids(user_id=row[0])['ids']
        twitterUserList[row[0]] = ' '.join(str(x) for x in friends)
    except Exception as e:
        del twitterUserList[row[0]]
        print(e)
    time.sleep(60)
#print("--- %s seconds ---" % (time.time() - start_time))

Twitter sent status 404 for URL: 1.1/friends/ids.json using parameters: (oauth_consumer_key=9HqYMGHtCok5QINP6n4YokR9U&oauth_nonce=849017805189761574&oauth_signature_method=HMAC-SHA1&oauth_timestamp=1532794708&oauth_token=973238402580664325-WiXwkPnQ5YzGrwTv0nr9omsjspfxyhT&oauth_version=1.0&user_id=1035276847&oauth_signature=Tw9j855ci6ljmpwoi8P4f%2Fa6bGE%3D)
details: {'errors': [{'code': 34, 'message': 'Sorry, that page does not exist.'}]}
Twitter sent status 401 for URL: 1.1/friends/ids.json using parameters: (oauth_consumer_key=9HqYMGHtCok5QINP6n4YokR9U&oauth_nonce=10680308026869856934&oauth_signature_method=HMAC-SHA1&oauth_timestamp=1532794952&oauth_token=973238402580664325-WiXwkPnQ5YzGrwTv0nr9omsjspfxyhT&oauth_version=1.0&user_id=1056443120&oauth_signature=xdKZLNk%2Ft2rpq20xjwfH2T695%2Bc%3D)
details: {'request': '/1.1/friends/ids.json', 'error': 'Not authorized.'}


KeyboardInterrupt: 

In [17]:
#check again if any of the values are empty, possibly caused by an error upon fetching the friend lists
for key, value in twitterUserList.items():
    if value == '':
        del twitterUserList[key]

List of friends are converted to a matrix of token counts using CountVectorizer.

Similarity matrix is by created using cosine similarities.

In [60]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from scipy.spatial.distance import pdist, squareform

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(list(twitterUserList.values()))
#Similarity Matrix
cs_users = squareform(pdist(X.toarray(), 'cosine'))

Clusters are created via same method used for Part 2

In [61]:
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
reducedX = svd.fit_transform(cs_users)  
kmeans = KMeans(n_clusters=5, init='k-means++',n_init=1000, verbose=0)
kmeans.fit(reducedX)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=1000, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

Alternatively, we can get better results using SimRank.

However, the algorithm takes to long to execute, and the user of dense matrix may cause memory error.

In [None]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(list(twitterUserList.values()))
graph = X.todense()
friends  = vectorizer.get_feature_names()
users = list(twitterUserList.keys())

Implemented by modifying https://github.com/littleq0903/simrank

In [None]:
user_sim = matrix(np.identity(len(users)))
friend_sim = matrix(np.identity(len(friends)))

def get_friends_num(user):
    q_i = users.index(user)
    return graph[q_i]

def get_friends(user):
    series = get_friends_num(user).tolist()[0]
    return [ friends[x] for x in range(len(series)) if series[x] > 0 ]

def user_simrank(q1, q2, C):
    if q1 == q2 : return 1
    prefix = C / (get_friends_num(q1).sum() * get_friends_num(q2).sum())
    postfix = 0
    for friend_i in get_friends(q1):
        for friend_j in get_friends(q2):
            i = friends.index(friend_i)
            j = friends.index(friend_j)
            postfix += friend_sim[i, j]
            
    return prefix * postfix


def simrank(C=0.8):
    global user_sim, friend_sim

    # users simrank
    new_user_sim = matrix(np.identity(len(users)))
    for qi in users:
        for qj in users:
            i = users.index(qi)
            j = users.index(qj)
            new_user_sim[i,j] = user_simrank(qi, qj, C)
    user_sim = new_user_sim
    return user_sim

In [None]:
X = simrank()
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
reducedX = svd.fit_transform(X)  
kmeans = KMeans(n_clusters=5, init='k-means++',n_init=1000, verbose=0)
kmeans.fit(reducedX)

# PART 5 - Recommending Interests

Existing <User, InterestSet> pairs, and <User, InterestSet> pairs to be recommended are read from the files below.

In [25]:
#Dictionary to store existing <User,Interests> pairs.
userInterestDict2 = {}
#Dictionary to store 6 Interests to be recommended to each user.
recommendDict = {}
#Dictionary to store <Interest, Categories> pairs. This will be filled in the next block.
interestCategoryDict2 = {}
with open('S22_preferences.tsv') as tsvfile:
    reader = csv.reader(tsvfile, delimiter="\t", quotechar='"')
    for row in reader:
        user = row[0]
        interest = row[1][8:]
        if row[0] in userInterestDict2:
            userInterestDict2[user].append(interest)
        else:
            userInterestDict2[user] = []
            userInterestDict2[user].append(category)
        if row[1][8:] not in interestCategoryDict2:
            interestCategoryDict2[category] = set()
with open('S23.tsv', encoding="utf8") as tsvfile:
    reader = csv.reader(tsvfile, delimiter="\t", quotechar='"')
    for row in reader:
        user = row[0]
        interest = row[1][8:]
        if row[0] in recommendDict:
            recommendDict[user].append(interest)
        else:
            recommendDict[user] = []
            recommendDict[user].append(interest)
        if row[1][8:] not in interestCategoryDict2:
            interestCategoryDict2[row[1][8:]] = set()

Read 'article_categories_en.nt' to get the categories, same as the first step. Data are stored in interestCategoryDict2

In [26]:
with open('article_categories_en.nt') as ntfile:
  reader = csv.reader(ntfile, delimiter=" ", quotechar='"')
  for row in reader:
    interestResult = re.search('<http://dbpedia.org/resource/(.*)>', row[0])
    categoryResult = re.search('<http://dbpedia.org/resource/Category:(.*)>', row[2])
    if interestResult != None and categoryResult != None:
        interest = interestResult.group(1)
        category = categoryResult.group(1)
        if any(x in category.lower() for x in ignore):
            continue
        else:
            if interest in interestCategoryDict2 or interest in recommendDict:
                interestCategoryDict2[interest].add(category)

In [27]:
#Dictionary to store <User, <Category, Frequency>> pairs.
userCategoryFrequency = {}
for user, interestList in userInterestDict2.items():
    #category frequencies for each user, will be added to userCategoryFrequency as a value
    categoryFrequencies = {}
    for interest in interestList:
        if interest in interestCategoryDict2:
            categories = interestCategoryDict2[interest]
            for category in categories:
                if category in categoryFrequencies:
                    categoryFrequencies[category] = categoryFrequencies[category] + 1
                else:
                    categoryFrequencies[category] = 1
    userCategoryFrequency[user] = categoryFrequencies

In [28]:
#Dictionary that holds <UserId, Interest> pairs for top 3 recommendations
top3Recommendations = {}
for user, interestList in recommendDict.items():
    #Dictionary that holds an Interest-Frequency pair. Frequency is the total frequency of categories for that interest
    localCategoryFrequencyDict = {}
    for interest in interestList:
        frequency = 0
        for category in interestCategoryDict2[interest]:
            if category in userCategoryFrequency[user]:
                frequency = frequency + userCategoryFrequency[user][category]
        localCategoryFrequencyDict[interest] = frequency
    #get top 3 interest with most common categories
    topInterests = list(dict(sorted(localCategoryFrequencyDict.items(), key=lambda x: x[1], reverse=True)[:3]).keys())
    top3Recommendations[user] = topInterests

In [29]:
top3Recommendations

{'100004041': ['Sean_Ringgold', 'Rafinha_Bastos', 'Oscar_Filho'],
 '100008460': ['Fall_Out_Boy', 'Shannon_Leto', 'Laura_Berg'],
 '100031697': ['Darren_Criss', 'Ian_Somerhalder', 'Rahul_Raj'],
 '1000465040': ['One_Direction', 'Niall_Horan', 'Liam'],
 '1000466221': ['Zendaya', 'Logan_Lerman', 'Brighton_Festival'],
 '1000497458': ['Fantasy_author', 'Andy_Straka', 'Allu_Arjun'],
 '1000818570': ['Nathan_Ballard', 'Campbell_Brown', 'Hugh_Laurie'],
 '1000874750': ['David_Faitelson', 'Andy_Vernon', 'Premier_League'],
 '100101137': ['Allison_Melnick', 'Russell_Peters', 'Jason_Strauss'],
 '100123499': ['Ed_Morrissey', 'InStyle', 'Dalai_Lama'],
 '100126414': ['Lil_Duval', 'Jackie_Kashian', 'Philip_DeFranco'],
 '100127782': ['Ariana_Grande', 'Mike_Posner', 'Justin_Bieber'],
 '1001865625': ['National_Basketball_Association', 'Keith_Jenkins', 'ESPN'],
 '1002338478': ['Jenni_Rivera', 'Judith_Hill', 'Paulina_Goto'],
 '1002370621': ['Lady_Gaga', 'Juanes', 'Paulina_Rubio'],
 '1002479173': ['John_Gruber'