# Incremental Collaborative Filtering (ICR) - Algorithm experiments

Movielens 100k data set: https://maxhalford.github.io/files/datasets/ml_100k.zip

In [1]:
import csv
# from pycallgraph import PyCallGraph
# from pycallgraph.output import GraphvizOutput

In [116]:
def stream(filepath, delimiter, max_cases=500):
    with open(filepath, 'r') as csvf:
        #load csv file data using csv library's dictionary reader
        csvReader = csv.DictReader(csvf, delimiter=delimiter)
        n=0
        for row in csvReader:
            if n == max_cases:
                break
            n+=1
            yield row['user'], row['item'], float(row['rating'])

In [222]:
class myICF():
    def __init__(self):
        self.user_ratings = {} # Dict to store ratings
        self.user_meta = {} # Dict to cache users info on number of ratings and average ratings
        self.user_pair_meta = {} # Dict to cache info on factors calculated for pairs of users (B, C, D, sum of ratings to co-rated items)
        self.corr_threshold=0.65
        self.high_rating=4
   
    def _new_user(self, user, item, rating):
        # initialize new user
        self.user_ratings[user] = {} # initializes user in ratings dict
        self.user_meta[user] = {'q': 0, 'avg.rating': 0} # initializes user in meta dict, assign number of items user has rated and avg rating of user

        # initializes pairs of existing users with new user in user_pair_meta dict
        for u in self.user_meta.keys(): 
            if u == user:
                continue
            self.user_pair_meta[(u, user)] = {'B': 0, 'C': 0, 'D': 0}
            self.user_pair_meta[(u, user)]['sum.co_ratings'] = {u: 0, user: 0, 'n': 0}
            
    def _new_rating(self, user, item, rating):
        # Submission of a new rating
        q = self.user_meta[user]['q'] # gets number of items user has rated
        A_avg_rating = self.user_meta[user]['avg.rating']
        new_avg = ( rating/( q+1 ) ) + ( q/( q+1 ) )*A_avg_rating # calculates new avg rating for active user
        delta_avg = new_avg - A_avg_rating # difference of user's previous and current avg rating
        
        for userB in self.user_meta.keys():
            if userB == user:
                continue
                        
            B_avg_rating = self.user_meta[userB]['avg.rating']
            
            if item in self.user_ratings[userB].keys():
                flag = 'By'
                # User B has rated the item                
                A_sum_coratings, B_sum_coratings, n_coratings, key = self._update_get_coratings(user, userB, item, rating, new_rating=True)
                B_rating = self.user_ratings[userB][item]
                
                e = ( rating-new_avg )*( B_rating-B_avg_rating ) - delta_avg*( B_sum_coratings - n_coratings*B_avg_rating )
                f = ( rating-new_avg )**2 + n_coratings*(delta_avg**2) - 2*delta_avg*( A_sum_coratings - n_coratings*A_avg_rating )
                g = ( B_rating-B_avg_rating )**2
                
            else:
                flag = 'Ny'
                # User B had not rated the item
                A_sum_coratings, B_sum_coratings, n_coratings, key = self._get_coratings(user, userB)
                
                e = - delta_avg*( B_sum_coratings - n_coratings*B_avg_rating )
                f = n_coratings*(delta_avg**2) - 2*delta_avg*( A_sum_coratings - n_coratings*A_avg_rating )
                g = 0
            
            for factor, increment in zip(['B', 'C', 'D'], [e, f, g]):
                self.user_pair_meta[key][factor] += increment
                
            if (user == '684') and (userB == '851'):
                print('new' + flag)
                print(f'e:{e}, f:{f}, g:{g}')
                print(f'A_sum_coratings: {A_sum_coratings}, B_sum_coratings: {B_sum_coratings}, n_coratings: {n_coratings}')
                print(f'rating: {rating}, new_avg: {new_avg}, A_avg_rating: {A_avg_rating}, delta_avg: {delta_avg}')
                print(f'B_avg_rating: {B_avg_rating}')
                print(f'C: {self.user_pair_meta[key]["C"]}')
                print(10*'-')
        
        self.user_ratings[user][item] = rating # updates rating given by user to item
        self.user_meta[user]['q'] += 1 # updates number of items user has rated
        self.user_meta[user]['avg.rating'] = new_avg # updates avg rating
        
        
    def _update_rating(self, user, item, rating):
        # Update of an existing rating
        delta_rating = rating - self.user_ratings[user][item] # difference of user's previous and current rating for item
        q = self.user_meta[user]['q'] # gets number of items user has rated
        A_avg_rating = self.user_meta[user]['avg.rating']
        new_avg = delta_rating/q + A_avg_rating # calculates new avg rating for active user
        delta_avg = new_avg - A_avg_rating # difference of user's previous and current avg rating
        
        for userB in self.user_meta.keys():
            if userB == user:
                continue
                            
            B_avg_rating = self.user_meta[userB]['avg.rating']
            
            if item in self.user_ratings[userB].keys():
                # User B has rated the item                
                A_sum_coratings, B_sum_coratings, n_coratings, key = self._update_get_coratings(user, userB, item, rating, new_rating=False)
                B_rating = self.user_ratings[userB][item]
                
                e = delta_rating*( B_rating-B_avg_rating ) - delta_avg*( B_sum_coratings - n_coratings*B_avg_rating )
                # calculus of f is not clear for both cases
                f = delta_rating**2 + 2*delta_rating*( rating-new_avg ) + n_coratings*delta_avg**2 - 2*delta_avg*( A_sum_coratings - n_coratings*A_avg_rating )
                g = 0
            else:
                # User B had not rated the item
                A_sum_coratings, B_sum_coratings, n_coratings, key = self._get_coratings(user, userB)
                
                e = - delta_avg*( B_sum_coratings - n_coratings*B_avg_rating )
                # calculus of f is not clear for both cases
                f = n_coratings*delta_avg**2 - 2*delta_avg*( A_sum_coratings - n_coratings*A_avg_rating )
                g = 0

            for factor, increment in zip(['B', 'C', 'D'], [e, f, g]):
                self.user_pair_meta[key][factor] += increment
            
            if (user == '684') and (userB == '851'):
                print('new' + flag)
                print(f'e:{e}, f:{f}, g:{g}')
                print(f'A_sum_coratings: {A_sum_coratings}, B_sum_coratings: {B_sum_coratings}, n_coratings: {n_coratings}')
                print(f'rating: {rating}, new_avg: {new_avg}, A_avg_rating: {A_avg_rating}, delta_avg: {delta_avg}')
                print(f'B_avg_rating: {B_avg_rating}')
                print(f'C: {self.user_pair_meta[key]["C"]}')
                print(10*'-')
                
        self.user_ratings[user][item] = rating # updates rating given by user to item
        self.user_meta[user]['q'] += 1 # updates number of items user has rated
        self.user_meta[user]['avg.rating'] = new_avg # updates avg rating        
        
    def _get_coratings(self, userA, userB):
        if (userB, userA) in self.user_pair_meta.keys():
            key = (userB, userA)
        else:
            key = (userA, userB)
        A_sum_coratings = self.user_pair_meta[key]['sum.co_ratings'][userA]
        B_sum_coratings = self.user_pair_meta[key]['sum.co_ratings'][userB]
        n_coratings = self.user_pair_meta[key]['sum.co_ratings']['n']
        return A_sum_coratings, B_sum_coratings, n_coratings, key
        
    def _update_get_coratings(self, userA, userB, item, rating, new_rating=True):
        if (userB, userA) in self.user_pair_meta.keys():
            key = (userB, userA)
        else:
            key = (userA, userB)
        if new_rating:            
            self.user_pair_meta[key]['sum.co_ratings'][userA] += rating
            A_sum_coratings = self.user_pair_meta[key]['sum.co_ratings'][userA]
            self.user_pair_meta[key]['sum.co_ratings'][userB] += self.user_ratings[userB][item]
            B_sum_coratings = self.user_pair_meta[key]['sum.co_ratings'][userB]
            self.user_pair_meta[key]['sum.co_ratings']['n'] += 1
            n_coratings = self.user_pair_meta[key]['sum.co_ratings']['n']
        else:
            self.user_pair_meta[key]['sum.co_ratings'][userA] += (rating - self.user_ratings[userA][item])
            A_sum_coratings = self.user_pair_meta[key]['sum.co_ratings'][userA]
            B_sum_coratings = self.user_pair_meta[key]['sum.co_ratings'][userB]
            n_coratings = self.user_pair_meta[key]['sum.co_ratings']['n']
                
        return A_sum_coratings, B_sum_coratings, n_coratings, key
    
    def run(self, user, item, rating):
        # initialize new user
        if user not in self.user_meta.keys(): 
            self._new_user(user, item, rating)
            
        # Submission of a new rating
        if item not in self.user_ratings[user].keys(): 
            self._new_rating(user, item, rating)
            
        # Update of an existing rating
        else: 
            self._update_rating(user, item, rating)
            
    def recommend(self, user, n_recs=10):
        item_count = {}
        for userB in self.user_meta.keys():
            if userB == user:
                continue
            if (userB, user) in self.user_pair_meta.keys():
                key = (userB, user)
            else:
                key = (user, userB)
            print(f'userB: {userB}')
            # if no co-ratings exist, just skip
            if self.user_pair_meta[key]['sum.co_ratings']['n'] == 0:
                print('no co-ratings')
                continue
            B = self.user_pair_meta[key]['B']
            C = self.user_pair_meta[key]['C']
            D = self.user_pair_meta[key]['D']
            
#         try:
            if (C == 0) or (D == 0):
                continue
            print(f'B:{B}, C:{C}, D:{D}')
            pearson_corr = B / ( ( C**(1/2) ) * ( D**(1/2) ) ) 
            print(f'Pearson Corr:{round(pearson_corr, 3)}')
            # FOR DEBBUGING
            if abs(pearson_corr) > 1.1:                
                print(f'abs(PEARSON CORR) > 1.1')

            if pearson_corr >= self.corr_threshold: # sometimes, C**(1/2) and D**(1/2) are complex, thus they cannot be compared
                for item in self.user_ratings[userB].keys():
                    if item in self.user_ratings[user].keys():
                        continue
                    if self.user_ratings[userB][item] >= self.high_rating:
                        item_count.setdefault(item, 0)
                        item_count[item] += 1
#         except:
#             continue
        sorted_item_count = sorted(item_count.items(), key=lambda x: -x[1])
        recommended_items = [i[0] for i in sorted_item_count[:n_recs]]
        print(recommended_items)
        print(10*'-') # DEBUG
        return recommended_items 

In [224]:
# f = ( rating-new_avg )**2 + n_coratings*(delta_avg**2) - 2*delta_avg*( A_sum_coratings - n_coratings*A_avg_rating )

# I'm having a problem with the initialization of the average ratings - when it's 0, then the delta is too high, and 'f' end up negative

In [223]:
icf = myICF()
for user, item, rating in stream(filepath='ml_100k.csv', delimiter='\t', max_cases=10000):    
    icf.run(user, item, rating)

newBy
e:-1.6901408450704216, f:-9.0, g:0.3173973417972621
A_sum_coratings: 3.0, B_sum_coratings: 4.0, n_coratings: 1
rating: 3.0, new_avg: 3.0, A_avg_rating: 0, delta_avg: 3.0
B_avg_rating: 3.4366197183098595
C: -9.0
----------
newNy
e:-0.28169014084507027, f:0.25, g:0
A_sum_coratings: 3.0, B_sum_coratings: 4.0, n_coratings: 1
rating: 4.0, new_avg: 3.5, A_avg_rating: 3.0, delta_avg: 0.5
B_avg_rating: 3.4366197183098595
C: -8.75
----------
newNy
e:0.0938967136150236, f:-0.1388888888888891, g:0
A_sum_coratings: 3.0, B_sum_coratings: 4.0, n_coratings: 1
rating: 3.0, new_avg: 3.333333333333333, A_avg_rating: 3.5, delta_avg: -0.16666666666666696
B_avg_rating: 3.4366197183098595
C: -8.88888888888889
----------
newNy
e:-0.23474178403755872, f:0.45138888888888906, g:0
A_sum_coratings: 3.0, B_sum_coratings: 4.0, n_coratings: 1
rating: 5.0, new_avg: 3.75, A_avg_rating: 3.333333333333333, delta_avg: 0.41666666666666696
B_avg_rating: 3.4366197183098595
C: -8.4375
----------
newBy
e:0.2063380281690

In [214]:
icf.user_pair_meta[('851', '684')]

{'B': -0.03053956504813507,
 'C': -6.342168061303237,
 'D': 6.4124181709978165,
 'sum.co_ratings': {'851': 31.0, '684': 31.0, 'n': 8}}

In [215]:
user='851'
icf.recommend(user)

userB: 259
B:-0.9339908273915597, C:5.572959684137984, D:1.7656250000000018
Pearson Corr:-0.298
userB: 712
B:3.1644031914588715, C:14.111047282887002, D:10.646070834516614
Pearson Corr:0.258
userB: 119
B:0.5919547914031298, C:41.54876083757516, D:30.75781656981458
Pearson Corr:0.017
userB: 640
B:5.2005084697146025, C:23.507890963525824, D:6.819023204513658
Pearson Corr:0.411
userB: 594
B:-0.24296519848657702, C:0.31713060751285577, D:0.1836734693877538
Pearson Corr:-1.007
userB: 23
B:3.045630892862906, C:20.97410289841518, D:15.593704449399944
Pearson Corr:0.168
userB: 276
B:49.85980862686567, C:67.94481092956764, D:75.08108921317897
Pearson Corr:0.698
userB: 913
userB: 532
B:0.02096288554741487, C:4.7701725888032565, D:2.8826392644672767
Pearson Corr:0.006
userB: 821
B:4.286023355700034, C:20.984476120954092, D:12.581590770196419
Pearson Corr:0.264
userB: 291
B:33.15976921380081, C:62.520700707779966, D:60.780495729458195
Pearson Corr:0.538
userB: 157
no co-ratings
userB: 817
B:-5.053

TypeError: type complex doesn't define __round__ method

In [151]:
for userB in icf.user_meta.keys():
    if userB == user:
        continue
    if (userB, user) in icf.user_pair_meta.keys():
        key = (userB, user)
    else:
        key = (user, userB)
    print(icf.user_pair_meta[key])

{'B': -1.4260110294117643, 'C': 2.24841857698962, 'D': 0.765625, 'sum.co_ratings': {'259': 3.0, '851': 5.0, 'n': 1}}
{'B': 0.0, 'C': 0.0, 'D': 0, 'sum.co_ratings': {'851': 0, '712': 0, 'n': 0}}
{'B': 1.0313199570729437, 'C': 4.277180569839734, 'D': 4.341535671919597, 'sum.co_ratings': {'851': 25.0, '119': 23.0, 'n': 6}}
{'B': 0.0, 'C': 0.0, 'D': 0, 'sum.co_ratings': {'851': 0, '640': 0, 'n': 0}}
{'B': 0.0, 'C': 0.0, 'D': 0, 'sum.co_ratings': {'851': 0, '594': 0, 'n': 0}}
{'B': 0.5617715918676519, 'C': 0.5499018318862084, 'D': 1.8304498269896197, 'sum.co_ratings': {'851': 5.0, '23': 4.0, 'n': 1}}
{'B': 1.1244675707464062, 'C': 4.727216221327047, 'D': 5.415224913494811, 'sum.co_ratings': {'851': 35.0, '276': 33.0, 'n': 9}}
{'B': 0.0, 'C': 0.0, 'D': 0, 'sum.co_ratings': {'851': 0, '913': 0, 'n': 0}}
{'B': 0.0, 'C': 0.0, 'D': 0, 'sum.co_ratings': {'851': 0, '532': 0, 'n': 0}}
{'B': -1.797015066005039, 'C': 5.745649027988533, 'D': 5.634920634920639, 'sum.co_ratings': {'851': 27.0, '821': 26

In [141]:
for u in icf.user_meta.keys():
    print(u)
    icf.recommend(user=u) # |pearson| correlations greater than |1|

259
['790', '11', '496', '56', '382', '180', '170', '134', '474', '770']
----------
851
[]
----------
712
['288', '325', '237', '475', '129', '147', '9', '844', '1012', '508']
----------
119
['876', '327', '288', '294', '748', '329', '358', '328', '1', '129']
----------
640
['181', '98', '97', '14', '15', '255', '286', '298', '185', '173']
----------
594
['12', '255', '286', '298', '185', '173', '108', '772', '928', '117']
----------
23
['288', '325', '237', '475', '129', '147', '9', '844', '1012', '508']
----------
276
[]
----------
913
[]
----------
532
['876', '327', '288', '294', '748', '329', '358', '328', '222', '1']
----------
821
['790', '11', '12', '496', '382', '170', '134', '474', '770', '175']
----------
291
['385', '79', '96', '173', '172', '421', '99', '174', '83', '95']
----------
157
[]
----------
817
['286', '322', '323', '275', '458', '100', '50', '741', '235', '813']
----------
195
[]
----------
