# A Regularity-Based Preprocessing Method for Collaborative Recommender Systems

In [1]:
import re
from time import time

import numpy as np
import pandas as pd
from numpy import save, load

from surprise import SVD
from surprise import KNNBaseline

from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

# Execute the companion notebook inside this kernel's namespace.
# NOTE(review): `apriory_algorithm`, called further down, is not defined in this
# notebook, so it is presumably provided by this run — confirm.
%run apriori_algorithm.ipynb

# Start of the wall-clock timer that the last cell reports.
tic = time()
# Seed NumPy's global RNG with a fixed value.
np.random.seed(1593523459)

### Datasets:
* MovieLens-100K
* MovieLens-1M

### Variables:
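* ```N``` - maximum number of highest-noise movies kept per user.
* ```diff``` - absolute difference between the original and the predicted rating that must be exceeded for the original rating to be replaced.
* ```minSup``` - minimum support threshold passed to the Apriori algorithm.
* ```file_path_users``` - path to the file with the users' transactions over items.
* ```file_path_items``` - path to the file with the items' transactions over users.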

In [2]:
# Dataset selector: 'K' -> MovieLens-100K, 'M' -> MovieLens-1M.
TRIGGER = 'M'

if TRIGGER == 'K':
    N = 60
    diff = 1.5
    minSup = 70
    file_path_users = '../datasets/filter_1/file_users_100K.txt'
    file_path_items = '../datasets/filter_1/file_items_100K.txt'
    data = load('../datasets/main/train_set_100K.npy')
    # Alternative source (full raw file instead of the stored train set):
#     data = np.loadtxt('../datasets/main/ml-100k.data', skiprows=0, delimiter='\t').astype('int32')

elif TRIGGER == 'M':
    N = 50
    diff = 1.5
    minSup = 430
    file_path_users = '../datasets/filter_1/file_users_1M.txt'
    file_path_items = '../datasets/filter_1/file_items_1M.txt'
    data = load('../datasets/main/train_set_1M.npy')
    # Alternative source (full raw file instead of the stored train set):
#     data = np.loadtxt('../datasets/main/ml-1M.dat', skiprows=0, delimiter='::').astype('int32')

else:
    # Fail here instead of with a NameError on `data` in a later cell.
    raise ValueError("Unknown TRIGGER {!r}; expected 'K' or 'M'".format(TRIGGER))

In [3]:
# Wrap the raw array in a DataFrame and order its rows by (user, item),
# renumbering the index 0..n-1 after the sort.
data_csv = (
    pd.DataFrame(data, columns=['user', 'item', 'rating', 'timestamp'])
    .sort_values(['user', 'item'], ascending=[True, True])
    .reset_index(drop=True)
)
data_csv

Unnamed: 0,user,item,rating,timestamp
0,1,1,5,978824268
1,1,48,5,978824351
2,1,150,5,978301777
3,1,260,4,978300760
4,1,527,5,978824195
...,...,...,...,...
900184,6040,3671,4,997454367
900185,6040,3683,4,960971696
900186,6040,3703,4,964828575
900187,6040,3751,4,964828782


In [4]:
# Sorted lists of the distinct IDs found in column 0 (users) and column 1 (items)
users, items = (np.unique(data[:, col]).tolist() for col in (0, 1))

In [5]:
# Dataset dimensions: distinct users, distinct items (movies), rating rows
n_u, n_m, n_r = len(users), len(items), data.shape[0]

print("USERS: {}\t ITEMS: {}\t RATINGS: {}".format(n_u, n_m, n_r))

USERS: 6040	 ITEMS: 3694	 RATINGS: 900189


In [6]:
# Map raw IDs to dense matrix indices:
#   udict[u_id] -> row index    (0 .. n_u - 1, in the order of `users`)
#   mdict[m_id] -> column index (0 .. n_m - 1, in the order of `items`)
udict = {u_id: row for row, u_id in enumerate(users)}
mdict = {m_id: col for col, m_id in enumerate(items)}

In [7]:
# Zero-initialised user x item rating matrix, plus a float accumulator of the
# same shape that will hold the per-rating noise degree
matrix = np.zeros((n_u, n_m), dtype='int32')
noise_matrix = np.zeros_like(matrix, dtype='float32')

In [8]:
# Write every (user, item, rating) row of `data` into its matrix cell
for u_id, m_id, r in data[:, :3]:
    matrix[udict[u_id], mdict[m_id]] = int(r)

In [9]:
# Display the filled rating matrix (cells that were never written stay 0)
matrix

array([[5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 0, 0, 0]])

#### Train Algorithm

In [10]:
# Load the built-in MovieLens dataset that matches TRIGGER.
# It gets its own name so the NumPy ratings array `data` loaded above is not
# overwritten; rebinding `data` here made the earlier cells fail when re-run.
surprise_data = Dataset.load_builtin({'K': 'ml-100k', 'M': 'ml-1m'}[TRIGGER])

# NOTE(review): the models are fitted on a random 90% split of the complete
# built-in dataset, not on the train_set_*.npy array loaded above. If the
# held-out part of that file-based split is used for evaluation later, the
# corrected ratings may be informed by test ratings — confirm this is intended.
trainset, testset = train_test_split(surprise_data, test_size=.1)

# Build the two candidate predictors.
knn = KNNBaseline(k=60)
svd = SVD(n_factors=10, n_epochs=30)

# Fit data
knn.fit(trainset)
svd.fit(trainset)

# Test
knn_predictions = knn.test(testset)
svd_predictions = svd.test(testset)

# Then compute RMSE
accuracy.rmse(knn_predictions)
accuracy.rmse(svd_predictions)

# Sanity check: predict one specific (user, item) pair with each model.
# Raw IDs are passed as strings, the same way predict_rating() passes them.
knn_pred = knn.predict(str(196), str(302), r_ui=4, verbose=True)
svd_pred = svd.predict(str(196), str(302), r_ui=4, verbose=True)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8926
RMSE: 0.8580
user: 196        item: 302        r_ui = 4.00   est = 3.84   {'actual_k': 60, 'was_impossible': False}
user: 196        item: 302        r_ui = 4.00   est = 3.77   {'was_impossible': False}


# Running

In [11]:
def predict_rating(algo, user, item):
    """Predict the rating for a matrix cell with a fitted Surprise algorithm.

    Parameters
    ----------
    algo : fitted algorithm exposing ``predict(uid, iid)``.
    user : int
        Row index of the user in ``matrix``.
    item : int
        Column index of the item in ``matrix``.

    Returns
    -------
    float
        The estimated rating, rounded to 3 decimals.
    """
    # `udict` / `mdict` are built by enumerating `users` / `items`, so those
    # lists are the inverse mappings. Indexing them replaces the two list
    # materialisations and the linear `.index()` scan done on every call.
    u_id = users[user]
    i_id = items[item]

    # Raw IDs are passed as strings, as in the training cell's sample
    # prediction. `est` is the 4th field of Surprise's Prediction namedtuple,
    # previously read as `[3]`.
    return round(algo.predict(str(u_id), str(i_id)).est, 3)

In [12]:
def reg_separator(regularities):
    """Split one regularity into its terms and their numeric parts.

    Parameters
    ----------
    regularities : sequence
        Its first element is the textual form of the term list, e.g.
        ``"['Item_1196: 5', 'Item_260: 5']"``.

    Returns
    -------
    tuple
        ``(single_terms, regularities_ID, regularities_ID_R)``:
        the term strings, the ID of each term, and ``[ID, rating]`` per term.
    """
    # Raw strings: the previous non-raw literals relied on the invalid escape
    # sequences `\[` and `\]`, which newer Python versions warn about.
    # NOTE(review): a key that is not wrapped in "['...']" (for example a bare
    # "User_3824: 3") produces no terms here, because the [1:-1] slice then
    # drops the only element — confirm what apriory_algorithm returns for
    # single-term regularities.
    single_terms = re.split(r"\['|', '|'\]", regularities[0])[1:-1]

    regularities_ID = []
    regularities_ID_R = []
    for term in single_terms:
        # "Item_1196: 5" -> ['', '1196', '5'] -> [1196, 5]
        ID_R = [int(part) for part in re.split(r'Item_|User_|: ', term)[1:]]
        regularities_ID.append(ID_R[0])
        regularities_ID_R.append(ID_R)

    return single_terms, regularities_ID, regularities_ID_R

In [13]:
def users_items_exist(n_id, ID, data_type):
    """Check that every ID of a regularity occurs together with ``n_id``.

    Parameters
    ----------
    n_id : user ID (``data_type='users'``) or item ID (``data_type='items'``).
    ID : iterable
        Item IDs (for 'users') or user IDs (for 'items') that must all be present.
    data_type : {'users', 'items'}

    Returns
    -------
    bool
        True when every element of ``ID`` appears in the rows of ``data_csv``
        belonging to ``n_id`` (also True for an empty ``ID``).

    Raises
    ------
    ValueError
        For an unknown ``data_type`` (previously an unbound-local error).
    """
    if data_type == 'users':
        key_col, value_col = 'user', 'item'
    elif data_type == 'items':
        key_col, value_col = 'item', 'user'
    else:
        raise ValueError("data_type must be 'users' or 'items', got {!r}".format(data_type))

    # A set gives constant-time membership checks instead of scanning a list per ID.
    present = set(data_csv.loc[data_csv[key_col] == n_id, value_col].tolist())

    return all(k in present for k in ID)

In [14]:
def calculate_noise_degree(n_id, ID_R, data_type):
    """Add rating deviations from a regularity to the global ``noise_matrix``.

    For 'users', ``n_id`` is a user ID and each ``ID_R`` entry is
    ``[item_id, rating]``; for 'items', ``n_id`` is an item ID and each entry
    is ``[user_id, rating]``. Wherever the stored rating in ``matrix`` differs
    from the regularity's rating, the absolute difference is added to the
    matching ``noise_matrix`` cell. Any other ``data_type`` changes nothing.

    Returns the (mutated) global ``noise_matrix``.
    """
    # Lazily translate each entry into (row, column, expected rating).
    if data_type == 'users':
        cells = ((udict[n_id], mdict[pair[0]], pair[1]) for pair in ID_R)
    elif data_type == 'items':
        cells = ((udict[pair[0]], mdict[n_id], pair[1]) for pair in ID_R)
    else:
        cells = ()

    for row, col, expected in cells:
        actual = matrix[row, col]
        if expected != actual:
            noise_matrix[row, col] += abs(expected - actual)

    return noise_matrix

In [15]:
def get_noise_matrix(regularities_users, regularities_items):
    """Compute the noise degree of every rating from the mined regularities.

    Parameters
    ----------
    regularities_users : iterable
        Regularities over items; each is checked against every user.
    regularities_items : iterable
        Regularities over users; each is checked against every item.

    Returns
    -------
    The global ``noise_matrix``, refilled in place.
    """
    # Start from zero so re-running the calling cell recomputes the noise
    # instead of adding a second pass on top of the previous one.
    noise_matrix.fill(0)

    passes = (
        (regularities_users, users, 'users'),
        (regularities_items, items, 'items'),
    )
    for regularities, candidates, data_type in passes:
        for regularity in regularities:
            _, ID, ID_R = reg_separator(regularity)
            for n_id in candidates:
                # Only score users/items that contain every term of the regularity.
                if users_items_exist(n_id, ID, data_type):
                    calculate_noise_degree(n_id, ID_R, data_type)

    return noise_matrix

In [16]:
def top_N(noise_matrix, N):
    """Select, per user, at most ``N`` items with the highest noise degree.

    Parameters
    ----------
    noise_matrix : 2-D array-like
        One row per user index, one column per item index.
    N : int
        Maximum number of items kept per user; values <= 0 keep nothing.

    Returns
    -------
    dict
        ``{user_index: {item_index: noise}}`` holding only non-zero entries.
        A user with at least ``N`` noisy items gets the ``N`` largest, in
        descending order (ties keep ascending item index); otherwise all of
        that user's noisy items are returned in item-index order.
    """
    limit = max(int(N), 0)
    anomalousItemsPerUser = {}

    # Row indices are exactly the values udict assigns to `users`, so the
    # function can rely on its argument alone instead of those globals.
    for user_idx in range(len(noise_matrix)):
        row = noise_matrix[user_idx]
        # Jump straight to the non-zero cells instead of testing every cell in Python.
        noisy = [(int(i), row[i]) for i in np.flatnonzero(row)]

        if len(noisy) >= limit:
            # list.sort is stable, so equal noise values keep ascending item order.
            noisy.sort(key=lambda pair: pair[1], reverse=True)
            noisy = noisy[:limit]

        anomalousItemsPerUser[user_idx] = dict(noisy)

    return anomalousItemsPerUser

In [17]:
def noise_correction(anomalousItemsPerUser, diff, algo):
    """Replace anomalous ratings with predictions that deviate strongly enough.

    For every (user index, item index) listed in ``anomalousItemsPerUser`` the
    rating is predicted with ``algo``; when the absolute difference to the
    stored rating exceeds ``diff``, the prediction is written into a float32
    copy of the global ``matrix``. That copy is returned; ``matrix`` itself is
    left untouched.
    """
    # astype() returns a new array, so the global matrix is not modified.
    corrected = matrix.astype('float32')

    for user, noisy_items in anomalousItemsPerUser.items():
        for item in noisy_items:
            stored = matrix[user, item]
            predicted = predict_rating(algo, user, item)
            if abs(stored - predicted) > diff:
                corrected[user, item] = predicted

    return corrected

In [18]:
def reshape_data(final_matrix):
    """Turn the corrected matrix back into (user, item, rating) rows.

    Parameters
    ----------
    final_matrix : 2-D array
        Corrected ratings, indexed like the global ``matrix``.

    Returns
    -------
    numpy.ndarray
        One row per row of ``data_csv``: its user ID, its item ID and the
        float32 rating read from ``final_matrix`` for that pair.
    """
    data_sorted = np.array(data_csv)  # rows ordered by (user, item)

    # Look each rating up by its own (row, column) position. The previous
    # approach took "all non-zero cells in row-major order", which only lines
    # up with data_csv when no (user, item) pair is duplicated and no kept
    # rating is 0.
    rows = data_csv['user'].map(udict).to_numpy()
    cols = data_csv['item'].map(mdict).to_numpy()
    new_ratings = final_matrix[rows, cols].reshape(-1, 1).astype('float32')

    return np.hstack((data_sorted[:, :2], new_ratings))

In [19]:
# Mine the frequent regularities with Apriori: first from the users' transaction
# file, then from the items' transaction file, using the same minimum support
regularities_users, regularities_items = (
    apriory_algorithm(THRESHOLD=minSup, file=path)
    for path in (file_path_users, file_path_items)
)

There are 17971 unique items, 227 of which are frequent
There are 25574 candidate pairs, 58 of which are frequent
There are 73 candidate triples, 2 of which are frequent
There are 0 candidate fourfold, 0 of which are frequent
--------------
['Item_1196: 5', 'Item_1210: 5', 'Item_260: 5']: 466
['Item_1196: 5', 'Item_1198: 5', 'Item_260: 5']: 477
Preprocessing time: 1 mins!
There are 29184 unique items, 22 of which are frequent
There are 229 candidate pairs, 0 of which are frequent
--------------
User_3824: 3: 431
User_4085: 4: 437
User_1015: 4: 440
User_1088: 4: 444
User_2063: 3: 453
User_4425: 3: 461
User_1941: 3: 464
User_424: 4: 469
User_3272: 4: 469
User_678: 4: 472
User_4277: 5: 513
User_889: 3: 513
User_4808: 3: 544
User_3224: 4: 552
User_1181: 3: 606
User_1680: 4: 638
User_5831: 4: 646
User_4725: 3: 652
User_4169: 4: 658
User_4169: 3: 674
User_4277: 4: 781
User_3618: 3: 833
Preprocessing time: 0 mins!


In [20]:
# Compute the noise degree of every rating from the user and item regularities
noise_matrix = get_noise_matrix(regularities_users, regularities_items)

In [31]:
# Predictor used to correct noisy ratings: 'knn' or 'svd'.
ALGORITHM = 'knn'

# NOTE: each branch overrides the N and diff values assigned in the dataset
# configuration cell, so those earlier values are not the ones used below.
if ALGORITHM == 'knn':
    N = 5
    diff = 2.0
    algo = knn

elif ALGORITHM == 'svd':
    N = 50
    diff = 1.5
    algo = svd

else:
    # Fail here instead of with a NameError on `algo` (or a stale `algo`) later.
    raise ValueError("Unknown ALGORITHM {!r}; expected 'knn' or 'svd'".format(ALGORITHM))

In [32]:
# Get the top 'N' movies with the highest noise per user
anomalousItemsPerUser = top_N(noise_matrix, N)

In [33]:
# Inspect the selection: {user index: {item index: noise degree}}
anomalousItemsPerUser

{0: {},
 1: {},
 2: {1100: 2.0, 1114: 1.0},
 3: {1100: 3.0, 1114: 2.0},
 4: {},
 5: {},
 6: {},
 7: {},
 8: {},
 9: {1114: 1.0},
 10: {},
 11: {},
 12: {},
 13: {},
 14: {252: 2.0, 1100: 2.0, 1102: 1.0, 1114: 1.0},
 15: {},
 16: {1114: 1.0},
 17: {},
 18: {252: 2.0, 1102: 1.0},
 19: {},
 20: {},
 21: {},
 22: {},
 23: {},
 24: {},
 25: {},
 26: {},
 27: {},
 28: {1100: 2.0, 1114: 3.0},
 29: {},
 30: {},
 31: {},
 32: {},
 33: {},
 34: {},
 35: {1102: 1.0},
 36: {},
 37: {},
 38: {},
 39: {},
 40: {},
 41: {},
 42: {},
 43: {},
 44: {},
 45: {},
 46: {},
 47: {},
 48: {},
 49: {},
 50: {},
 51: {},
 52: {1114: 1.0},
 53: {},
 54: {},
 55: {},
 56: {},
 57: {1100: 2.0},
 58: {},
 59: {},
 60: {},
 61: {252: 2.0, 1100: 4.0, 1102: 1.0, 1114: 2.0},
 62: {},
 63: {},
 64: {252: 2.0, 1102: 1.0, 1114: 1.0},
 65: {},
 66: {},
 67: {},
 68: {},
 69: {},
 70: {},
 71: {},
 72: {},
 73: {},
 74: {},
 75: {},
 76: {},
 77: {},
 78: {},
 79: {},
 80: {1100: 2.0, 1114: 2.0},
 81: {},
 82: {},
 83: {}

In [34]:
# Sanity check: how many users have exactly N anomalous items selected
k = sum(len(dic) == N for dic in anomalousItemsPerUser.values())
print('Total:', k)

Total: 0


In [35]:
# Replace selected ratings whose prediction differs from the stored value by more than `diff`
final_matrix = noise_correction(anomalousItemsPerUser, diff, algo)

In [36]:
# Flatten the corrected matrix back into (user, item, rating) rows
new_data = reshape_data(final_matrix)

In [37]:
# Save the corrected matrix and the corrected rating list; file names carry
# the dataset size and the parameters used (N, diff, minSup, ALGORITHM).
size_tag = {'K': '100K', 'M': '1M'}.get(TRIGGER)
if size_tag is None:
    # Previously an unknown TRIGGER skipped saving silently.
    raise ValueError("Unknown TRIGGER {!r}; expected 'K' or 'M'".format(TRIGGER))

file_matrix = '../datasets/filter_1/final_matrix_{0}_{1}_{2}_{3}_{4}.npy'.format(size_tag, N, diff, minSup, ALGORITHM)
file_new_data = '../datasets/filter_1/dataset_{0}_{1}_{2}_{3}_{4}.npy'.format(size_tag, N, diff, minSup, ALGORITHM)

save(file_new_data, new_data)
save(file_matrix, final_matrix)

# Read the dataset back so the following cell can verify the round trip.
new_data_load = load(file_new_data)

### Results

In [38]:
# Verify that the array read back from disk equals the one that was saved
(new_data == new_data_load).all()

True

In [39]:
# Show the reloaded dataset as a table (the timestamp column is not part of it)
pd.DataFrame(new_data_load, columns = ['user', 'item', 'rating'])

Unnamed: 0,user,item,rating
0,1.0,1.0,5.0
1,1.0,48.0,5.0
2,1.0,150.0,5.0
3,1.0,260.0,4.0
4,1.0,527.0,5.0
...,...,...,...
900184,6040.0,3671.0,4.0
900185,6040.0,3683.0,4.0
900186,6040.0,3703.0,4.0
900187,6040.0,3751.0,4.0


In [40]:
# Elapsed wall-clock time since `tic`, in minutes. The previous form applied
# int() to the seconds before dividing, printing values such as 10.733333333333333.
print('Preprocessing time: {0} mins!'.format(round((time() - tic) / 60.0, 2)))

Preprocessing time: 10.733333333333333 mins!
