# Exploring Fuzzy Rating Regularities for Managing Natural Noise in Cl. Rc.

In [1]:
import numpy as np
import pandas as pd
from time import time
from numpy import save, load
%run apriori_algorithm.ipynb

tic = time()
np.random.seed(1593523459)

### Datasets:
* MovieLens-100K
* MovieLens-1M

### Variables:
* ```N``` - top 'N' movies with the highest noise
* ```minSup``` - minimal support
* ```stratagy``` - option of stratagy (average or removal)
* ```file_path``` - file's path with Users transactions on Items.

In [2]:
TRIGGER = 'M'

if TRIGGER == 'K':
    N = 3                # N = 1, 3
    minSup = 30          # minSup = 20, 30
    stratagy = 'avg'     # stratagy = 'rmv', 'avg'
    file_path = '../datasets/filter_2/file_users_100K.txt'
    data = load('../datasets/main/train_set_100K.npy')
#     data = np.loadtxt('../datasets/main/ml-100k.data', skiprows=0, delimiter='\t').astype('int32')

elif TRIGGER == 'M':
    N = 5                # N = 1, 5
    minSup = 430         # minSup = 200, 430
    stratagy = 'avg'     # stratagy = 'rmv', 'avg'
    file_path = '../datasets/filter_2/file_users_1M.txt'
    data = load('../datasets/main/train_set_1M.npy')
#     data = np.loadtxt('../datasets/main/ml-1M.dat', skiprows=0, delimiter='::').astype('int32')

In [3]:
data

array([[        1,         1,         5, 978824268],
       [        1,        48,         5, 978824351],
       [        1,       150,         5, 978301777],
       ...,
       [     6040,      3703,         4, 964828575],
       [     6040,      3751,         4, 964828782],
       [     6040,      3819,         5, 963272166]])

In [4]:
# Convert to DataFrame and filter the dataset by 'user' and 'item'
data_csv = pd.DataFrame(data, columns = ['user', 'item', 'rating', 'timestamp'])
data_csv.sort_values(['user', 'item'], ascending=[True, True], inplace=True)
data_csv.reset_index(inplace=True)
data_csv.drop('index', axis=1, inplace=True)
data_csv

Unnamed: 0,user,item,rating,timestamp
0,1,1,5,978824268
1,1,48,5,978824351
2,1,150,5,978301777
3,1,260,4,978300760
4,1,527,5,978824195
...,...,...,...,...
900184,6040,3671,4,997454367
900185,6040,3683,4,960971696
900186,6040,3703,4,964828575
900187,6040,3751,4,964828782


In [5]:
users = np.unique(data[:, 0]).tolist() # list of unique users
items = np.unique(data[:, 1]).tolist() # list of unique items

In [6]:
n_u = len(users)     # number of users
n_m = len(items)     # number of movies
n_r = data.shape[0]  # number of ratings

print("USERS: {}\t ITEMS: {}\t RATINGS: {}".format(n_u, n_m, n_r))

USERS: 6040	 ITEMS: 3694	 RATINGS: 900189


In [7]:
# Make dict for users and movies, where udict[u_id] = index(0..942), and mdict[m_id] = index(0..1681)
udict = {}
for i, u_id in enumerate(users):
    udict[u_id] = i
    
mdict = {}
for i, m_id in enumerate(items):
    mdict[m_id] = i

In [8]:
# Empty matrix
matrix = np.zeros((n_u, n_m), dtype='int32')
noise_matrix = np.zeros((n_u, n_m), dtype='float32')

In [9]:
# Fill matrix
for i in range(n_r):
    u_id = data[i, 0]
    m_id = data[i, 1]
    r = data[i, 2]
    
    matrix[udict[u_id], mdict[m_id]] = int(r)

In [10]:
matrix

array([[5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 0, 0, 0]])

In [11]:
def fuzzy_transformation(value):
    if value == 5:
        ft = np.array([1, 0, 0])
    elif value == 4:
        ft = np.array([0.5, 0.5, 0])
    elif value == 3:
        ft = np.array([0, 1, 0])
    elif value == 2:
        ft = np.array([0, 0.5, 0.5])
    elif value == 1:
        ft = np.array([0, 0, 1])
        
    return ft

In [12]:
def calculate_noise_degree(regularities_ID_R):
    for u_id in users:
        for m_id in regularities_ID_R:
            r = matrix[udict[u_id], mdict[m_id[0]]]
            if r == 0: continue
                
            a = fuzzy_transformation(r)
            b = fuzzy_transformation(m_id[1])
            dist = sum(abs(a - b))
            
            noise_matrix[udict[u_id], mdict[m_id[0]]] += dist
            
    return noise_matrix

In [13]:
def top_N(noise_matrix, N):
    anomalousItemsPerUser = {}

    for u in users:
        temp = {}
        temp_sorted = {}
        temp_cropped = {}

        for i, t in enumerate(noise_matrix[udict[u]]):
            if t != 0:
                temp[i] = t

        # Reduce dictionary  
        if len(temp) >= N:
            s = 0
            temp_sorted = {i: value for i, value in sorted(temp.items(), key=lambda item: item[1], reverse=True)}
            for i, value in temp_sorted.items():
                s += 1
                temp_cropped[i] = value
                if s >= N:
                    break
            anomalousItemsPerUser[udict[u]] = temp_cropped
        else:
            anomalousItemsPerUser[udict[u]] = temp

    return anomalousItemsPerUser

In [14]:
def noise_correction(anomalousItemsPerUser):
    final_matrix = matrix.copy().astype('float32')
    
    for user, item_dic in anomalousItemsPerUser.items():
        average = round(data_csv[(data_csv['user'] == user + 1)]["rating"].mean(), 3)

        for item, deg in item_dic.items():
            final_matrix[user, item] = average      # Average strategy
#             final_matrix[user, item] = 0.0        # Removal strategy
            
    return final_matrix

In [15]:
def reshape_data(final_matrix, stratagy):
    
    if stratagy == 'avg':
        # Option 1. With Average Strategy
        data_sorted = np.array(data_csv)                                             # get sorted data
        new_ratings = final_matrix.reshape(-1, 1)                                    # reshape into one column
        new_ratings = new_ratings[new_ratings != 0].reshape(-1, 1).astype('float32') # remove empty ratings
        
        new_data = np.hstack((data_sorted[:,:2], new_ratings))                       # combine
        
    elif stratagy == 'rmv':
        # Option 2. With Removal Strategy
        new_data = []
        for u in users:
            for i in items:
                r = final_matrix[udict[u], mdict[i]]
                if r != 0:
                    new_data.append([u, i, r])

        new_data = np.array(new_data).astype('float32')
    
    return new_data

# Running

In [16]:
# Get regularities by Apriory
regularities = apriory_algorithm(THRESHOLD=minSup, file=file_path)

There are 17971 unique items, 227 of which are frequent
There are 25574 candidate pairs, 58 of which are frequent
There are 73 candidate triples, 2 of which are frequent
There are 0 candidate fourfold, 0 of which are frequent
--------------
['Item_1196: 5', 'Item_1210: 5', 'Item_260: 5']: 466
['Item_1196: 5', 'Item_1198: 5', 'Item_260: 5']: 477
Preprocessing time: 1 mins!


In [17]:
# Get single unique term, and them ID_Rating [ID, R]
single_unique_terms, regularities_ID_R = separator(regularities)
print(single_unique_terms, len(single_unique_terms))

{'Item_260: 5', 'Item_1198: 5', 'Item_1196: 5', 'Item_1210: 5'} 4


In [18]:
# Get matrix with degree of noises
noise_matrix = calculate_noise_degree(regularities_ID_R)

In [19]:
# Get Top 'N' Movies with the highist noises per User
anomalousItemsPerUser = top_N(noise_matrix, N)

In [20]:
anomalousItemsPerUser

{0: {252: 1.0},
 1: {1102: 1.0, 1114: 1.0},
 2: {1100: 1.0, 1114: 1.0},
 3: {1100: 2.0, 1114: 2.0},
 4: {},
 5: {1114: 2.0},
 6: {},
 7: {1114: 1.0},
 8: {1114: 1.0},
 9: {1114: 1.0},
 10: {1102: 1.0},
 11: {},
 12: {1114: 1.0},
 13: {},
 14: {252: 1.0, 1100: 1.0, 1102: 1.0, 1114: 1.0},
 15: {},
 16: {1114: 1.0},
 17: {},
 18: {252: 2.0, 1102: 1.0},
 19: {},
 20: {},
 21: {1100: 1.0, 1102: 1.0, 1114: 2.0},
 22: {},
 23: {},
 24: {},
 25: {252: 2.0, 1102: 2.0},
 26: {},
 27: {},
 28: {1100: 1.0, 1114: 2.0},
 29: {},
 30: {1114: 1.0},
 31: {},
 32: {},
 33: {},
 34: {1102: 1.0, 1114: 1.0},
 35: {1102: 1.0},
 36: {},
 37: {},
 38: {},
 39: {1100: 1.0},
 40: {},
 41: {},
 42: {},
 43: {},
 44: {},
 45: {},
 46: {},
 47: {},
 48: {1114: 1.0},
 49: {},
 50: {},
 51: {},
 52: {1114: 1.0},
 53: {252: 1.0},
 54: {},
 55: {},
 56: {1100: 1.0, 1102: 1.0},
 57: {1100: 1.0},
 58: {252: 2.0},
 59: {},
 60: {},
 61: {252: 1.0, 1100: 2.0, 1102: 1.0, 1114: 2.0},
 62: {},
 63: {1114: 1.0},
 64: {252: 1.

In [21]:
# Testing
k = 0
for u, dic in anomalousItemsPerUser.items():
    if len(dic) >= N: k += 1
print('Total:', k)

Total: 0


In [22]:
# Correct the Matrix
final_matrix = noise_correction(anomalousItemsPerUser)

In [23]:
new_data = reshape_data(final_matrix, stratagy)

In [24]:
# Save final matrix

if TRIGGER == 'K':
    file_matrix = '../datasets/filter_2/final_matrix_100K_{0}_{1}_{2}.npy'.format(str(N), str(minSup), stratagy)
    file_new_data = '../datasets/filter_2/dataset_100K_{0}_{1}_{2}.npy'.format(str(N), str(minSup), stratagy)
    
    save(file_new_data, new_data)
    save(file_matrix, final_matrix)

    new_data_load = load(file_new_data)
    
elif TRIGGER == 'M':
    file_matrix = '../datasets/filter_2/final_matrix_1M_{0}_{1}_{2}.npy'.format(str(N), str(minSup), stratagy)
    file_new_data = '../datasets/filter_2/dataset_1M_{0}_{1}_{2}.npy'.format(str(N), str(minSup), stratagy)
    
    save(file_new_data, new_data)
    save(file_matrix, final_matrix)

    new_data_load = load(file_new_data)

### Results

In [25]:
(new_data == new_data_load).all()

True

In [26]:
pd.DataFrame(new_data_load, columns = ['user', 'item', 'rating'])

Unnamed: 0,user,item,rating
0,1.0,1.0,5.0
1,1.0,48.0,5.0
2,1.0,150.0,5.0
3,1.0,260.0,4.2
4,1.0,527.0,5.0
...,...,...,...
900184,6040.0,3671.0,4.0
900185,6040.0,3683.0,4.0
900186,6040.0,3703.0,4.0
900187,6040.0,3751.0,4.0


In [27]:
print('Preprocessing time: {0} mins!'.format( int( time() - tic ) / 60.0) )

Preprocessing time: 2.2666666666666666 mins!


In [28]:
# Epoch: 15        RMSE: 0.822971                           baseline        1M
# Epoch: 16        RMSE: 0.821130      MAE: 0.641357        Filter 2        1M      Option 1     N=5      minSup=430 *
# Epoch: 15        RMSE: 0.821301      MAE: 0.640002        Filter 2        1M      Option 2     N=1      minSup=200 **

# Epoch: 16        RMSE: 0.821880      MAE: 0.641655        Filter 2        1M      Option 1     N=5      minSup=300 ******
# Epoch: 13        RMSE: 0.821791      MAE: 0.641570        Filter 2        1M      Option 1     N=10     minSup=300 *****
# Epoch: 15        RMSE: 0.821573      MAE: 0.641424        Filter 2        1M      Option 1     N=3      minSup=430 ****
# Epoch: 16        RMSE: 0.821130      MAE: 0.641357        Filter 2        1M      Option 1     N=5      minSup=430 *
# Epoch: 15        RMSE: 0.821223      MAE: 0.641457        Filter 2        1M      Option 1     N=10     minSup=430 **

# Epoch: 15        RMSE: 0.821301      MAE: 0.640002        Filter 2        1M      Option 2     N=1      minSup=200 ***
# Epoch: 18        RMSE: 0.824657      MAE: 0.641697        Filter 2        1M      Option 2     N=1      minSup=300 --
# Epoch: 15        RMSE: 0.822437      MAE: 0.640682        Filter 2        1M      Option 2     N=1      minSup=430 -
# Epoch: 16        RMSE: 0.824765      MAE: 0.642042        Filter 2        1M      Option 2     N=5      minSup=430 ---

# Epoch: 17        RMSE: 0.904360      MAE: 0.711333        baseline        100K
# Epoch: 6         RMSE: 0.900806      MAE: 0.707348        Filter 2        100K