In [1]:
import pandas as pd
import matplotlib.pyplot as pyplot
import numpy as np

In [2]:
np.int = int
np.float = float 
np.bool = bool

## Import Dataset

In [26]:
URM_all_dataframe = pd.read_csv('data_train.csv', 
                                sep=",", 
                                header= 0, 
                                dtype={0:int, 1:int, 2:float},
                                engine='python')

URM_all_dataframe.columns = ["UserID", "ItemID", "Interaction"]

In [27]:
# Move to sparse format
import scipy.sparse as sps

URM_all = sps.coo_matrix((URM_all_dataframe["Interaction"].values, 
                          (URM_all_dataframe["UserID"].values, URM_all_dataframe["ItemID"].values))) ## .values --> numpy array, df[..] --> pd series

In [28]:
userID_unique = URM_all_dataframe["UserID"].unique()
itemID_unique = URM_all_dataframe["ItemID"].unique()

n_users = len(userID_unique)
n_items = len(itemID_unique)
n_interactions = len(URM_all_dataframe)

print ("Number of items\t {}, Number of users\t {}".format(n_items, n_users))
print ("Max ID items\t {}, Max Id users\t {}\n".format(max(itemID_unique), max(userID_unique)))
print ("Average interactions per user {:.2f}".format(n_interactions/n_users))
print ("Average interactions per item {:.2f}\n".format(n_interactions/n_items))

print ("Sparsity {:.2f} %".format((1-float(n_interactions)/(n_items*n_users))*100))

Number of items	 22222, Number of users	 12638
Max ID items	 22347, Max Id users	 13024

Average interactions per user 37.88
Average interactions per item 21.54

Sparsity 99.83 %


## Remove idle users/items

In [29]:
mapped_id, original_id = pd.factorize(URM_all_dataframe["UserID"].unique())
user_original_ID_to_index = pd.Series(mapped_id, index=original_id)

In [30]:
for i in range(int(np.ceil(len(user_original_ID_to_index)/10))):
    print(user_original_ID_to_index[10*i:10*(i+1)])

1     0
2     1
3     2
4     3
5     4
6     5
7     6
8     7
9     8
10    9
dtype: int64
11    10
12    11
13    12
14    13
15    14
16    15
17    16
18    17
19    18
20    19
dtype: int64
21    20
23    21
24    22
25    23
26    24
27    25
28    26
29    27
30    28
31    29
dtype: int64
32    30
33    31
34    32
35    33
36    34
37    35
38    36
39    37
40    38
41    39
dtype: int64
42    40
43    41
44    42
45    43
46    44
47    45
48    46
49    47
50    48
51    49
dtype: int64
52    50
53    51
54    52
55    53
56    54
57    55
58    56
59    57
61    58
62    59
dtype: int64
63    60
64    61
66    62
67    63
68    64
69    65
70    66
71    67
72    68
73    69
dtype: int64
74    70
75    71
76    72
77    73
78    74
79    75
80    76
81    77
82    78
83    79
dtype: int64
84    80
85    81
86    82
87    83
88    84
89    85
90    86
91    87
92    88
93    89
dtype: int64
94     90
95     91
96     92
97     93
98     94
99     95
100    96
101    97
102

In [31]:
index_to_user_original_ID = pd.Series(original_id, index= mapped_id)

for i in range(int(np.ceil(len(index_to_user_original_ID)/10))):
    print(index_to_user_original_ID[10*i:10*(i+1)])

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
dtype: int32
10    11
11    12
12    13
13    14
14    15
15    16
16    17
17    18
18    19
19    20
dtype: int32
20    21
21    23
22    24
23    25
24    26
25    27
26    28
27    29
28    30
29    31
dtype: int32
30    32
31    33
32    34
33    35
34    36
35    37
36    38
37    39
38    40
39    41
dtype: int32
40    42
41    43
42    44
43    45
44    46
45    47
46    48
47    49
48    50
49    51
dtype: int32
50    52
51    53
52    54
53    55
54    56
55    57
56    58
57    59
58    61
59    62
dtype: int32
60    63
61    64
62    66
63    67
64    68
65    69
66    70
67    71
68    72
69    73
dtype: int32
70    74
71    75
72    76
73    77
74    78
75    79
76    80
77    81
78    82
79    83
dtype: int32
80    84
81    85
82    86
83    87
84    88
85    89
86    90
87    91
88    92
89    93
dtype: int32
90     94
91     95
92     96
93     97
94     98
95     99
96    100
97    101
98 

In [32]:
original_user_ID = 292
print("New index for user {} is {}".format(original_user_ID, user_original_ID_to_index[original_user_ID]))

New index for user 292 is 282


In [33]:
mapped_user_ID = 282
print("Original index for user {} is {}".format(mapped_user_ID, index_to_user_original_ID[mapped_user_ID]))

Original index for user 282 is 292


In [34]:
mapped_id, original_id = pd.factorize(URM_all_dataframe["ItemID"].unique())
item_original_ID_to_index = pd.Series(mapped_id, index=original_id)

In [35]:
for i in range(int(np.ceil(len(item_original_ID_to_index)/10))):
    print(item_original_ID_to_index[10*i:10*(i+1)])

7      0
15     1
16     2
133    3
161    4
187    5
205    6
222    7
237    8
354    9
dtype: int64
377     10
386     11
429     12
521     13
523     14
544     15
699     16
879     17
893     18
1005    19
dtype: int64
1108    20
1132    21
1191    22
1217    23
1935    24
2462    25
2612    26
3224    27
3901    28
3963    29
dtype: int64
4034     30
4145     31
4303     32
4470     33
7329     34
8637     35
11285    36
12252    37
16066    38
16452    39
dtype: int64
17445    40
19973    41
20884    42
22110    43
1        44
10       45
17       46
24       47
25       48
27       49
dtype: int64
42     50
87     51
97     52
107    53
112    54
156    55
234    56
267    57
461    58
490    59
dtype: int64
646     60
652     61
690     62
744     63
790     64
820     65
835     66
866     67
1113    68
1180    69
dtype: int64
1227    70
1354    71
1372    72
1707    73
1863    74
2455    75
4545    76
5568    77
6278    78
160     79
dtype: int64
847     80
1171    81
2408

In [36]:
index_to_item_original_ID = pd.Series(original_id, index= mapped_id)

In [37]:
original_item_ID = 292
print("New index for item {} is {}".format(original_item_ID, item_original_ID_to_index[original_item_ID]))

New index for item 292 is 327


In [38]:
mapped_item_ID = 327
print("Original index for user {} is {}".format(mapped_item_ID, index_to_item_original_ID[mapped_item_ID]))

Original index for user 327 is 292


In [39]:
URM_all_dataframe["UserID"] = URM_all_dataframe["UserID"].map(user_original_ID_to_index)
URM_all_dataframe["ItemID"] = URM_all_dataframe["ItemID"].map(item_original_ID_to_index)

In [40]:
userID_unique = URM_all_dataframe["UserID"].unique()
itemID_unique = URM_all_dataframe["ItemID"].unique()

n_users = len(userID_unique)
n_items = len(itemID_unique)
n_interactions = len(URM_all_dataframe)

print ("Number of items\t {}, Number of users\t {}".format(n_items, n_users))
print ("Max ID items\t {}, Max Id users\t {}\n".format(max(itemID_unique), max(userID_unique)))
print ("Average interactions per user {:.2f}".format(n_interactions/n_users))
print ("Average interactions per item {:.2f}\n".format(n_interactions/n_items))

print ("Sparsity {:.2f} %".format((1-float(n_interactions)/(n_items*n_users))*100))

Number of items	 22222, Number of users	 12638
Max ID items	 22221, Max Id users	 12637

Average interactions per user 37.88
Average interactions per item 21.54

Sparsity 99.83 %


In [41]:
import scipy.sparse as sps

URM_all_preprocessed = sps.coo_matrix((URM_all_dataframe["Interaction"].values, 
                          (URM_all_dataframe["UserID"].values, URM_all_dataframe["ItemID"].values)))

URM_all_preprocessed

<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in COOrdinate format>

In [42]:
URM_all_preprocessed.tocsr()

<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in Compressed Sparse Row format>

In [43]:
from scipy.sparse import csr_matrix, save_npz

URM_all_preprocessed = URM_all_preprocessed.tocsr()
save_npz('URM_all_preprocessed.npz', URM_all_preprocessed)

In [45]:
index_to_user_original_ID.to_csv('mapping_user.csv')
index_to_item_original_ID.to_csv('mapping_item.csv')

In [None]:
s= pd.read_csv('mapping_item.csv', index_col=0)
s = s.squeeze()
s

In [None]:
index_to_item_original_ID

In [None]:
from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all_preprocessed, train_percentage = 0.80)

In [None]:
save_npz('URM_train.npz', URM_train)
save_npz('URM_validation.npz', URM_validation)

## Create 5Fold datasets

In [3]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample_kFold
from scipy.sparse import csr_matrix, load_npz

URM_all = load_npz('C:\\Users\\melan\\shared-folder\\RecSys\\RecSys-Competition\\KFold data\\URM_all_preprocessed.npz')

In [4]:
URM_list = split_train_in_two_percentage_global_sample_kFold(URM_all, n_folds= 5, train_percentage = 0.80)

URM_all contains 478730 interactions.
There will be 382984 train interactions
and 95746 validation interactions,
for a total of 478730


In [5]:
len(URM_list)

10

In [6]:
URM_all


<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in Compressed Sparse Row format>

In [17]:
URM_list[9]

<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 95746 stored elements in Compressed Sparse Row format>

In [20]:
for fold in range(1):
    URM_train_name = "URM_train_uuu" + str(fold + 1)
    URM_validation_name = "URM_validation_uuu" + str(fold + 1)

    print("Saving fold", fold, "...")
    np.savez(URM_train_name + '.npz', URM_list[2 * fold])
    np.savez(URM_validation_name + '.npz', URM_list[2 * fold + 1])

Saving fold 0 ...


##  Visualize n_interactions distribution

### Original data

In [None]:
from scipy.sparse import csr_matrix, load_npz

URM_train = load_npz('C:\\Users\\melan\\shared-folder\\RecSys\\RecSys-Competition\\PreprocessedDataset\\URM_train.npz')
URM_validation = load_npz('C:\\Users\\melan\\shared-folder\\RecSys\\RecSys-Competition\\PreprocessedDataset\\URM_validation.npz')

In [None]:
URM_all = URM_train + URM_validation

In [None]:
user_activity = np.ediff1d(URM_all.tocsr().indptr)


pyplot.plot(user_activity, 'ro')
pyplot.ylabel('Num Interactions ')
pyplot.xlabel('Sorted User')
pyplot.show()

In [None]:
import matplotlib.pyplot as plt

# Count the occurrences of each unique value
unique_values, counts = np.unique(user_activity, return_counts=True)

# Create a bar chart
plt.bar(unique_values, counts, color='blue', edgecolor='black', alpha=0.7)

plt.title('User Activity Distribution')
plt.xlabel('Number of Interactions')
plt.ylabel('Number of Users')
plt.show()

In [None]:
# Define the number of groups (k)
k = 20

# Calculate percentiles to determine group boundaries
percentiles = np.linspace(0, 100, k + 1)
boundaries = np.percentile(user_activity, percentiles).astype(int)

plt.figure(figsize=(12, 6))

# Create a bar chart with the groups
plt.hist(user_activity, bins=boundaries, color='blue', edgecolor='black', alpha=0.7)

plt.title(f'User Activity Distribution with {k} Groups')
plt.xlabel('Number of Interactions')
plt.ylabel('Number of Users')
plt.show()

In [None]:
boundaries

In [None]:
unique_values

In [None]:
counts

### Preprocessed data

In [None]:
from scipy.sparse import csr_matrix, load_npz

URM_train_preprocessed = load_npz('C:\\Users\\melan\\shared-folder\\RecSys\\RecSys-Competition\\PreprocessedDataset\\URM_train_preprocessed.npz')
URM_validation_preprocessed = load_npz('C:\\Users\\melan\\shared-folder\\RecSys\\RecSys-Competition\\PreprocessedDataset\\URM_validation_preprocessed.npz')

In [None]:
URM_all_preprocessed = URM_train_preprocessed + URM_validation_preprocessed

In [None]:
user_activity = np.ediff1d(URM_all_preprocessed.tocsr().indptr)


pyplot.plot(user_activity, 'ro')
pyplot.ylabel('Num Interactions ')
pyplot.xlabel('Sorted User')
pyplot.show()

In [None]:
import matplotlib.pyplot as plt

# Count the occurrences of each unique value
unique_values, counts = np.unique(user_activity, return_counts=True)

# Create a bar chart
plt.bar(unique_values, counts, color='blue', edgecolor='black', alpha=0.7)

plt.title('User Activity Distribution')
plt.xlabel('Number of Interactions')
plt.ylabel('Number of Users')
plt.show()

In [None]:
# Define the number of groups (k)
k = 20

# Calculate percentiles to determine group boundaries
percentiles = np.linspace(0, 100, k + 1)
boundaries = np.percentile(user_activity, percentiles).astype(int)

plt.figure(figsize=(12, 6))

# Create a bar chart with the groups
plt.hist(user_activity, bins=boundaries, color='blue', edgecolor='black', alpha=0.7)

plt.title(f'User Activity Distribution with {k} Groups')
plt.xlabel('Number of Interactions')
plt.ylabel('Number of Users')
plt.show()

In [None]:
boundaries

In [None]:
unique_values

In [None]:
counts

### Train data

In [None]:
from scipy.sparse import csr_matrix, load_npz

URM_train = load_npz('C:\\Users\\melan\\shared-folder\\RecSys\\RecSys-Competition\\PreprocessedDataset\\URM_train.npz')

In [None]:
user_activity = np.ediff1d(URM_train.tocsr().indptr)
user_activity = np.sort(user_activity)


pyplot.plot(user_activity, 'ro')
pyplot.ylabel('Num Interactions ')
pyplot.xlabel('Sorted User')
pyplot.show()

In [None]:
import matplotlib.pyplot as plt

# Count the occurrences of each unique value
unique_values, counts = np.unique(user_activity, return_counts=True)

# Create a bar chart
plt.bar(unique_values, counts, color='blue', edgecolor='black', alpha=0.7)

plt.title('User Activity Distribution')
plt.xlabel('Number of Interactions')
plt.ylabel('Number of Users')
plt.show()

In [None]:
# Define the number of groups (k)
k = 20

# Calculate percentiles to determine group boundaries
percentiles = np.linspace(0, 100, k + 2)
boundaries = np.percentile(user_activity, percentiles).astype(int)

plt.figure(figsize=(12, 6))

# Create a bar chart with the groups
plt.hist(user_activity, bins=boundaries, color='blue', edgecolor='black', alpha=0.7)

plt.title(f'User Activity Distribution with {k} Groups')
plt.xlabel('Number of Interactions')
plt.ylabel('Number of Users')
plt.show()

In [None]:
boundaries

In [None]:
unique_values

In [None]:
counts

To ensure that users with the same number of interactions are not split across multiple bins, slightly adjust the boundaries to make them unique.

In [None]:
# Calculate integer percentiles to determine group boundaries
percentiles = np.linspace(0, 100, k + 2)
percentile_values = np.percentile(user_activity, percentiles)

# Add a small offset to ensure unique boundaries
offset = 1e-10
boundaries = np.unique(np.round(percentile_values + offset).astype(int))
print(boundaries)

In [None]:
print(len(boundaries))

In [None]:
user_act = 1095
user_groups = np.digitize(user_act, boundaries)


In [None]:
values, count = np.unique(user_groups, return_counts= True)

In [None]:
values

In [None]:
count

In [None]:
plt.figure(figsize=(12, 6))

# Create a bar chart with the groups
plt.hist(user_activity, bins=boundaries, color='blue', edgecolor='black', alpha=0.7)

plt.title(f'User Activity Distribution with {k} Groups')
plt.xlabel('Number of Interactions')
plt.ylabel('Number of Users')
plt.show()