In [20]:
import pandas as pd
from surprise.model_selection import train_test_split, GridSearchCV, cross_validate
from surprise import Reader, Dataset, CoClustering, accuracy
import time

## Interactions 100% of Dataset

In [21]:
df = pd.read_csv("interactions_100.csv")
reader = Reader(rating_scale=(0,5))
data = Dataset.load_from_df(df[['u', 'i', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=.25)

In [22]:
param_grid = {'n_cltr_u': [1,3,5,10], 'n_cltr_i': [1,3,5,10], 'n_epochs':[5,10,20,30]}
gs = GridSearchCV(CoClustering, param_grid, measures=['rmse'], cv = 5, n_jobs = -1, joblib_verbose=5)
start = time.time()

gs.fit(data)

end = time.time()
elapsed_seconds = (end-start)
print("Elapsed time (in seconds): ", elapsed_seconds)

print("RMSE: " , gs.best_score['rmse'])
print("Best Paramas: " , gs.best_params['rmse'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:   51.0s
[Parallel(n_jobs=-1)]: Done 122 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 248 tasks      | elapsed:  7.0min


Elapsed time (in seconds):  575.3800930976868
RMSE:  1.0213632059804763
Best Paramas:  {'n_cltr_u': 1, 'n_cltr_i': 1, 'n_epochs': 5}


[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  9.5min finished


In [23]:
co = CoClustering(n_cltr_u=1, n_cltr_i=1, n_epochs=5)

start = time.time()

co.fit(trainset)

end = time.time()
print(end-start)
elapsed_seconds = (end-start)
print("Elapsed time (in seconds): ", elapsed_seconds)

predictions = co.test(testset)
print(accuracy.rmse(predictions))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  co.fit(trainset)


6.171355962753296
Elapsed time (in seconds):  6.171355962753296
RMSE: 1.0262
1.026216029011226


In [24]:
co.predict(uid = '11', iid = '2166')

Prediction(uid='11', iid='2166', r_ui=None, est=4.564042362059749, details={'was_impossible': False})

## Interactions 50% of Dataset

In [25]:
df = pd.read_csv("interactions_50.csv")
reader = Reader(rating_scale=(0,5))
data = Dataset.load_from_df(df[['u', 'i', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=.25)

In [26]:
param_grid = {'n_cltr_u': [1,3,5,10], 'n_cltr_i': [1,3,5,10], 'n_epochs':[5,10,20,30]}
gs2 = GridSearchCV(CoClustering, param_grid, measures=['rmse'], cv = 5, n_jobs = -1, joblib_verbose=5)
start = time.time()

gs2.fit(data)

end = time.time()
elapsed_seconds = (end-start)
print("Elapsed time (in seconds): ", elapsed_seconds)

print("RMSE: " , gs2.best_score['rmse'])
print("Best Paramas: " , gs2.best_params['rmse'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:   32.3s
[Parallel(n_jobs=-1)]: Done 122 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 248 tasks      | elapsed:  4.5min


Elapsed time (in seconds):  360.99964690208435
RMSE:  1.0524393381198474
Best Paramas:  {'n_cltr_u': 5, 'n_cltr_i': 3, 'n_epochs': 5}


[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  6.0min finished


In [28]:
co2 = CoClustering(n_cltr_u=5, n_cltr_i=3, n_epochs=5)

start = time.time()

co2.fit(trainset)

end = time.time()
elapsed_seconds = (end-start)
print("Elapsed time (in seconds): ", elapsed_seconds)

predictions2 = co2.test(testset)
print(accuracy.rmse(predictions2))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  co2.fit(trainset)


Elapsed time (in seconds):  4.507817983627319
RMSE: 1.0618
1.0617637772824633


In [29]:
co2.predict(uid = '11', iid = '2166')

Prediction(uid='11', iid='2166', r_ui=None, est=4.566245471283483, details={'was_impossible': False})

## Interactions 25% of Dataset

In [30]:
df = pd.read_csv("interactions_25.csv")
reader = Reader(rating_scale=(0,5))
data = Dataset.load_from_df(df[['u', 'i', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=.25)

In [31]:
param_grid = {'n_cltr_u': [1,3,5,10], 'n_cltr_i': [1,3,5,10], 'n_epochs':[5,10,20,30]}
gs3 = GridSearchCV(CoClustering, param_grid, measures=['rmse'], cv = 5, n_jobs = -1, joblib_verbose=5)
start = time.time()

gs3.fit(data)

end = time.time()
elapsed_seconds = (end-start)
print("Elapsed time (in seconds): ", elapsed_seconds)

print("RMSE: " , gs3.best_score['rmse'])
print("Best Paramas: " , gs3.best_params['rmse'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:   19.5s
[Parallel(n_jobs=-1)]: Done 122 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 248 tasks      | elapsed:  2.7min


Elapsed time (in seconds):  219.7318024635315
RMSE:  1.0765857418894385
Best Paramas:  {'n_cltr_u': 3, 'n_cltr_i': 5, 'n_epochs': 10}


[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  3.6min finished


In [32]:
co3 = CoClustering(n_cltr_u=3, n_cltr_i=5, n_epochs=10)

start = time.time()

co3.fit(trainset)

end = time.time()
elapsed_seconds = (end-start)
print("Elapsed time (in seconds): ", elapsed_seconds)

predictions3 = co3.test(testset)
print(accuracy.rmse(predictions3))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  co3.fit(trainset)


Elapsed time (in seconds):  4.735616207122803
RMSE: 1.0800
1.079998499370296


In [33]:
co3.predict(uid = '11', iid = '2166')

Prediction(uid='11', iid='2166', r_ui=None, est=4.562956583714438, details={'was_impossible': False})