***Misc.***

In [None]:
# %tensorflow_version 1.x
# Import TensorFlow and display the installed version as the cell output.
import tensorflow as tf
tf.__version__

'2.2.0'

In [None]:
# Display the GPU device name reported by TensorFlow as the cell output.
tf.test.gpu_device_name()

'/device:GPU:0'

In [None]:
# Mount Google Drive at /content/gdrive (the recorded output shows an authorization prompt).
# NOTE(review): the data-loading cells read from './drive/My Drive/...', which does
# not match this mount point — presumably this mount is redundant; confirm.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:
# Mount Google Drive at /content/drive, requesting a forced remount.
# The data files in this notebook are addressed as './drive/My Drive/...'.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# ***Importing Data***




**Importing Libraries**

In [None]:
import numpy as np
import pandas as pd

**Importing Data**

In [None]:
def split_data_ml100k(data, num_users, num_items, split_mode="random", test_ratio=0.1):
    """Split a ratings table into a training part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Ratings table whose first four columns are read positionally as
        (user, item, rating, time).
    num_users, num_items : int
        Kept for backward compatibility with existing callers; the split
        itself is derived from the user ids actually present in ``data``.
    split_mode : str
        ``"seq-aware"`` holds out each user's most recent interaction as the
        test row; any other value selects a random row-wise split.
    test_ratio : float
        Random mode only: probability that a row is assigned to the test set.

    Returns
    -------
    (train_data, test_data) : tuple of pandas.DataFrame
        In seq-aware mode both frames are rebuilt from tuples, so their
        columns are labelled 0..3 and the training rows are grouped by user
        (ascending id) and ordered by time within each user. In random mode
        both frames are row subsets of ``data``.
    """
    if split_mode == "seq-aware":
        per_user, latest = {}, {}
        for row in data.itertuples():
            u, i, rating, time = row[1], row[2], row[3], row[4]
            per_user.setdefault(u, []).append((u, i, rating, time))
            # Keep the first row carrying the largest timestamp seen so far.
            if u not in latest or latest[u][-1] < time:
                latest[u] = (i, rating, time)
        test_rows = [(u, *value) for u, value in latest.items()]
        # A set makes the "is this row held out?" check O(1) instead of a
        # linear scan over all test rows for every training row.
        held_out = set(test_rows)
        train_rows = []
        # Walk the user ids that actually occur rather than range(1, num_users + 1),
        # which raised KeyError whenever the ids were not contiguous.
        for u in sorted(per_user):
            chronological = sorted(per_user[u], key=lambda k: k[3])
            train_rows.extend(r for r in chronological if r not in held_out)
        train_data = pd.DataFrame(train_rows)
        test_data = pd.DataFrame(test_rows)
    else:
        # One uniform draw per row; rows whose draw falls below 1 - test_ratio are kept for training.
        keep = np.random.uniform(0, 1, len(data)) < 1 - test_ratio
        train_data, test_data = data[keep], data[~keep]
    return train_data, test_data

In [None]:
# Column names for the ratings file (u.data) and the item metadata file (u.item).
names = ['user_id', 'item_id', 'rating', 'timestamp']
item_cols = ['movie id','movie title','release date', 'video release date','IMDb URL','unknown','Action', 'Adventure','Animation','Childrens','Comedy','Crime', 'Documentary','Drama','Fantasy','Film-Noir','Horror', 'Musical','Mystery','Romance ','Sci-Fi','Thriller', 'War' ,'Western']
# The separator is passed by keyword: recent pandas releases accept only the
# file path as a positional argument of read_csv.
data = pd.read_csv('./drive/My Drive/Thesis/Soft_Impute/u.data', sep='\t', names=names, engine='python')
item = pd.read_csv('./drive/My Drive/Thesis/Soft_Impute/u.item', sep='|', names=item_cols, encoding='latin-1')

# Number of distinct users and items that appear in the ratings table.
num_users = data.user_id.unique().shape[0]
num_items = data.item_id.unique().shape[0]

# Fraction of user-item cells that carry no rating.
sparsity = 1 - len(data) / (num_users * num_items)
print('number of users: %d, number of items: %d.' % (num_users, num_items))
print('matrix sparsity: %f' % sparsity)
print(data.head(5))


number of users: 943, number of items: 1682.
matrix sparsity: 0.936953
   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596


In [None]:
# Random 90/10 split of the rating rows.
train, test = split_data_ml100k(data, num_users, num_items, split_mode="random", test_ratio=0.1)

# Pivot the full, training and test tables into user x item arrays;
# cells without a rating come out as NaN.
M_data, M_data_train, M_data_test = (
    frame.pivot(index='user_id', columns='item_id', values='rating').values
    for frame in (data, train, test)
)
#M_item = item.drop(['movie title','release date', 'video release date','IMDb URL'], axis = 1)


Finding location of missing entries

In [None]:
# (row, column) index pairs of every NaN cell in M_data, one pair per row.
masked = np.nonzero(pd.isnull(M_data))
xx, yy = masked[0], masked[1]
missing_mask = np.column_stack((xx, yy))

Changing NaN entries to 0

In [None]:
# Replace the NaN entries of M_data with 0 and rebind the name.
# NOTE(review): after this cell M_data no longer contains NaN, so re-running the
# missing-entry cell (which looks for NaN in M_data) would find nothing.
M_data = np.nan_to_num(M_data)

Finding location of observed entries

In [None]:
# (row, column) index pairs of every non-zero cell in M_data, one pair per row.
masked = np.nonzero(M_data)
xx, yy = masked[0], masked[1]
observed_mask = np.column_stack((xx, yy))

In [None]:
# Load a second ratings file ('::'-separated, no header row) and rebuild M_data from it.
# NOTE(review): this rebinds `data` and `M_data`, but missing_mask, observed_mask,
# num_users and num_items are not recomputed here and the new M_data is not passed
# through np.nan_to_num — presumably this cell is an alternative to the u.data
# cells rather than a follow-up; confirm before running the notebook top to bottom.
names = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_csv('./drive/My Drive/Thesis/Soft_Impute/ratings.dat', sep="::", names=names, engine='python', header=None)
M_data = data.pivot(index = 'user_id', columns ='item_id', values = 'rating').values

# **Soft-Impute Implementation**

Importing Libraries

In [None]:
import numpy as np
from sklearn.utils.extmath import randomized_svd

Initializing *Variables*

In [None]:
# _converged reports convergence once the relative change of the imputed entries is below this value.
convergence_threshold=0.001
# Upper bound on the number of iterations of the main loop.
max_iters = 100
# Rank passed to _svd_step; None (falsy) selects its np.linalg.svd branch.
max_rank = None
# Threshold subtracted from the singular values; while None it is derived later
# from the largest singular value.
shrinkage_value = None

Defining functions

In [None]:
def _max_singular_value(X_filled):
    """Return the leading singular value of ``X_filled``.

    The value comes from a one-component randomized SVD run with five
    power iterations; only the singular-value vector of the result is used.
    """
    singular_values = randomized_svd(X_filled, 1, n_iter=5)[1]
    return singular_values[0]

def _svd_step(X, shrinkage_value, max_rank=None):
    if max_rank:
        # if we have a max rank then perform the faster randomized SVD
        (U, s, V) = randomized_svd(X, max_rank, n_iter=1)
    else:
        # perform a full rank SVD using ARPACK
        (U, s, V) = np.linalg.svd(X, full_matrices=False, compute_uv=True)
    
    xx = shrinkage_value*np.ones(s.shape)
    s_thresh = np.zeros(s.shape)
    s_thresh = np.maximum((s - xx), np.zeros(s.shape))
    rank = (s_thresh > 0).sum()
    s_thresh = s_thresh[:rank]
    U_thresh = U[:, :rank]
    V_thresh = V[:rank, :]
    S_thresh = np.diag(s_thresh)
    X_reconstruction = np.dot(U_thresh, np.dot(S_thresh, V_thresh))
    return X_reconstruction, rank

def _converged(X_old, X_new, missing_mask):
        # check for convergence
        old_missing_values = X_old[missing_mask[:, 0], missing_mask[:, 1]]
        new_missing_values = X_new[missing_mask[:, 0], missing_mask[:, 1]]
        difference = old_missing_values - new_missing_values
        ssd = np.sum(difference ** 2)
        old_norm = np.sqrt((old_missing_values ** 2).sum())
        return (np.sqrt(ssd) / old_norm) < convergence_threshold, (np.sqrt(ssd) / old_norm)

def masked_mae(X_true, X_pred, mask):
    """Mean absolute error between two matrices over selected cells.

    ``mask`` is a two-column integer array: column 0 holds row indices and
    column 1 holds column indices of the cells to compare.
    """
    rows, cols = mask[:, 0], mask[:, 1]
    absolute_errors = np.abs(X_true[rows, cols] - X_pred[rows, cols])
    return np.mean(absolute_errors)

Initializing the working matrices and computing the largest singular value

In [None]:
X = M_data
# Untouched copy of the input, used as the reference when measuring the error
# on observed entries.
X_init = X.copy()
# Work on a separate copy: the main loop overwrites X_filled in place, and
# binding it directly to X would silently modify M_data for every later cell
# and for any re-run of this section.
X_filled = X.copy()
# Per-iteration history of the observed-entry MAE and of the retained rank.
mae_new = []
rank_new = []
#observed_mask = ~missing_mask
max_singular_value = _max_singular_value(X_filled)

print("[SoftImpute] Max Singular Value of X_init = %f" % (max_singular_value))

[SoftImpute] Max Singular Value of X_init = 640.633623


Calculating Shrinkage value

In [None]:
# A truthy, user-supplied shrinkage value is kept as it is. Otherwise fall back
# to a totally hackish heuristic: keep only components with at least 1/50th of
# the max singular value.
if not shrinkage_value:
    shrinkage_value = max_singular_value / 50.0

Main Loop

In [None]:
# Repeat the SVD soft-thresholding step until _converged reports convergence
# or max_iters iterations have been run.
for i in range(max_iters):
    X_reconstruction, rank = _svd_step(X_filled, shrinkage_value, max_rank=max_rank)
#    X_reconstruction = clip(X_reconstruction)

    # print error on observed data
    mae = masked_mae(X_true=X_init, X_pred=X_reconstruction, mask=observed_mask)
    print("[SoftImpute] Iter %d: observed rank=%d" % (i + 1, rank))
    print(shrinkage_value)
    print(mae)
    mae_new.append(mae)
    rank_new.append(rank)
    
    # Convergence is evaluated against X_filled before it is overwritten below.
    converged, conv_val = _converged(X_old=X_filled, X_new=X_reconstruction, missing_mask=missing_mask)
    # Only the missing positions take the reconstructed values; X_filled is updated in place.
    X_filled[missing_mask[:, 0], missing_mask[:, 1]] = X_reconstruction[missing_mask[:, 0], missing_mask[:, 1]]
    print(conv_val)
    if converged:
        break

In [None]:
# `i` is the loop variable left over from the main loop, so i + 1 is the number
# of iterations that were executed.
print("[SoftImpute] Stopped after iteration %d for lambda=%f" % (i + 1, shrinkage_value))

[SoftImpute] Stopped after iteration 74 for lambda=12.812672


# **Soft-Impute with Library**

Initializing Libraries

In [None]:
import numpy as np
from fancyimpute import SoftImpute, BiScaler
import pandas as pd

Simple implementation of Soft-Impute

In [None]:
# Normalize M_data with BiScaler, then complete it with the library SoftImpute.
# NOTE(review): an earlier cell replaces the NaN entries of M_data with 0 via
# np.nan_to_num — presumably these fancyimpute classes expect NaN to mark the
# missing values, so confirm which version of M_data is intended here.
X_incomplete_normalized = BiScaler().fit_transform(M_data)
X_filled_softimpute = SoftImpute().fit_transform(X_incomplete_normalized)

# **Soft-Impute with Functions**

In [None]:
# Add the project folder on the mounted drive to the module search path so
# modules stored there can be imported.
import sys
sys.path.append('./drive/My Drive/Thesis/Soft_Impute')

In [None]:
from soft_impute import SoftImpute

In [None]:
# NOTE(review): SoftImpute here is the class imported from the local soft_impute
# module; that import rebinds the name previously imported from fancyimpute.
# The semantics of solve/predict are defined in that module and are not visible
# here — presumably solve fits on M_data using the missing-entry index pairs and
# predict returns the completed matrix; confirm against soft_impute.py.
clf = SoftImpute()
clf.solve(M_data, missing_mask)
imputed = clf.predict(M_data)