In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader

from AutoDataCleaner import AutoDataCleaner as adc

In [2]:
# --- Configuration: everything a reader might want to tune lives here ---

# user-profile columns fed to the per-item models
features = ['age', 'gender', 'occupation']

# training hyper-parameters
BATCH_SIZE = 128
LEARNING_RATE = 0.0001
EPOCHS = 100

# a prediction counts as "good" when |predicted - actual| <= max_err
max_err = 1.5

# Fix the RNG state so weight initialisation and DataLoader shuffling are
# repeatable across Restart & Run All.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Use {device}')

Use cuda


In [3]:
# Read the pipe-separated user table; the file has no header row, so column
# names are supplied here.
df_user = pd.read_csv(
    './ml-100k/u.user',
    sep='|',
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
)

# Keep only the profile columns listed in `features`.
df_user_profile = df_user[features]

# Run the profile through AutoDataCleaner and store the result as float32 so
# it can be turned into model inputs later.
# NOTE(review): the exact transformations are whatever `clean_me` does for
# these flags (presumably binary detection, one-hot encoding, normalisation)
# - confirm against the AutoDataCleaner docs.
df_user_cleaned = adc.clean_me(
    df_user_profile,
    detect_binary=True,
    one_hot=True,
    normalize=True,
    verbose=False,
).astype('float32')

In [4]:
# User-item rating matrix (rows: user_id, columns: item_id, 0 = not rated).
# It is cached on disk; the cache is rebuilt from the raw ratings only when it
# is missing or unreadable.
UI_MATRIX_PATH = './Models/ui_matrix_ua.pkl'

try:
    # NOTE: unpickling can execute arbitrary code - only load a cache file
    # that this notebook wrote itself.
    with open(UI_MATRIX_PATH, 'rb') as cache_file:
        ui_matrix = pickle.load(cache_file)
except (FileNotFoundError, EOFError, pickle.UnpicklingError):
    df = pd.read_csv('./ml-100k/ua.base', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
    ui_matrix = df.pivot(index='user_id', columns='item_id', values='rating').fillna(0)

    # Write the cache only when it was actually rebuilt, creating the target
    # directory first so the dump cannot fail on a fresh checkout.
    os.makedirs(os.path.dirname(UI_MATRIX_PATH), exist_ok=True)
    with open(UI_MATRIX_PATH, 'wb') as cache_file:
        pickle.dump(ui_matrix, cache_file)

In [5]:
def train(dl: DataLoader, item_id: int, epochs: int = EPOCHS, lr: float = LEARNING_RATE, device: torch.device = device):
    """Train (or continue training) the rating regressor for one item and save it.

    If a checkpoint already exists at ``./Models/item_{item_id}.pt`` it is
    loaded and training resumes from it; otherwise a fresh MLP is created whose
    input width is the number of columns of ``df_user_cleaned``.

    Args:
        dl: Loader yielding ``(X, y)`` batches; ``y`` is reshaped to a column
            vector to match the model output.
        item_id: Item whose model is trained; used in the checkpoint file name.
        epochs: Number of full passes over ``dl``.
        lr: Adam learning rate.
        device: Device the model and batches are moved to.

    Returns:
        None. The trained model is written to ``./Models/item_{item_id}.pt``.
    """
    model_path = f'./Models/item_{item_id}.pt'
    # Create the output directory up front so a long training run cannot be
    # lost to a failing save at the very end.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    try:
        # Checkpoints are whole pickled modules written by this function, so
        # weights_only=False is needed (PyTorch >= 2.6 defaults to True and
        # rejects them). map_location lets a checkpoint saved on GPU be resumed
        # on a CPU-only machine. Only load checkpoints you created yourself.
        model = torch.load(model_path, map_location=device, weights_only=False)
    except FileNotFoundError:
        # No checkpoint yet for this item: start from a fresh network.
        model = torch.nn.Sequential(
            torch.nn.Linear(df_user_cleaned.shape[1], 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 1)
        )

    # Move the model first so the optimizer is built over the on-device parameters.
    model.to(device)
    model.train()
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        for X, y in dl:
            X, y = X.to(device), y.to(device).reshape(-1, 1)
            y_pred = model(X)
            loss = criterion(y_pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    torch.save(model, model_path)

def predict(X: np.ndarray, item_id: int, device: torch.device = device):
    """Predict the rating one user would give to one item.

    Args:
        X: Feature values of a single user as a NumPy array; it is flattened
            into one row of shape ``(1, n_features)``.
        item_id: Item whose saved model (``./Models/item_{item_id}.pt``) is used.
        device: Device on which inference runs.

    Returns:
        The predicted rating as a Python float, or ``-1`` when no model file
        exists for ``item_id``. Any other loading error is raised instead of
        being reported as a missing model.
    """
    try:
        # Checkpoints are whole pickled modules written by train(), hence
        # weights_only=False (PyTorch >= 2.6 defaults to True and rejects
        # them); map_location makes GPU-saved checkpoints usable on CPU.
        # Only load checkpoints you created yourself.
        model = torch.load(f'./Models/item_{item_id}.pt', map_location=device, weights_only=False)
    except FileNotFoundError:
        # No model was trained for this item.
        return -1

    model.to(device)
    model.eval()
    with torch.no_grad():
        features_row = torch.from_numpy(X).float().to(device).reshape(1, -1)
        return model(features_row).item()
    

In [6]:
# Fit one model per item from the profiles of the users who rated that item.
for item_id in ui_matrix.columns:
    # a stored value of 0 means "not rated", so keep only the non-zero entries
    item_ratings = ui_matrix[item_id]
    rated = item_ratings[item_ratings != 0]

    user_ids = rated.index
    y_train = torch.tensor(rated.values, dtype=torch.float)

    # user ids are shifted by one to obtain the row labels of df_user_cleaned
    user_indexs = user_ids - 1
    X_train = torch.tensor(df_user_cleaned.loc[user_indexs].to_numpy())

    train_loader = DataLoader(
        TensorDataset(X_train, y_train),
        batch_size=BATCH_SIZE,
        shuffle=True,
    )
    train(train_loader, item_id, EPOCHS, LEARNING_RATE, device)


In [7]:
# Score every held-out (user, item) pair with that item's saved model.
test_df = pd.read_csv(
    './ml-100k/ua.test',
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)

# predict() returns -1 when no model file exists for an item; those rows keep
# that sentinel as their prediction.
test_df['pred_rating'] = [
    predict(df_user_cleaned.loc[user_id - 1].values, item_id, device)
    for user_id, item_id in zip(test_df['user_id'], test_df['item_id'])
]

# signed error: positive means the model over-estimated the rating
test_df['err'] = test_df['pred_rating'] - test_df['rating']
test_df

Unnamed: 0,user_id,item_id,rating,timestamp,pred_rating,err
0,1,20,4,887431883,1.300080,-2.699920
1,1,33,4,878542699,3.221707,-0.778293
2,1,61,4,878542420,2.361757,-1.638243
3,1,117,3,874965739,3.706520,0.706520
4,1,155,2,878542201,3.050886,1.050886
...,...,...,...,...,...,...
9425,943,232,4,888639867,3.245536,-0.754464
9426,943,356,4,888639598,2.786338,-1.213662
9427,943,570,1,888640125,1.872620,0.872620
9428,943,808,4,888639868,1.849651,-2.150349


In [11]:
# Share of test predictions whose absolute error is at most max_err.
within_tolerance = test_df['err'].abs() <= max_err
good_errs = test_df.loc[within_tolerance, 'err'].values
print(f'Accuracy: {len(good_errs) / len(test_df)}')

Accuracy: 0.6965005302226935
