In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import time

In [2]:
# Reading the data
ratings_col_names = ["UserID", "MovieID", "Rating", "Timestamp"]
rdf = pd.read_csv("./ml-1m/ratings.dat", sep="::", header=None, names=ratings_col_names, engine="python")
rdf.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# Dividing the data into 5 folds
RandomState = 42

kf = KFold(n_splits=5, shuffle=True, random_state=RandomState)
Folds = []
for train_index, test_index in kf.split(rdf):
    Folds.append((rdf.iloc[train_index, :], rdf.iloc[test_index, :]))

In [7]:
# Trivial functions
def R_item(df, item, R_global):
    df_filtered = df.loc[df["MovieID"] == item]
    if len(df_filtered) == 0:
        """The item is not in our database"""
        # strategy is using global value in our training data
        return R_global
    return df_filtered["Rating"].mean()

def R_user(df, user, R_global):
    df_filtered = df.loc[df["UserID"] == user]
    if len(df_filtered) == 0:
        """The user is not in our database"""
        # strategy is using global value in our training data
        return R_global
    return df_filtered["Rating"].mean()

In [9]:
results = []

for i, (train, test) in enumerate(Folds):
    t0 = time.time()
    R_global = train.Rating.mean()
    train.loc[:, "R_item"] = train["MovieID"].apply(lambda x:R_item(train, x, R_global))
    t1 = time.time()
    test.loc[:, "R_item"] = test["MovieID"].apply(lambda x:R_item(test, x, R_global))
    t2 = time.time()
    train.loc[:, "R_user"] = train["UserID"].apply(lambda x: R_user(train, x, R_global))
    t3 = time.time()
    test.loc[:, "R_user"] = test["UserID"].apply(lambda x: R_user(test, x, R_global))
    t4 = time.time()
    
    
    X_train = train.loc[:, ["R_user", "R_item"]].values
    X_train = np.append(X_train, np.ones((X_train.shape[0],1)), axis=1) # I think this line is unnecessary
    y_train = train.loc[:, "Rating"]
    
    X_test = test.loc[:, ["R_user", "R_item"]].values
    X_test = np.append(X_test, np.ones((X_test.shape[0],1)), axis=1) # I think this line is unnecessary
    y_test = test.loc[:, "Rating"]
    
    
    t5 = time.time()
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    
    t6 = time.time()
    
    coefs = lr.coef_
    intercept = lr.intercept_
    
    y_pred = lr.predict(X_test)
    
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    
    # The mean squared error
    print('Root mean squared error: %.2f' % rmse)
    
    r2 = r2_score(y_test, y_pred)
    # The coefficient of determination: 1 is perfect prediction
    print('Coefficient of determination: %.2f'% r2_score(y_test, y_pred))
    
    res = {
        "fold": i,
        "t0": t0,
        "t1": t1,
        "t2": t2,
        "t3": t3,
        "t4": t4,
        "t5": t5,
        "t6": t6,
        "coefs": coefs,
        "intercept": intercept,
        "rmse": rmse,
        "r2_score":r2
    }
    
    results.append(res)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a Da

Root mean squared error: 0.90
Coefficient of determination: 0.35


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

Root mean squared error: 0.90
Coefficient of determination: 0.35


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

Root mean squared error: 0.90
Coefficient of determination: 0.35


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

Root mean squared error: 0.90
Coefficient of determination: 0.35


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

Root mean squared error: 0.90
Coefficient of determination: 0.35


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [10]:
results

[{'fold': 0,
  't0': 1635019580.203698,
  't1': 1635020273.029936,
  't2': 1635020378.8557289,
  't3': 1635021022.951227,
  't4': 1635021126.526325,
  't5': 1635021126.5394611,
  't6': 1635021126.588769,
  'coefs': array([0.78107459, 0.87409659, 0.        ]),
  'intercept': -2.34639559161956,
  'rmse': 0.9035220028549386,
  'r2_score': 0.3488939149920688},
 {'fold': 1,
  't0': 1635021126.5940611,
  't1': 1635021774.5321758,
  't2': 1635021872.303982,
  't3': 1635022515.2569978,
  't4': 1635022618.531567,
  't5': 1635022618.545743,
  't6': 1635022618.593517,
  'coefs': array([0.78284655, 0.87534183, 0.        ]),
  'intercept': -2.356804780842453,
  'rmse': 0.8985829558384985,
  'r2_score': 0.35049614000857476},
 {'fold': 2,
  't0': 1635022618.598351,
  't1': 1635023265.524678,
  't2': 1635023363.2533472,
  't3': 1635024009.6320908,
  't4': 1635024112.823288,
  't5': 1635024112.837472,
  't6': 1635024112.885192,
  'coefs': array([0.78278052, 0.87515523, 0.        ]),
  'intercept': -2.3

In [14]:
import pickle

with open("./naive_results.pkl", "wb") as write_file:
    pickle.dump(results, write_file)