In [3]:
import numpy as np
import pandas as pd
from surprise import KNNBaseline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.compose import ColumnTransformer

In [4]:
df_ratings = pd.read_csv("../data/new_df.csv")
df_users = pd.read_csv("../data/users_fixed.csv")

In [5]:
df_users["userID"] = pd.to_numeric(df_users["userID"], errors="coerce")

In [6]:
df = pd.merge(df_ratings, df_users, on="userID")

In [15]:
df = df[df["itemID"] == "Ginseng"]
X = df[["age", "gender", "mood", "anxiety", "libido", "cognition", "motivation", "focus", "userID"]]
y = df[["rating"]].to_numpy().reshape(-1)

numerical_columns = ["age"]
categorical_columns = ["gender", "mood", "anxiety", "libido", "cognition", "motivation", "focus", "userID"]
ct = ColumnTransformer(
    [("scaler", StandardScaler(), numerical_columns),
     ("encoder", OneHotEncoder(handle_unknown="ignore"), categorical_columns)])

pipe = Pipeline([('encoder', ct), ('reg', LinearRegression())])

#pipe.fit(X, y)

In [6]:
cross_validate(pipe, X, y, scoring="neg_root_mean_squared_error")

{'fit_time': array([0.19332194, 0.1692152 , 0.18176794, 0.18714809, 0.16723919]),
 'score_time': array([0.00835204, 0.00770283, 0.00788999, 0.00791121, 0.00809288]),
 'test_score': array([-2.90677405, -2.92838968, -2.98526376, -2.98384783, -3.01804983])}

In [16]:
pipe.fit(X, y)

Pipeline(steps=[('encoder',
                 ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                                  ['age']),
                                                 ('encoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['gender', 'mood', 'anxiety',
                                                   'libido', 'cognition',
                                                   'motivation', 'focus',
                                                   'userID'])])),
                ('reg', LinearRegression())])

In [19]:
pipe["reg"].coef_

array([-6.95520250e-10,  6.45586316e-02,  1.83272977e-01,  6.45586337e-02,
       -3.12390242e-01, -6.64713189e-01,  3.71014456e-01,  2.93698733e-01,
       -2.07257482e-01, -1.62442416e-01,  3.69699899e-01, -6.24902097e-02,
       -6.24902131e-02, -6.24902127e-02,  1.87470636e-01, -1.66640568e-01,
        8.33202827e-02,  8.33202854e-02, -3.38720463e-01,  3.63246545e-02,
        3.02395808e-01, -2.14027093e-01,  2.40049123e-01, -2.60220304e-02,
        1.01424914e+00,  1.42491396e-02, -6.90538303e-01,  3.01424914e+00,
        2.23214597e+00,  7.64288287e-01, -2.69053830e+00,  1.48210683e+00,
       -1.63913700e+00, -3.51789317e+00,  1.41237162e-01,  2.76428829e+00,
       -2.23571171e+00,  1.74817798e+00, -5.01782869e-01,  3.23214598e+00,
       -8.26125972e-01, -1.46605523e+00,  5.84337047e+00, -2.76785402e+00,
        5.59422549e-01, -4.47989007e-01, -2.90843514e+00, -8.63620074e-01,
        4.68450681e+00, -3.23571171e+00,  1.05906420e+00,  7.48177984e-01,
        6.55942255e+00,  

In [15]:
pipe["reg"].coef

AttributeError: 'LinearRegression' object has no attribute 'coef'

In [35]:
from sklearn.dummy import DummyRegressor
pipe_dumb = Pipeline([('encoder', ct), ('reg', DummyRegressor())])
cross_validate(pipe_dumb, X, y, scoring="neg_root_mean_squared_error")


{'fit_time': array([0.05318618, 0.02622485, 0.02079511, 0.02066493, 0.02061391]),
 'score_time': array([0.01059985, 0.00620103, 0.00604391, 0.00610209, 0.00604892]),
 'test_score': array([-3.12178076, -3.181965  , -3.20067359, -3.18234548, -3.21047638])}