In [1]:
import pandas as pd
import numpy as np

import dataframe

  # Load the three raw `ml-1m` tables. The files use the two-character
  # delimiter "::"; pandas only supports multi-character separators with the
  # Python parsing engine, so it is selected explicitly instead of relying on
  # the implicit fallback (which emits a ParserWarning).
  df_ratings = pd.read_csv(
      "./ml-1m/ratings.dat",
      sep="::",
      engine="python",
      names=['userId', 'movieId', 'rating', 'timestamp'],
  )
  df_movies = pd.read_csv(
      "./ml-1m/movies.dat",
      sep="::",
      engine="python",
      names=["movieId", "movie_name", "genre"],
      encoding="latin-1",
  )
  df_users = pd.read_csv(
      "./ml-1m/users.dat",
      sep="::",
      engine="python",
      names=["userId", "gender", "age", "occupation", "zipcode"],
  )


In [2]:
# Work on a copy: later cells assign new columns into `df`, and without the
# copy those writes would also modify the frame held by the `dataframe` module.
df = dataframe.final_df.copy()

In [3]:
# Column groups used when building the wide and deep datasets below.
wide_cols = ["userId", "movieId", "genre", "movie_year", "gender", "age", "occupation"]
# Each entry is a group of columns whose values are joined into one crossed feature.
crossed_cols = (["gender", "genre"], ["gender", "age"], ["age", "genre"])
embeddings_cols = ["userId", "movieId", "genre", "gender", "age", "occupation"]
continuous_cols = ["movie_year"]
target = "rating"

# Cast age and occupation to strings. Later cells join `age` with other string
# columns and pick the columns to encode by object dtype, so both need to be
# strings rather than numbers. A column-level cast replaces the row-by-row
# `df.apply(..., axis=1)`, which built a Series per row just to read one value.
df['age'] = df['age'].astype(str)
df['occupation'] = df['occupation'].astype(str)

In [4]:
# Target vector: the values of the `target` column of df as a NumPy array.
Y = np.array(df[target])

### Create dataset for Wide Model

In [5]:
df_wide = df.copy()

# Build the crossed columns: for every group in crossed_cols, add a column
# named "<col1>_<col2>..." whose values are the member columns joined by "-".
crossed_columns = []
for cols in crossed_cols:
    colname = '_'.join(cols)
    first, *rest = cols
    # Vectorised concatenation instead of a row-wise apply/join.
    # NOTE: a missing value in any member column yields a missing crossed
    # value here, whereas the row-wise '-'.join raised a TypeError.
    df_wide[colname] = df_wide[first].str.cat([df_wide[c] for c in rest], sep='-')
    crossed_columns.append(colname)

# Every object-dtype column (including the crossed ones just added) counts as
# categorical.
categorical_columns = list(df_wide.select_dtypes(include=['object']).columns)

In [6]:
from sklearn.preprocessing import StandardScaler
# Standardise the numeric columns one at a time. The same scaler object is
# re-fitted for each column, so afterwards it holds the fit of the last one.
scaler = StandardScaler()
for numeric_col in ("userId", "movieId", "movie_year"):
    column_2d = df_wide[numeric_col].values.reshape(-1, 1)
    df_wide[numeric_col] = scaler.fit_transform(column_2d)

In [7]:
# One-hot encode the wide and crossed columns that were detected as
# categorical, keeping the order in which they are listed.
dummy_cols = []
for candidate in wide_cols + crossed_columns:
    if candidate in categorical_columns:
        dummy_cols.append(candidate)
df_wide = pd.get_dummies(data=df_wide, columns=dummy_cols)

In [8]:
import pickle
# Persist the wide dataset to disk.
with open("df_wide.pkl", "wb") as wide_out:
    pickle.dump(df_wide, wide_out)

### Create dataset for Deep Model

In [9]:
# Start the deep-model dataset from its own copy of df.
df_deep = df.copy()

In [10]:
def encoder(df, cols=None):
    """Replace the values of the given columns with integer codes.

    For each column, its distinct values are numbered 0, 1, 2, ... in order
    of first appearance (the order returned by ``Series.unique``), and the
    column is overwritten with those numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place and also returned.
    cols : iterable of column labels, optional
        Columns to encode. When omitted, every object-dtype column of ``df``
        is encoded.

    Returns
    -------
    val_to_idx : dict
        ``{column: {original value: integer code}}`` for every encoded column.
    df : pandas.DataFrame
        The same frame object that was passed in, with the columns encoded.
    """
    # `is None` rather than `== None`: an array-like `cols` (NumPy array,
    # pandas Index) compares element-wise with `==`, and using that result
    # in `if` raises for more than one element.
    if cols is None:
        cols = list(df.select_dtypes(include=['object']).columns)

    # Build every lookup table before touching the frame.
    val_to_idx = dict()
    for c in cols:
        val_to_idx[c] = {value: idx for idx, value in enumerate(df[c].unique())}

    for c, lookup in val_to_idx.items():
        df[c] = df[c].map(lookup)

    return val_to_idx, df

# Encode df_deep using encoder's default column selection, keeping the
# value -> index lookup alongside the encoded frame.
val_to_idx, df_deep = encoder(df_deep)

In [11]:
# Standardise movie_year in the deep dataset with a freshly fitted scaler.
scaler = StandardScaler()
movie_year_2d = df_deep["movie_year"].values.reshape(-1, 1)
df_deep["movie_year"] = scaler.fit_transform(movie_year_2d)

In [12]:
import pickle
# Persist the deep dataset to disk.
with open("df_deep.pkl", "wb") as deep_out:
    pickle.dump(df_deep, deep_out)