In [1]:
import os
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.layers import Input, Embedding, concatenate, Flatten, Dense, Dropout, merge
from keras.models import Model

In [2]:
# Column headers
data_cols = ['user id', 'movie id', 'rating', 'timestamp']
item_cols = ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown',
             'Action','Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
             'Documentary', 'Drama', 'Fantasy', 'Film-Noir','Horror', 'Musical', 
             'Mystery', 'Romance ', 'Sci-Fi', 'Thriller', 'War', 'Western']
user_cols = ['user id', 'age', 'gender', 'occupation', 'zip code']
# Importing users, items and data
users = pd.read_csv('ml-100k/u.user', sep='|', names=user_cols, encoding='latin-1')
item = pd.read_csv('ml-100k/u.item', sep='|', names=item_cols, encoding='latin-1')
data = pd.read_csv('ml-100k/u.data', sep='\t', names=data_cols, encoding='latin-1')
df = pd.merge(pd.merge(item, data), users)

In [3]:
df['genre'] = df[['unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary',
                            'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance ', 'Sci-Fi',
                            'Thriller', 'War', 'Western']].idxmax(1)

In [4]:
df.drop(columns=['unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary',
                      'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance ', 'Sci-Fi', 'Thriller',
                      'War', 'Western'], axis=1, inplace=True)

In [5]:
#augment gender and occupation into a single category to gain more insight about the person 
df_wide = df[['gender', 'genre']]
df_wide['gender_genre'] = df_wide['gender'] + "_" + df_wide['genre']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wide['gender_genre'] = df_wide['gender'] + "_" + df_wide['genre']


In [6]:
df_wide

Unnamed: 0,gender,genre,gender_genre
0,M,Animation,M_Animation
1,M,Action,M_Action
2,M,Crime,M_Crime
3,M,Drama,M_Drama
4,M,Childrens,M_Childrens
...,...,...,...
99995,M,Action,M_Action
99996,M,Action,M_Action
99997,M,Action,M_Action
99998,M,Childrens,M_Childrens


In [7]:
encode = OneHotEncoder(handle_unknown='ignore')
encode.fit(df_wide[['gender_genre']])
encoded = pd.DataFrame(encode.transform(df_wide[['gender_genre']]).toarray(), columns=encode.get_feature_names())

In [8]:
df_wide = df_wide.join(encoded)
df_wide.drop(['gender', 'genre', 'gender_genre'], axis=1, inplace=True)

In [9]:
df_wide.columns = df_wide.columns.str.lstrip("x0_")

In [10]:
df_wide

Unnamed: 0,F_Action,F_Adventure,F_Animation,F_Childrens,F_Comedy,F_Crime,F_Documentary,F_Drama,F_Fantasy,F_Film-Noir,...,M_Film-Noir,M_Horror,M_Musical,M_Mystery,M_Romance,M_Sci-Fi,M_Thriller,M_War,M_Western,M_unknown
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
df_deep = df[['age', 'gender', 'occupation', 'genre']]

In [12]:
# Encode categorical features
for feature in ['gender', 'occupation', 'genre']:
    encoder = LabelEncoder()
    encoder.fit(df_deep[[feature]])
    transformed_feature = encoder.transform(df_deep[[feature]])
    df_deep[feature] = transformed_feature

# Min-max scaling for numerical features
for feature in ['age']:
    scaler = MinMaxScaler()
    scaler.fit(df_deep[[feature]])
    transformed_feature = scaler.transform(df_deep[[feature]])
    df_deep[feature] = transformed_feature

  return f(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_deep[feature] = transformed_feature
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_deep[feature] = transformed_feature


In [13]:
df_deep

Unnamed: 0,age,gender,occupation,genre
0,0.803030,1,15,2
1,0.803030,1,15,0
2,0.803030,1,15,5
3,0.803030,1,15,7
4,0.803030,1,15,3
...,...,...,...,...
99995,0.181818,1,18,0
99996,0.181818,1,18,0
99997,0.181818,1,18,0
99998,0.181818,1,18,3


In [14]:
# Split data

X = pd.concat([df_wide, df_deep], axis=1)
y = df[['rating']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
y_train = y_train.values
y_test = y_test.values
def input_values(X):
    gender = X[['gender']].values
    occupation = X[['occupation']].values
    genre = X[['genre']].values
    age = X[['age']].values
    
    wide_inputs = X[['F_Action', 'F_Adventure', 'F_Animation', 'F_Childrens', 'F_Comedy',
       'F_Crime', 'F_Documentary', 'F_Drama', 'F_Fantasy', 'F_Film-Noir',
       'F_Horror', 'F_Musical', 'F_Mystery', 'F_Romance ', 'F_Sci-Fi',
       'F_Thriller', 'F_War', 'F_Western', 'F_unknown', 'M_Action',
       'M_Adventure', 'M_Animation', 'M_Childrens', 'M_Comedy', 'M_Crime',
       'M_Documentary', 'M_Drama', 'M_Film-Noir', 'M_Horror', 'M_Musical',
       'M_Mystery', 'M_Romance ', 'M_Sci-Fi', 'M_Thriller', 'M_War',
       'M_Western', 'M_unknown']].values
    
    return wide_inputs, gender, occupation, genre, age
wide_inputs_train, gender_train, occupation_train, genre_train, age_train = input_values(X_train)
wide_inputs_test, gender_test, occupation_test, genre_test, age_test = input_values(X_test)

In [15]:
gender = Input(shape=(1,), name='gender')
occupation = Input(shape=(1,), name='occupation')
genre = Input(shape=(1,), name='genre')
age = Input(shape=(1,), name='age')

In [16]:
emb_1 = Flatten()(Embedding(input_dim=50, output_dim=64,input_length=1, name='emb_1')(gender))
emb_2 = Flatten()(Embedding(input_dim=50, output_dim=64,input_length=1, name='emb_2')(occupation))
emb_3 = Flatten()(Embedding(input_dim=50, output_dim=64,input_length=1, name='emb_3')(genre))
concat_emb = concatenate([emb_1, emb_2, emb_3, age])
concat_emb = Dropout(rate=0.2)(concat_emb)
x1 = Dense(64, activation='relu')(concat_emb)
x1 = Dropout(rate=0.2)(x1)
x2 = Dense(64, activation='relu')(x1)
x2 = Dropout(rate=0.2)(x2)
x3 = Dense(64, activation='relu')(x2)
x3 = Dropout(rate=0.2)(x3)
x4 = Dense(64, activation='relu')(merge.add([x1, x3]))
deep_output = Dense(64, activation='relu')(x4)

In [17]:
wide_inputs = Input(shape=(len(df_wide.columns),), name='wide_inputs')

In [18]:
# Combine wide and deep into one model
wide_and_deep = Dropout(rate=0.2)(concatenate([wide_inputs, deep_output]))
output = Dense(1, activation='relu')(wide_and_deep)
model = Model(inputs=[wide_inputs] + [gender] + [occupation] + [genre] + [age], outputs=output)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
gender (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
occupation (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
genre (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
emb_1 (Embedding)               (None, 1, 64)        3200        gender[0][0]                     
______________________________________________________________________________________________

In [19]:
model.compile(loss='mse', optimizer='adam', metrics=['mse'])

In [20]:
model.fit(x={'wide_inputs': wide_inputs_train,'gender': gender_train,'occupation': occupation_train,'genre': genre_train,
             'age': age_train},y=y_train,batch_size=32, epochs=50, verbose=1, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1e218031d00>

In [21]:
results=model.evaluate(x={'wide_inputs': wide_inputs_test,'gender': gender_test,'occupation': occupation_test,
                                'genre': genre_test,'age': age_test},y=y_test,batch_size=32, verbose=1)



In [22]:
print("test loss, test mse:", results)

test loss, test mse: [1.2099841833114624, 1.2099841833114624]


In [23]:
predictions = model.predict(x={'wide_inputs': wide_inputs_test,'gender': gender_test,'occupation': occupation_test,
                               'genre': genre_test,'age': age_test},batch_size=32, verbose=1)



In [24]:
predictions

array([[3.4743001],
       [3.7015936],
       [3.46363  ],
       ...,
       [3.3852673],
       [3.4853547],
       [3.8298736]], dtype=float32)

In [30]:
    df['rating'].max()

5