In [1]:
import os
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.layers import Input, Embedding, concatenate, Flatten, Dense, Dropout, merge
from keras.models import Model

In [2]:
# Column headers
data_cols = ['user id', 'movie id', 'rating', 'timestamp']
item_cols = ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown',
             'Action','Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
             'Documentary', 'Drama', 'Fantasy', 'Film-Noir','Horror', 'Musical', 
             'Mystery', 'Romance ', 'Sci-Fi', 'Thriller', 'War', 'Western']
user_cols = ['user id', 'age', 'gender', 'occupation', 'zip code']
# Importing users, items and data
users = pd.read_csv('ml-100k/u.user', sep='|', names=user_cols, encoding='latin-1')
item = pd.read_csv('ml-100k/u.item', sep='|', names=item_cols, encoding='latin-1')
data = pd.read_csv('ml-100k/u.data', sep='\t', names=data_cols, encoding='latin-1')
df = pd.merge(pd.merge(item, data), users)

In [3]:
df

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Childrens,...,Thriller,War,Western,user id,rating,timestamp,age,gender,occupation,zip code
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,308,4,887736532,60,M,retired,95076
1,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,308,5,887737890,60,M,retired,95076
2,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,1,0,0,308,4,887739608,60,M,retired,95076
3,7,Twelve Monkeys (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Twelve%20Monk...,0,0,0,0,0,...,0,0,0,308,4,887738847,60,M,retired,95076
4,8,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,0,0,1,...,0,0,0,308,5,887736696,60,M,retired,95076
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,748,"Saint, The (1997)",14-Mar-1997,,http://us.imdb.com/M/title-exact?Saint%2C%20Th...,0,1,0,0,0,...,1,0,0,729,4,893286638,19,M,student,56567
99996,751,Tomorrow Never Dies (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?imdb-title-12...,0,1,0,0,0,...,1,0,0,729,3,893286338,19,M,student,56567
99997,879,"Peacemaker, The (1997)",01-Jan-1997,,http://us.imdb.com/M/title-exact?Peacemaker%2C...,0,1,0,0,0,...,1,1,0,729,3,893286299,19,M,student,56567
99998,894,Home Alone 3 (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?imdb-title-11...,0,0,0,0,1,...,0,0,0,729,1,893286511,19,M,student,56567


In [4]:
#augment gender and occupation into a single category to gain more insight about the person 
df_wide = df[['gender', 'occupation']]
df_wide['gender_occupation'] = df_wide['gender'] + "_" + df_wide['occupation']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wide['gender_occupation'] = df_wide['gender'] + "_" + df_wide['occupation']


In [5]:
df_wide

Unnamed: 0,gender,occupation,gender_occupation
0,M,retired,M_retired
1,M,retired,M_retired
2,M,retired,M_retired
3,M,retired,M_retired
4,M,retired,M_retired
...,...,...,...
99995,M,student,M_student
99996,M,student,M_student
99997,M,student,M_student
99998,M,student,M_student


In [7]:
encode = OneHotEncoder(handle_unknown='ignore')
encode.fit(df_wide[['gender_occupation']])
encoded = pd.DataFrame(encode.transform(df_wide[['gender_occupation']]).toarray(), columns=encode.get_feature_names())

In [8]:
df_wide = df_wide.join(encoded)
df_wide.drop(['gender', 'occupation', 'gender_occupation'], axis=1, inplace=True)

In [9]:
df_wide.columns = df_wide.columns.str.lstrip("x0_")

In [10]:
df_wide

Unnamed: 0,F_administrator,F_artist,F_educator,F_engineer,F_entertainment,F_executive,F_healthcare,F_homemaker,F_lawyer,F_librarian,...,M_marketing,M_none,M_other,M_programmer,M_retired,M_salesman,M_scientist,M_student,M_technician,M_writer
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
99996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
99997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
99998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [11]:
df_deep = df[['age', 'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary',
              'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance ', 'Sci-Fi', 'Thriller', 'War',
              'Western', 'gender', 'occupation']]

In [12]:
df_deep

Unnamed: 0,age,unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,gender,occupation
0,60,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,M,retired
1,60,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,M,retired
2,60,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,1,0,0,M,retired
3,60,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,M,retired
4,60,0,0,0,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,M,retired
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,19,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,M,student
99996,19,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,M,student
99997,19,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,M,student
99998,19,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,M,student


In [13]:
df_deep['genre'] = df_deep[['unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary',
                            'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance ', 'Sci-Fi',
                            'Thriller', 'War', 'Western']].idxmax(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_deep['genre'] = df_deep[['unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary',


In [14]:
df_deep

Unnamed: 0,age,unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,gender,occupation,genre
0,60,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,M,retired,Animation
1,60,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,M,retired,Action
2,60,0,0,0,0,0,0,1,0,1,...,0,0,0,0,1,0,0,M,retired,Crime
3,60,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,M,retired,Drama
4,60,0,0,0,0,1,1,0,0,1,...,0,0,0,0,0,0,0,M,retired,Childrens
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,19,0,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,M,student,Action
99996,19,0,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,M,student,Action
99997,19,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,M,student,Action
99998,19,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,M,student,Childrens


In [15]:
df_deep.drop(columns=['unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary',
                      'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance ', 'Sci-Fi', 'Thriller',
                      'War', 'Western'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [16]:
df_deep

Unnamed: 0,age,gender,occupation,genre
0,60,M,retired,Animation
1,60,M,retired,Action
2,60,M,retired,Crime
3,60,M,retired,Drama
4,60,M,retired,Childrens
...,...,...,...,...
99995,19,M,student,Action
99996,19,M,student,Action
99997,19,M,student,Action
99998,19,M,student,Childrens


In [17]:
# Encode categorical features
for feature in ['gender', 'occupation', 'genre']:
    encoder = LabelEncoder()
    encoder.fit(df_deep[[feature]])
    transformed_feature = encoder.transform(df_deep[[feature]])
    df_deep[feature] = transformed_feature

# Min-max scaling for numerical features
for feature in ['age']:
    scaler = MinMaxScaler()
    scaler.fit(df_deep[[feature]])
    transformed_feature = scaler.transform(df_deep[[feature]])
    df_deep[feature] = transformed_feature

  return f(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_deep[feature] = transformed_feature
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_deep[feature] = transformed_feature


In [18]:
df_deep

Unnamed: 0,age,gender,occupation,genre
0,0.803030,1,15,2
1,0.803030,1,15,0
2,0.803030,1,15,5
3,0.803030,1,15,7
4,0.803030,1,15,3
...,...,...,...,...
99995,0.181818,1,18,0
99996,0.181818,1,18,0
99997,0.181818,1,18,0
99998,0.181818,1,18,3


In [19]:
# Split data

X = pd.concat([df_wide, df_deep], axis=1)
y = df[['rating']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

def input_values(X):
    gender = X[['gender']].values
    occupation = X[['occupation']].values
    genre = X[['genre']].values
    age = X[['age']].values
    
    wide_inputs = X[['F_administrator', 'F_artist', 'F_educator', 'F_engineer',
                     'F_entertainment', 'F_executive', 'F_healthcare',
                     'F_homemaker', 'F_lawyer', 'F_librarian', 'F_marketing',
                     'F_none', 'F_other', 'F_programmer', 'F_retired',
                     'F_salesman', 'F_scientist', 'F_student', 'F_technician',
                     'F_writer', 'M_administrator', 'M_artist', 'M_doctor',
                     'M_educator', 'M_engineer', 'M_entertainment',
                     'M_executive', 'M_healthcare', 'M_homemaker', 'M_lawyer',
                     'M_librarian', 'M_marketing', 'M_none', 'M_other',
                     'M_programmer', 'M_retired', 'M_salesman', 'M_scientist',
                     'M_student', 'M_technician', 'M_writer']].values
    
    return wide_inputs, gender, occupation, genre, age

wide_inputs_train, gender_train, occupation_train, genre_train, age_train = input_values(X_train)

wide_inputs_test, gender_test, occupation_test, genre_test, age_test = input_values(X_test)

y_train = y_train.values
y_test = y_test.values

In [20]:
gender = Input(shape=(1,), name='gender')
emb_1 = Embedding(input_dim=50, output_dim=64,input_length=1, name='emb_1')(gender)
emb_1 = Flatten()(emb_1)
occupation = Input(shape=(1,), name='occupation')
emb_2 = Embedding(input_dim=50, output_dim=64,input_length=1, name='emb_2')(occupation)
emb_2 = Flatten()(emb_2)
genre = Input(shape=(1,), name='genre')
emb_3 = Embedding(input_dim=50, output_dim=64,input_length=1, name='emb_3')(genre)
emb_3 = Flatten()(emb_3)
age = Input(shape=(1,), name='age')

concatenated_embeddings = concatenate([emb_1, emb_2, emb_3, age])
concatenated_embeddings = Dropout(rate=0.2)(concatenated_embeddings)

x1 = Dense(64, activation='relu')(concatenated_embeddings)
x1 = Dropout(rate=0.2)(x1)
x2 = Dense(64, activation='relu')(x1)
x2 = Dropout(rate=0.2)(x2)
x3 = Dense(64, activation='relu')(x2)
x3 = Dropout(rate=0.2)(x3)
x4 = Dense(64, activation='relu')(merge.add([x1, x3]))


deep_output = Dense(64, activation='relu')(x4)

In [21]:
num_features = len(df_wide.columns)
wide_inputs = Input(shape=(num_features,), name='wide_inputs')

In [22]:
# Combine wide and deep into one model
x = concatenate([wide_inputs, deep_output])
x = Dropout(rate=0.2)(x)
wide_and_deep_output = Dense(1, activation='relu')(x)
wide_and_deep_model = Model(inputs=[wide_inputs] + [gender] + [occupation] + [genre] + [age], outputs=wide_and_deep_output)
wide_and_deep_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
gender (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
occupation (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
genre (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
emb_1 (Embedding)               (None, 1, 64)        3200        gender[0][0]                     
______________________________________________________________________________________________

In [23]:
wide_and_deep_model.compile(loss='mse', optimizer='adam', metrics=['mse'])

In [24]:
wide_and_deep_model.fit(x={'wide_inputs': wide_inputs_train,
                           'gender': gender_train,
                           'occupation': occupation_train,
                           'genre': genre_train,
                           'age': age_train},
                        y=y_train,
                        batch_size=32, epochs=50, verbose=1, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50

KeyboardInterrupt: 

In [None]:
results=wide_and_deep_model.evaluate(x={'wide_inputs': wide_inputs_test,
                                'gender': gender_test,
                                'occupation': occupation_test,
                                'genre': genre_test,
                                'age': age_test},
                             y=y_test,
                             batch_size=32, verbose=1)

In [None]:
print("test loss, test mse:", results)