In [2]:
usr_ratings_url = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.data'
movie_info_url  = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.item'
usr_info_url    = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.user'
genre_info_url  = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.genre'
ocp_info_url    = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.occupation'

In [6]:
import pandas as pd
import lonely_boy2 as lb2
from sklearn import preprocessing
import numpy as np

### Initialize correct column names

In [7]:
usr_ratings_cols = 'user id | item id | rating | timestamp'.split(' | ')
movie_info_cols = '''movie id | movie title | release date | video release date | IMDb URL | unknown | Action | Adventure | Animation | Children's | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western'''.split(' | ')
usr_info_cols = 'user id | age | gender | occupation | zip code'.split(' | ')

### Read CSVs

In [8]:
usr_ratings = pd.read_csv(usr_ratings_url, delimiter='\t', header=None, names=usr_ratings_cols, encoding='latin-1')

movie_info = pd.read_csv(movie_info_url, delimiter='|', header=None, names=movie_info_cols, encoding='latin-1')

usr_info = pd.read_csv(usr_info_url, delimiter='|', header=None, names=usr_info_cols, encoding='latin-1')

genre_info = pd.read_csv(genre_info_url, delimiter='|', header=None, names=['genre_id', 'genre_code'], encoding='latin-1')
genre_info = genre_info[['genre_id']]

ocp_info = pd.read_csv(ocp_info_url, delimiter='|', header=None, names=['ocp_id'], encoding='latin-1')
ocp_info = ocp_info[['ocp_id']]

### Create one-hot dataframes

In [9]:
ocp1hot = lb.addOneHot(ocp_info)
gnr1hot = lb.addOneHot(genre_info)

### Merge One-hots with original DFs

In [10]:
tmp1 = usr_info.merge(ocp1hot, left_on = 'occupation', right_on = 'ocp_id')
usr1hot = tmp1.drop(['occupation', 'ocp_id'],axis=1)

In [11]:
usr1hot.head()

Unnamed: 0,user id,age,gender,zip code,ocp_id_administrator,ocp_id_artist,ocp_id_doctor,ocp_id_educator,ocp_id_engineer,ocp_id_entertainment,...,ocp_id_marketing,ocp_id_none,ocp_id_other,ocp_id_programmer,ocp_id_retired,ocp_id_salesman,ocp_id_scientist,ocp_id_student,ocp_id_technician,ocp_id_writer
0,1,24,M,85711,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,4,24,M,43537,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,44,26,M,46260,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,77,30,M,29379,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,143,42,M,8832,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [12]:
mv1hot = movie_info.drop(movie_info.columns[1:5],axis=1)

### Merge all the dfs

In [13]:
tmp3 = usr_ratings.merge(usr1hot)

In [14]:
finaldf = tmp3.merge(mv1hot, left_on = 'item id', right_on = 'movie id')

In [15]:
finaldf = finaldf.drop(['user id', 'item id','timestamp','zip code'], axis =1)

In [16]:
bingend = list(map(lambda x:0 if (x=='M') else 1,finaldf['gender'].values))

In [17]:
col = pd.Series(bingend)
finaldf['bingender'] = col.values

In [18]:
finaldf = finaldf.drop(['gender'],axis=1)

In [21]:
finaldf 

Unnamed: 0,rating,age,ocp_id_administrator,ocp_id_artist,ocp_id_doctor,ocp_id_educator,ocp_id_engineer,ocp_id_entertainment,ocp_id_executive,ocp_id_healthcare,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,bingender
0,3,49,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,23,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,42,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4,60,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3,31,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,26,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6,4,27,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,5,25,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2,47,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,3,33,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [86]:
x = finaldf.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
finaldfnormed = pd.DataFrame(x_scaled)



In [91]:
finaldfnormed = np.matrix(finaldfnormed)

In [93]:
finaldfnormed

matrix([[ 0.5       ,  0.63636364,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  1.        ,  0.14336704,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ]])

In [94]:
np.save('finaldf.npy', finaldfnormed)