## Import Libraries

In [2]:
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
from tqdm import tqdm,tqdm_notebook

In [3]:
import lonely_boy2 as lb2

## Initialize Relation Attributes

In [4]:
# Column labels for the three MovieLens relations. The files are read with
# header=None, so these lists are the only source of column names.
usr_ratings_cols = ['user id', 'item id', 'rating', 'timestamp']
movie_info_cols = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
usr_info_cols = ['user id', 'age', 'gender', 'occupation', 'zip code']

## Initialize Relation URLs

In [5]:
# All five relations are served from the same MovieLens 100k directory, so the
# common prefix is written once and each file name is appended to it.
ml_100k_base_url = 'http://files.grouplens.org/datasets/movielens/ml-100k/'

usr_ratings_url = ml_100k_base_url + 'u.data'
movie_info_url = ml_100k_base_url + 'u.item'
usr_info_url = ml_100k_base_url + 'u.user'
genre_info_url = ml_100k_base_url + 'u.genre'
ocp_info_url = ml_100k_base_url + 'u.occupation'

## Read CSVs

In [6]:
# Options shared by every read below: the files have no header row and are
# decoded as latin-1.
read_opts = dict(header=None, encoding='latin-1')

# The ratings file is tab-separated; every other file is pipe-separated.
ratings = pd.read_csv(usr_ratings_url, sep='\t', names=usr_ratings_cols, **read_opts)
movies = pd.read_csv(movie_info_url, sep='|', names=movie_info_cols, **read_opts)
users = pd.read_csv(usr_info_url, sep='|', names=usr_info_cols, **read_opts)

# Only the first column of the genre table is kept (as a one-column frame).
genres = pd.read_csv(genre_info_url, sep='|', names=['genre_id', 'genre_code'], **read_opts)
genres = genres.loc[:, ['genre_id']]

# The occupation table is read with a single named column.
ocps = pd.read_csv(ocp_info_url, sep='|', names=['ocp_id'], **read_opts)
ocps = ocps.loc[:, ['ocp_id']]

In [7]:
# Preview the first three user records.
users.head(n=3)

Unnamed: 0,user id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067


In [8]:
# Preview the first three rating records.
ratings.head(n=3)

Unnamed: 0,user id,item id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116


In [9]:
# Preview the first three movie records, including their genre flag columns.
movies.head(n=3)

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
# Attach each movie's metadata and genre flags to its ratings. merge() performs
# an inner join by default, matching 'item id' in ratings to 'movie id' in movies.
ratings_full = ratings.merge(movies, left_on='item id', right_on='movie id')

# Drop the columns that are not needed for the per-user genre features.
unused_cols = ['item id', 'timestamp', 'movie title', 'release date', 'video release date', 'IMDb URL']
ratings_full = ratings_full.drop(unused_cols, axis=1)

# Keep only ratings of at least `min_rating`. A boolean mask builds a new frame
# rather than dropping rows in place by index label, and the threshold is named
# instead of being a bare literal. The retained rows keep their original index
# labels, exactly as the in-place drop did.
min_rating = 3
ratings_full = ratings_full[ratings_full['rating'] >= min_rating]

In [11]:
# Feature frame: 'user id' followed by the genre flag columns only.
ratings_full3 = ratings_full.drop(labels=['rating', 'movie id'], axis='columns')

In [12]:
# Report (rows, columns) for the filtered frame and for the feature frame
# derived from it, in that order.
for frame in (ratings_full, ratings_full3):
    print(frame.shape)

(82520, 22)
(82520, 20)


In [13]:
# Preview the first two rows of the feature frame.
ratings_full3.head(n=2)

Unnamed: 0,user id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,63,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [14]:
# Per-user genre totals: for every user, sum each genre flag over all of that
# user's rows in ratings_full3.
#
# One groupby replaces a Python loop that re-scanned the whole frame once per
# user and assembled each output row by hand. sort=False keeps users in order
# of first appearance, which is the order that iterating over
# ratings_full3['user id'].unique() produced, so row order is unchanged.
cols = list(ratings_full3.columns)
df = ratings_full3.groupby('user id', sort=False, as_index=False).sum()

# Pin the column order to that of ratings_full3 ('user id' first, then genres).
df = df[cols]

A Jupyter Widget




In [15]:
# Preview the per-user genre totals for the first three users.
df.head(n=3)

Unnamed: 0,user id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,0,0,1,0,2,27,0,1,11,1,0,0,0,0,9,1,0,1,0
1,63,0,9,6,2,3,26,2,1,36,0,1,1,0,3,18,7,12,4,0
2,226,0,7,4,3,1,11,3,1,25,0,1,0,2,3,4,9,11,5,2


In [16]:
# Feature matrix as a NumPy array: every column of df except the leading
# 'user id' column, one row per row of df.
x = df.values[:, 1:]

# MinMaxScaler rescales each column of its input, so the matrix is transposed
# before fitting and transposed back afterwards; the net effect is that every
# row of x is min-max scaled independently.
minimax = preprocessing.MinMaxScaler()
minimax.fit(x.T)
x_minimax_scaled = minimax.transform(x.T).T

# Standardise along axis 1, i.e. row by row as well.
x_stdscl = preprocessing.scale(x, axis=1)



In [None]:
# Show the row-wise min-max scaled feature matrix.
print(x_minimax_scaled)

In [None]:
# Show the row-wise standardised feature matrix.
print(x_stdscl)

In [18]:
# Persist the unscaled feature matrix. np.save does not create missing parent
# directories, so make sure the output directory exists first (a no-op when it
# is already there).
orig_out_path = 'comp-data/1-preprocessing-comp-data/user-feature-set-orig.npy'
os.makedirs(os.path.dirname(orig_out_path), exist_ok=True)
np.save(orig_out_path, x)

In [41]:
# Persist both scaled variants of the feature matrix. np.save does not create
# missing parent directories, so the output directory is created on demand
# (a no-op when it already exists).
scaled_outputs = [
    ('comp-data/1-preprocessing-comp-data/user-feature-set-minimax.npy', x_minimax_scaled),
    ('comp-data/1-preprocessing-comp-data/user-feature-set-stdscl.npy', x_stdscl),
]
for out_path, matrix in scaled_outputs:
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    np.save(out_path, matrix)