In [1]:
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
from tqdm import tqdm,tqdm_notebook

## Initialize Relation Attributes

In [2]:
def _parse_header(spec):
    """Turn a ' | '-separated header description into a list of column names."""
    return spec.split(' | ')


# Column names for the ratings, movie-info and user-info tables; they are passed
# as `names=` when the header-less data files are read.
usr_ratings_cols = _parse_header('user id | item id | rating | timestamp')
movie_info_cols = _parse_header('''movie id | movie title | release date | video release date | IMDb URL | unknown | Action | Adventure | Animation | Children's | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western''')
usr_info_cols = _parse_header('user id | age | gender | occupation | zip code')

## Initialize Relation URLs

In [3]:
# All MovieLens-100k files live under the same directory; keeping the prefix in
# one place means a mirror or local copy can be swapped in with a single edit.
ML_100K_BASE_URL = 'http://files.grouplens.org/datasets/movielens/ml-100k/'

usr_ratings_url = ML_100K_BASE_URL + 'u.data'        # user/item ratings
movie_info_url  = ML_100K_BASE_URL + 'u.item'        # movie metadata and genre flags
usr_info_url    = ML_100K_BASE_URL + 'u.user'        # user demographics
genre_info_url  = ML_100K_BASE_URL + 'u.genre'       # genre list
ocp_info_url    = ML_100K_BASE_URL + 'u.occupation'  # occupation list

## Read CSVs

In [4]:
# Options common to every file read below: no header row, Latin-1 encoded text.
_read_opts = dict(header=None, encoding='latin-1')

# The ratings file is tab-separated; the remaining files use '|'.
ratings = pd.read_csv(usr_ratings_url, delimiter='\t', names=usr_ratings_cols, **_read_opts)
movies = pd.read_csv(movie_info_url, delimiter='|', names=movie_info_cols, **_read_opts)
users = pd.read_csv(usr_info_url, delimiter='|', names=usr_info_cols, **_read_opts)

# Only the first column of the genre table is kept.
genres = pd.read_csv(
    genre_info_url, delimiter='|', names=['genre_id', 'genre_code'], **_read_opts
)[['genre_id']]

# The occupation table is read as a single column.
ocps = pd.read_csv(ocp_info_url, delimiter='|', names=['ocp_id'], **_read_opts)[['ocp_id']]

In [5]:
# Preview the first three rows of the freshly loaded movies table.
movies.head(3)

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### Data Preprocessing Part 1: Movies
  * Drop the video-release-date, IMDb-URL, and movie-title features
  * From the release-date feature keep only the year of release; discard the day and month  

In [6]:
# Remove the columns that are not used as features.
movies = movies.drop(columns=['video release date', 'IMDb URL', 'movie title'])

In [7]:
from datetime import datetime  # not needed by this cell any more; kept in case other cells rely on the name

# Discard every row that has a missing value in any remaining column; a missing
# 'release date' could not be parsed below.
movies = movies.dropna()

# Parse dates such as '01-Jan-1995' in a single vectorized call and keep only the
# year. The explicit int64 cast matches the dtype the previous list-of-ints
# implementation produced.
movies['release-year'] = (
    pd.to_datetime(movies['release date'], format='%d-%b-%Y')
    .dt.year
    .astype('int64')
)

In [8]:
# The full date is redundant once 'release-year' exists.
movies = movies.drop(columns=['release date'])

### Data Preprocessing Part 2: Categorical Data Processing
  * Gender: **1** for **female**; **0** for **male**

In [9]:
# Encode gender as an integer flag: 0 where the value is 'M', 1 for anything else.
# A vectorized comparison replaces the per-element lambda and the temporary
# list/Series that used to be left behind in the notebook namespace.
users['gender'] = (users['gender'] != 'M').astype('int64')

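  * Occupation: Assign each occupation an increasing integer code starting from 0 (in order of first appearance); normalizing the codes is mentioned as a follow-up step but is not performed in the cells shown here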

In [10]:
# Map each distinct occupation to an integer code in order of first appearance.
# Returns a (codes, uniques) pair. The `order` and `na_sentinel` keywords that
# were passed here before are no longer accepted by current pandas; the defaults
# (unsorted, missing values coded as -1) give the same result.
ocp_enc = pd.factorize(users['occupation'])

In [11]:
# Store the integer codes (first element of the factorize result) as a new column.
# NOTE(review): the column name 'ocupation' is misspelled, but it ends up in the
# saved CSVs, so it is left unchanged here.
users['ocupation'] = ocp_enc[0].tolist()

# Display the (codes, uniques) pair for inspection.
ocp_enc

(array([ 0,  1,  2,  0,  1,  3,  4,  4,  5,  6,  1,  1,  7,  8,  7,  9, 10,
         1, 11, 12,  2,  2, 13, 13, 14, 14, 11,  2, 10,  5, 13,  5,  5,  4,
        12,  5,  5,  1,  9,  8, 14,  4, 11,  0, 10, 15, 15,  4,  5,  2,  7,
         5, 10,  3, 10, 11, 16, 10,  7, 17, 14,  4, 15,  7,  7,  5,  5,  5,
        14, 14,  8,  4,  5,  8,  9,  5,  0,  4,  4,  4,  5, 10,  1,  3,  7,
         4,  4, 11,  4,  7, 15,  9,  3,  5,  4, 13, 13,  3,  5,  3,  5, 10,
         5,  5, 14, 18,  8,  7,  1,  5, 14, 19,  3, 10, 14, 17,  5,  4, 10,
         1, 11,  2, 13,  5,  6,  6, 16, 15, 15, 16,  4,  1, 14, 10,  5,  1,
         7, 20,  5,  5, 10,  1,  0, 10,  9, 13, 11, 14, 15, 13,  4,  7,  5,
         5,  1,  7, 14,  7,  5, 10,  6, 13,  4, 17,  1,  7,  1,  1,  1, 17,
         7, 15,  1,  4,  8,  8, 10,  1,  9,  4,  3, 10,  8, 11, 11,  3,  7,
         5, 13,  4,  4,  7,  5,  4,  8,  2,  0,  5,  2, 10,  2,  7,  5, 11,
         6,  5, 15, 14,  7, 14, 19,  7,  3, 11, 10, 14,  1,  4, 10, 11,  5,
        10, 

In [12]:
# The textual occupation is replaced by its integer code, so drop the original.
users = users.drop(columns=['occupation'])

In [13]:
# Preview the users table after the gender and occupation encoding.
users.head(3)

Unnamed: 0,user id,age,gender,zip code,ocupation
0,1,24,0,85711,0
1,2,53,1,94043,1
2,3,23,0,32067,2


  * Age: Group each age in groups of 10 (index begins from 0)

In [14]:
# Bucket ages into decades with integer division (e.g. 24 -> 2, 53 -> 5).
# Vectorized replacement for the former element-by-element loop.
# NOTE: the column is overwritten in place, so re-running this cell without
# reloading `users` divides the values a second time.
users['age'] = users['age'] // 10

In [15]:
# Preview the users table after the age bucketing.
users.head(10)

Unnamed: 0,user id,age,gender,zip code,ocupation
0,1,2,0,85711,0
1,2,5,1,94043,1
2,3,2,0,32067,2
3,4,2,0,43537,0
4,5,3,1,15213,1
5,6,4,0,98101,3
6,7,5,0,91344,4
7,8,3,0,5201,4
8,9,2,0,1002,5
9,10,5,0,90703,6


In [16]:
# Preview the processed movies table (genre flags plus 'release-year').
movies.head(10)

Unnamed: 0,movie id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release-year
0,1,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
1,2,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995
3,4,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1995
4,5,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,1,0,0,1995
5,6,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1995
6,7,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1995
7,8,0,0,0,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1995
8,9,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1995
9,10,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1996


  * Drop the timestamp column (the date and time at which the user submitted the rating)

In [17]:
# The rating time is not used as a feature.
ratings = ratings.drop(columns=['timestamp'])

  * Rating: Convert to target with the following approach:

* People who rated a movie with **2 or fewer** stars **did not like that movie and there's a big chance that they're not going to see that movie again**; these ratings become target **0**, while ratings of **3 or more** stars become target **1**

In [21]:
# Build a new frame in which 'rating' is renamed to 'target' and binarized:
# 1 when the rating is 3 or more, 0 otherwise. `ratings` itself is left untouched.
tmp = (
    ratings
    .rename(columns={'rating': 'target'})
    .assign(target=lambda frame: (frame['target'] >= 3).astype('int64'))
)

In [22]:
# Keep an independent copy of the binarized ratings under its final name.
ratings3 = tmp.copy(deep=True)

In [23]:
# Preview the binarized ratings table.
ratings3.head(3)

Unnamed: 0,user id,item id,target
0,196,242,1
1,186,302,1
2,22,377,0


## Checkpoint...

In [24]:
# Persist the three processed tables. `to_csv` does not create missing parent
# directories, so make sure the output directory exists first.
checkpoint_dir = 'comp-data/4a-training-dataset-creation'
os.makedirs(checkpoint_dir, exist_ok=True)

movies.to_csv(f'{checkpoint_dir}/movies.csv', index=False)
users.to_csv(f'{checkpoint_dir}/users.csv', index=False)
ratings3.to_csv(f'{checkpoint_dir}/ratings3.csv', index=False)

## Creating the Dataset that will Train the LS and MLP Classifier...

In [25]:
# Attach each rating to its user's attributes, then to the rated movie's
# attributes, and finally drop the identifier columns so that only feature
# columns and the target remain.
df3 = (
    users
    .merge(ratings3, how='inner', on='user id')
    .merge(movies, how='inner', left_on='item id', right_on='movie id')
    .drop(columns=['item id', 'movie id', 'user id'])
)

## Saving the Dataset for future use...

In [26]:
# Write the training set to disk. `to_csv` does not create missing parent
# directories, so create the target directory if it is not there yet.
train_path = 'comp-data/4a-training-dataset-creation/train.csv'
os.makedirs(os.path.dirname(train_path), exist_ok=True)
df3.to_csv(train_path, index=False)