# data-preprocessing

In [1]:
usr_ratings_url = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.data'
movie_info_url  = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.item'
usr_info_url    = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.user'
genre_info_url  = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.genre'
ocp_info_url    = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.occupation'

---

## Part 1.  Importing the data
#### Part 1.1. Determining the Columns for each Dataset

In [2]:
import pandas as pd
import lonely_boy as lb

In [3]:
usr_ratings_cols = 'user id | item id | rating | timestamp'.split(' | ')
usr_ratings_cols

['user id', 'item id', 'rating', 'timestamp']

In [4]:
movie_info_cols = '''movie id | movie title | release date | video release date | IMDb URL | unknown | Action | Adventure | Animation | Children's | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western'''.split(' | ')
movie_info_cols

['movie id',
 'movie title',
 'release date',
 'video release date',
 'IMDb URL',
 'unknown',
 'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [5]:
usr_info_cols = 'user id | age | gender | occupation | zip code'.split(' | ')
usr_info_cols

['user id', 'age', 'gender', 'occupation', 'zip code']

#### Part 1.2. Importing the Datasets
---
  * **usr_ratings** corresponds to **u.data**
  * **movie_info** corresponds to **u.item**
  * **usr_info** corresponds to **u.user**
---
  * **genre_info** corresponds to **u.genre**
  * **ocp_info** corresponds to **u.occupation**

In [6]:
usr_ratings = pd.read_csv(usr_ratings_url, delimiter='\t', header=None, names=usr_ratings_cols, encoding='latin-1')

In [7]:
usr_ratings.head()

Unnamed: 0,user id,item id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [8]:
movie_info = pd.read_csv(movie_info_url, delimiter='|', header=None, names=movie_info_cols, encoding='latin-1')

In [9]:
movie_info.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
usr_info = pd.read_csv(usr_info_url, delimiter='|', header=None, names=usr_info_cols, encoding='latin-1')

In [11]:
usr_info.head()

Unnamed: 0,user id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [12]:
genre_info = pd.read_csv(genre_info_url, delimiter='|', header=None, names=['genre_id', 'genre_code'], encoding='latin-1')
genre_info = genre_info[['genre_id']]

In [13]:
genre_info.head()

Unnamed: 0,genre_id
0,unknown
1,Action
2,Adventure
3,Animation
4,Children's


In [14]:
ocp_info = pd.read_csv(ocp_info_url, delimiter='|', header=None, names=['ocp_id'], encoding='latin-1')
ocp_info = ocp_info[['ocp_id']]

In [15]:
ocp_info.head()

Unnamed: 0,ocp_id
0,administrator
1,artist
2,doctor
3,educator
4,engineer


In [17]:
ocp1hot = lb.addOneHot(ocp_info)

In [18]:
gnr1hot = lb.addOneHot(genre_info)

In [19]:
usr_info.merge(ocp1hot, left_on = 'occupation', right_on = 'ocp_id')

Unnamed: 0,user id,age,gender,occupation,zip code,ocp_id,ocp_id_administrator,ocp_id_artist,ocp_id_doctor,ocp_id_educator,...,ocp_id_marketing,ocp_id_none,ocp_id_other,ocp_id_programmer,ocp_id_retired,ocp_id_salesman,ocp_id_scientist,ocp_id_student,ocp_id_technician,ocp_id_writer
0,1,24,M,technician,85711,technician,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,4,24,M,technician,43537,technician,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,44,26,M,technician,46260,technician,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,77,30,M,technician,29379,technician,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,143,42,M,technician,08832,technician,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,197,55,M,technician,75094,technician,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6,244,28,M,technician,80525,technician,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
7,294,34,M,technician,92110,technician,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,311,32,M,technician,73071,technician,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,325,48,M,technician,02139,technician,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


#### Part 2. Data Preprocessing
---

In [10]:
tmp1 = pd.merge(usr_ratings, movie_info, on='item_id', how='inner')

KeyError: 'song_id'