# data-preprocessing

In [12]:
# Remote locations of the MovieLens 100k files used in this notebook.
# All five files live under the same folder on the GroupLens server.
_ML_100K_BASE_URL = 'http://files.grouplens.org/datasets/movielens/ml-100k/'

usr_ratings_url = _ML_100K_BASE_URL + 'u.data'
movie_info_url = _ML_100K_BASE_URL + 'u.item'
usr_info_url = _ML_100K_BASE_URL + 'u.user'
genre_info_url = _ML_100K_BASE_URL + 'u.genre'
ocp_info_url = _ML_100K_BASE_URL + 'u.occupation'

---

## Part 1.  Importing the data
#### Part 1.1. Determining the Columns for each Dataset

In [13]:
import os

import pandas as pd

In [14]:
# Column names for the user-ratings table, in file order.
usr_ratings_cols = ['userID', 'movieID', 'rating', 'timestamp']

In [15]:
# Column names for the movie table: five descriptive fields followed by
# one column per genre name.
movie_info_cols = [
    'movieID', 'title', 'releaseDate', 'videoReleaseDate', 'IMDbURL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's",
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
    'Sci-Fi', 'Thriller', 'War', 'Western',
]

In [16]:
# Column names for the user table, in file order.
usr_info_cols = ['userID', 'age', 'gender', 'occupation', 'zipCode']

#### Part 1.2. Importing the Datasets
---
  * **usr_ratings** corresponds to **u.data**
  * **movie_info** corresponds to **u.item**
  * **usr_info** corresponds to **u.user**
---
  * **genre_info** corresponds to **u.genre**
  * **ocp_info** corresponds to **u.occupation**

In [17]:
# Load the tab-separated ratings file; it has no header row, so the column
# names are supplied explicitly.
usr_ratings = pd.read_csv(
    usr_ratings_url,
    sep='\t',
    header=None,
    names=usr_ratings_cols,
    encoding='latin-1',
)

In [78]:
# Preview the first five ratings rows.
usr_ratings.head(n=5)

Unnamed: 0,userID,movieID,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [18]:
# Load the pipe-separated movie file; it has no header row, so the column
# names are supplied explicitly.
movie_info = pd.read_csv(
    movie_info_url,
    sep='|',
    header=None,
    names=movie_info_cols,
    encoding='latin-1',
)

In [80]:
# Preview the first five movie rows.
movie_info.head(n=5)

Unnamed: 0,movieID,title,releaseDate,videoReleaseDate,IMDbURL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [19]:
# Load the pipe-separated user file; it has no header row, so the column
# names are supplied explicitly.
usr_info = pd.read_csv(
    usr_info_url,
    sep='|',
    header=None,
    names=usr_info_cols,
    encoding='latin-1',
)

In [63]:
# Preview the first five user rows.
usr_info.head(n=5)

Unnamed: 0,userID,age,gender,occupation,zipCode
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [20]:
# Load the pipe-separated genre file with two named fields per row, then keep
# only the 'genreID' field as a one-column DataFrame.
genre_info = pd.read_csv(
    genre_info_url,
    sep='|',
    header=None,
    names=['genreID', 'genre_code'],
    encoding='latin-1',
)[['genreID']]

In [74]:
# Preview the first five genre rows.
genre_info.head(n=5)

Unnamed: 0,genreID
0,unknown
1,Action
2,Adventure
3,Animation
4,Children's


In [21]:
# Load the occupation file into a one-column DataFrame named 'ocpID'.
ocp_info = pd.read_csv(
    ocp_info_url,
    sep='|',
    header=None,
    names=['ocpID'],
    encoding='latin-1',
)[['ocpID']]

In [21]:
# Preview the first five occupation rows.
ocp_info.head(n=5)

Unnamed: 0,ocp_id
0,administrator
1,artist
2,doctor
3,educator
4,engineer


## Part 2.  Data Preprocessing
---
  * For **usr_ratings**, replace any non-existent rating values with the neutral value 3.

In [24]:
# Look for missing values in every column of usr_ratings.
# For each column, two filtered frames are printed in turn:
#   1. rows where the column compares equal to None, and
#   2. rows where the column is flagged by .isnull().
# NOTE(review): the elementwise `== None` comparison is not a reliable
# missing-value test in pandas; `.isnull()` is the check to rely on.
for column_name in ['userID', 'movieID', 'rating', 'timestamp']:
    column_values = usr_ratings[column_name]
    print (usr_ratings.loc[column_values == None])
    print (usr_ratings.loc[column_values.isnull()])

Empty DataFrame
Columns: [userID, movieID, rating, timestamp]
Index: []
Empty DataFrame
Columns: [userID, movieID, rating, timestamp]
Index: []
Empty DataFrame
Columns: [userID, movieID, rating, timestamp]
Index: []
Empty DataFrame
Columns: [userID, movieID, rating, timestamp]
Index: []
Empty DataFrame
Columns: [userID, movieID, rating, timestamp]
Index: []
Empty DataFrame
Columns: [userID, movieID, rating, timestamp]
Index: []
Empty DataFrame
Columns: [userID, movieID, rating, timestamp]
Index: []
Empty DataFrame
Columns: [userID, movieID, rating, timestamp]
Index: []


---
##### Make the Rating Matrix (USERS x MOVIES)
 * Solution: Pivoting the **usr_ratings** matrix with respect to *userID* and *movieID*

In [54]:
import numpy as np

In [73]:
# Reshape the long (userID, movieID, rating) records into a USERS x MOVIES
# grid: one row per userID, one column per movieID. User/movie pairs without
# a rating come out of the pivot as NaN and are filled with 0.
ratings_info = (
    usr_ratings[['userID', 'movieID', 'rating']]
    .pivot(index='userID', columns='movieID', values='rating')
    .fillna(0)
)
ratings_info.head(3)

movieID,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


  * __Naive Implementation:__
 Create a rating matrix where the columns are movies and the rows are users. From this point we could use Euclidean similarity on each row to show how close users are to each other; however, since the matrix is so sparse (most users have rated only a small selection of the entire set of movies), it likely won’t lead to adequate results.

 To better understand this case, imagine a situation where two users have similar movie tastes (old action movies), but there aren’t any movies that both of them have rated. In this case, they wouldn’t be considered similar, even though we know they are.
 
 One idea is to assign the **neutral rating value 3** to the unrated movies so as to keep this inconvenience from happening.

In [74]:
# Replace the movieID column labels with movie titles.
# The lookup is done by movieID rather than by assigning movie_info.title
# positionally, so each rating column gets the title that belongs to its own
# movieID even if the two tables are not in the same order or length.
title_by_movie_id = dict(zip(movie_info['movieID'], movie_info['title']))

ratings_info = (
    ratings_info
    .rename(columns=title_by_movie_id)
    # Name the column axis 'title', matching the label the columns carried
    # when they were taken directly from movie_info.title.
    .rename_axis('title', axis='columns')
    # Turn the userID index back into an ordinary column (no in-place mutation).
    .reset_index()
)
ratings_info.head(3)

title,userID,Toy Story (1995),GoldenEye (1995),Four Rooms (1995),Get Shorty (1995),Copycat (1995),Shanghai Triad (Yao a yao yao dao waipo qiao) (1995),Twelve Monkeys (1995),Babe (1995),Dead Man Walking (1995),...,Mirage (1995),Mamma Roma (1962),"Sunchaser, The (1996)","War at Home, The (1996)",Sweet Nothing (1995),Mat' i syn (1997),B. Monkey (1998),Sliding Doors (1998),You So Crazy (1994),Scream of Stone (Schrei aus Stein) (1991)
0,1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


  * Saving **Ratings Information (USERS x MOVIES) Matrix** to a csv **[name: ratings.info]**

In [75]:
# Write the USERS x MOVIES matrix to disk as comma-separated text.
ratings_csv_path = 'ml-100k-processed-data/ratings.info'
# Make sure the output folder exists first, so the save also works on a
# fresh checkout where it has not been created yet.
os.makedirs(os.path.dirname(ratings_csv_path), exist_ok=True)
ratings_info.to_csv(ratings_csv_path, sep=',', index=False)

---
##### Preparing the data for Processing
 * Solution:
   *  Dropping the **userID** column
   *  Getting the np.array from the DataFrame and Determining the optimal #clusters.

In [76]:
# Remove the userID column so that only the per-movie rating columns remain.
ratings_info = ratings_info.drop(columns=['userID'])
ratings_info.head(3)

title,Toy Story (1995),GoldenEye (1995),Four Rooms (1995),Get Shorty (1995),Copycat (1995),Shanghai Triad (Yao a yao yao dao waipo qiao) (1995),Twelve Monkeys (1995),Babe (1995),Dead Man Walking (1995),Richard III (1995),...,Mirage (1995),Mamma Roma (1962),"Sunchaser, The (1996)","War at Home, The (1996)",Sweet Nothing (1995),Mat' i syn (1997),B. Monkey (1998),Sliding Doors (1998),You So Crazy (1994),Scream of Stone (Schrei aus Stein) (1991)
0,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Convert the ratings DataFrame into a NumPy array and display it.
ratings = np.array(ratings_info)
ratings

---
##### Saving the ratings numpy array
 * Solution:
   *  Import pickle library
   *  Dump the data to the pickle file [name: **ratings-np-array.pkl**]
   *  To load the Saved Variable again, type: 
```python
ratings = open('ml-100k-temporary-data/ratings-np-array.pkl','rb')
ratings = pickle.load(ratings)
```

In [79]:
import pickle

In [81]:
# Serialize the ratings array so it can be reloaded later with pickle.load.
ratings_pkl_path = 'ml-100k-temporary-data/ratings-np-array.pkl'
# open(..., 'wb') does not create missing parent folders, so create the
# output folder up front to keep the save from failing on a fresh checkout.
os.makedirs(os.path.dirname(ratings_pkl_path), exist_ok=True)
with open(ratings_pkl_path, 'wb') as handle:
    pickle.dump(ratings, handle, protocol=pickle.HIGHEST_PROTOCOL)