In [1]:
import pandas as pd
import numpy as np
import os
import warnings

In [2]:
warnings.filterwarnings('ignore')

## users.dat

User information is in the file "users.dat" and is in the following
format:

UserID::Gender::Age::Occupation::Zip-code

All demographic information is provided voluntarily by the users and is
not checked for accuracy.  Only users who have provided some demographic
information are included in this data set.

- Gender is denoted by a "M" for male and "F" for female
- Age is chosen from the following ranges:

	*  1:  "Under 18"
	* 18:  "18-24"
	* 25:  "25-34"
	* 35:  "35-44"
	* 45:  "45-49"
	* 50:  "50-55"
	* 56:  "56+"

- Occupation is chosen from the following choices:

	*  0:  "other" or not specified
	*  1:  "academic/educator"
	*  2:  "artist"
	*  3:  "clerical/admin"
	*  4:  "college/grad student"
	*  5:  "customer service"
	*  6:  "doctor/health care"
	*  7:  "executive/managerial"
	*  8:  "farmer"
	*  9:  "homemaker"
	* 10:  "K-12 student"
	* 11:  "lawyer"
	* 12:  "programmer"
	* 13:  "retired"
	* 14:  "sales/marketing"
	* 15:  "scientist"
	* 16:  "self-employed"
	* 17:  "technician/engineer"
	* 18:  "tradesman/craftsman"
	* 19:  "unemployed"
	* 20:  "writer"

In [3]:
columns = ['UserID','Gender','Age','Occupation','Zip-code']
users = pd.read_table('../dataset/ml-1m/users.dat',sep='::',names=columns)

In [4]:
users

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [5]:
n_users = users.UserID.nunique()
print(f'共有{n_users}个用户')

共有6040个用户


# movies.dat

Movie information is in the file "movies.dat" and is in the following
format:

MovieID::Title::Genres

- Titles are identical to titles provided by the IMDB (including
year of release)
- Genres are pipe-separated and are selected from the following genres:

	* Action
	* Adventure
	* Animation
	* Children's
	* Comedy
	* Crime
	* Documentary
	* Drama
	* Fantasy
	* Film-Noir
	* Horror
	* Musical
	* Mystery
	* Romance
	* Sci-Fi
	* Thriller
	* War
	* Western

- Some MovieIDs do not correspond to a movie due to accidental duplicate
entries and/or test entries
- Movies are mostly entered by hand, so errors and inconsistencies may exist

In [6]:
columns = ['MovieID','Title','Genres']
movies = pd.read_table('../dataset/ml-1m/movies.dat',sep='::',names=columns,encoding='ISO-8859-1')

In [7]:
movies

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [8]:
movies['Genres'].apply(lambda x:x.split('|'))

0        [Animation, Children's, Comedy]
1       [Adventure, Children's, Fantasy]
2                      [Comedy, Romance]
3                        [Comedy, Drama]
4                               [Comedy]
                      ...               
3878                            [Comedy]
3879                             [Drama]
3880                             [Drama]
3881                             [Drama]
3882                   [Drama, Thriller]
Name: Genres, Length: 3883, dtype: object

In [9]:
n_movies = movies.MovieID.nunique()
print(f'共有{n_movies}部电影')

共有3883部电影


## ratings.dat

All ratings are contained in the file "ratings.dat" and are in the
following format:

UserID::MovieID::Rating::Timestamp

- UserIDs range between 1 and 6040 
- MovieIDs range between 1 and 3952
- Ratings are made on a 5-star scale (whole-star ratings only)
- Timestamp is represented in seconds since the epoch as returned by time(2)
- Each user has at least 20 ratings

In [10]:
columns = ['UserID','MovieID','Rating','Timestamp']
ratings = pd.read_table('../dataset/ml-1m/ratings.dat',sep='::',names=columns)

In [11]:
ratings

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [12]:
avg_rating_num = len(ratings)/n_users
print(f'平均每个用户对 {avg_rating_num:.1f} 部电影进行评分')

平均每个用户对 165.6 部电影进行评分


### 每个用户的平均评分

In [13]:
ratings.groupby(by='UserID').agg({'Rating':np.mean})

Unnamed: 0_level_0,Rating
UserID,Unnamed: 1_level_1
1,4.188679
2,3.713178
3,3.901961
4,4.190476
5,3.146465
...,...
6036,3.302928
6037,3.717822
6038,3.800000
6039,3.878049


### 每部电影的平均得分

In [14]:
ratings.groupby(by='MovieID').agg({'Rating':np.mean})

Unnamed: 0_level_0,Rating
MovieID,Unnamed: 1_level_1
1,4.146846
2,3.201141
3,3.016736
4,2.729412
5,3.006757
...,...
3948,3.635731
3949,4.115132
3950,3.666667
3951,3.900000


## 合并三个表

In [15]:
new_users = users[['UserID','Gender','Age','Occupation']]
new_users = pd.get_dummies(new_users)

In [16]:
new_users

Unnamed: 0,UserID,Age,Occupation,Gender_F,Gender_M
0,1,1,10,1,0
1,2,56,16,0,1
2,3,25,15,0,1
3,4,45,7,0,1
4,5,25,20,0,1
...,...,...,...,...,...
6035,6036,25,15,1,0
6036,6037,45,1,1,0
6037,6038,56,1,1,0
6038,6039,45,0,1,0


In [17]:
new_movies = movies.copy()
new_movies['Time'] = movies['Title'].apply(lambda x:x[-5:-1])
new_movies['Title'] = movies['Title'].apply(lambda x:x[:-6])
new_movies['Genres'] = new_movies['Genres'].apply(lambda x:x.split('|'))

In [18]:
new_movies

Unnamed: 0,MovieID,Title,Genres,Time
0,1,Toy Story,"[Animation, Children's, Comedy]",1995
1,2,Jumanji,"[Adventure, Children's, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama]",1995
4,5,Father of the Bride Part II,[Comedy],1995
...,...,...,...,...
3878,3948,Meet the Parents,[Comedy],2000
3879,3949,Requiem for a Dream,[Drama],2000
3880,3950,Tigerland,[Drama],2000
3881,3951,Two Family House,[Drama],2000


In [19]:
Genres_type = ['Action','Adventure','Animation','Children\'s','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
for itype in Genres_type:
    new_movies[itype] = 0

In [20]:
genres = new_movies.loc[0,['Genres']]
[i for i in genres.values[0]]

['Animation', "Children's", 'Comedy']

In [21]:
for i in range(len(new_movies)):
    genres = new_movies.loc[i,['Genres']]
    for g in genres.values[0]:
        new_movies.loc[i,[g]]=1

In [22]:
del new_movies['Genres']

In [23]:
new_movies

Unnamed: 0,MovieID,Title,Time,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,1995,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents,2000,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3879,3949,Requiem for a Dream,2000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3880,3950,Tigerland,2000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3881,3951,Two Family House,2000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
new_users = users.copy()
del new_users['Zip-code']
new_users = pd.get_dummies(new_users)

In [25]:
data = pd.merge(pd.merge(new_users,ratings),new_movies)

In [26]:
data = data[data.Age>1]

In [27]:
data

Unnamed: 0,UserID,Age,Occupation,Gender_F,Gender_M,MovieID,Rating,Timestamp,Title,Time,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1,2,56,16,0,1,1193,5,978298413,One Flew Over the Cuckoo's Nest,1975,...,0,0,0,0,0,0,0,0,0,0
2,12,25,12,0,1,1193,4,978220179,One Flew Over the Cuckoo's Nest,1975,...,0,0,0,0,0,0,0,0,0,0
3,15,25,7,0,1,1193,4,978199279,One Flew Over the Cuckoo's Nest,1975,...,0,0,0,0,0,0,0,0,0,0
4,17,50,1,0,1,1193,5,978158471,One Flew Over the Cuckoo's Nest,1975,...,0,0,0,0,0,0,0,0,0,0
5,18,18,3,1,0,1193,4,978156168,One Flew Over the Cuckoo's Nest,1975,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,5949,18,17,0,1,2198,5,958846401,Modulations,1998,...,0,0,0,0,0,0,0,0,0,0
1000205,5675,35,14,0,1,2703,3,976029116,Broken Vessels,1998,...,0,0,0,0,0,0,0,0,0,0
1000206,5780,18,17,0,1,2845,1,958153068,White Boys,1999,...,0,0,0,0,0,0,0,0,0,0
1000207,5851,18,20,1,0,3607,5,957756608,One Little Indian,1973,...,0,0,0,0,0,0,0,0,0,1


## 划分数据集

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split




In [37]:
data = pd.read_table('../dataset/ml-1m/ratings.dat', sep='::', names = ['userID','itemID','Rating','Zip-code'])
del data['Zip-code']
X_train, X_test, y_train, y_test = train_test_split(data[['userID','itemID']], data[['userID','Rating']],test_size=0.2)


  data = pd.read_table('../dataset/ml-1m/ratings.dat', sep='::', names = ['userID','itemID','Rating','Zip-code'])


In [42]:
y_train

Unnamed: 0,userID,Rating
254907,1551,4
696861,4169,3
273589,1649,5
529624,3272,4
364254,2122,4
...,...,...
931375,5626,5
232435,1411,3
491082,3018,4
327171,1939,4


In [35]:
tra_users = X_train.groupby('userID')['itemID','Rating'].apply(list)
tra_items = X_train.groupby('itemID')['userID'].apply(list).to_dict()

{1193: 5}

In [None]:
del data['Zip-code']
X_train, X_test = train_test_split(data[['userID','itemID','Rating']], test_size=0.2)

In [25]:
tra_users = X_train.groupby('userID')['itemID','Rating'].apply(lambda x: {x.loc['itemID']:x.loc['Rating']})

  tra_users = X_train.groupby('userID')['itemID','Rating'].apply(lambda x: {x.loc['itemID']:x.loc['Rating']})


KeyError: 'itemID'

In [19]:
tra_users = X_train.groupby('userID')['itemID','Rating'].apply(lambda x: {x.loc['itemID']:x.loc['Rating']})

  tra_users = X_train.groupby('userID')['itemID','Rating'].apply(lambda x: {x.loc['itemID']:x.loc['Rating']})


KeyError: "Columns not found: 'Rating'"

In [18]:
tra_users

{'userID': {0: 1,
  1: 2,
  2: 3,
  3: 4,
  4: 5,
  5: 6,
  6: 7,
  7: 8,
  8: 9,
  9: 10,
  10: 11,
  11: 12,
  12: 13,
  13: 14,
  14: 15,
  15: 16,
  16: 17,
  17: 18,
  18: 19,
  19: 20,
  20: 21,
  21: 22,
  22: 23,
  23: 24,
  24: 25,
  25: 26,
  26: 27,
  27: 28,
  28: 29,
  29: 30,
  30: 31,
  31: 32,
  32: 33,
  33: 34,
  34: 35,
  35: 36,
  36: 37,
  37: 38,
  38: 39,
  39: 40,
  40: 41,
  41: 42,
  42: 43,
  43: 44,
  44: 45,
  45: 46,
  46: 47,
  47: 48,
  48: 49,
  49: 50,
  50: 51,
  51: 52,
  52: 53,
  53: 54,
  54: 55,
  55: 56,
  56: 57,
  57: 58,
  58: 59,
  59: 60,
  60: 61,
  61: 62,
  62: 63,
  63: 64,
  64: 65,
  65: 66,
  66: 67,
  67: 68,
  68: 69,
  69: 70,
  70: 71,
  71: 72,
  72: 73,
  73: 74,
  74: 75,
  75: 76,
  76: 77,
  77: 78,
  78: 79,
  79: 80,
  80: 81,
  81: 82,
  82: 83,
  83: 84,
  84: 85,
  85: 86,
  86: 87,
  87: 88,
  88: 89,
  89: 90,
  90: 91,
  91: 92,
  92: 93,
  93: 94,
  94: 95,
  95: 96,
  96: 97,
  97: 98,
  98: 99,
  99: 100,
  100: 1

In [15]:
test_users = X_test.groupby('userID')['itemID'].apply(list).to_dict()
test_items = X_test.groupby('itemID')['userID'].apply(list).to_dict()
