In [1]:
import sys
sys.path.append("..")

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat, get_feature_names
from deepctr_torch.models import DeepFM


def split(x):
    key_ans = x.split('|')
    for key in key_ans:
        if key not in key2index:
            # Notice : input value 0 is a special "padding",so we do not use 0 to encode valid feature for sequence input
            key2index[key] = len(key2index) + 1
    return list(map(lambda x: key2index[x], key_ans))

FILE_PATH = '/home/lionkim/data_set/movie_lens/'

sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip", ]
target = ['rating']

In [2]:
# 사용자 정보 읽어들이기
# u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
movies_cols = ['movie_id', 'title', 'genres']
movies = pd.read_csv (FILE_PATH + 'ml-1m/movies.dat', sep = '::', names = movies_cols)

user_cols = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_csv (FILE_PATH + 'ml-1m/users.dat', sep = '::', names = user_cols)

ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv (FILE_PATH + 'ml-1m/ratings.dat', sep = '::', names = ratings_cols)

rating_movie = pd.merge (ratings, movies, how = 'left', on = 'movie_id')

movie_lens = pd.merge (rating_movie, users, how = 'left', on = 'user_id')

  after removing the cwd from sys.path.
  import sys
  # Remove the CWD from sys.path while we load stuff.


In [3]:
movie_lens.head ()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067


In [4]:
movie_lens.info ()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000209 entries, 0 to 1000208
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   user_id     1000209 non-null  int64 
 1   movie_id    1000209 non-null  int64 
 2   rating      1000209 non-null  int64 
 3   timestamp   1000209 non-null  int64 
 4   title       1000209 non-null  object
 5   genres      1000209 non-null  object
 6   gender      1000209 non-null  object
 7   age         1000209 non-null  int64 
 8   occupation  1000209 non-null  int64 
 9   zip         1000209 non-null  object
dtypes: int64(6), object(4)
memory usage: 83.9+ MB


In [5]:
ratings.head ()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [15]:
rated_movie_by1 = ratings ['movie_id'] [ratings ['user_id'] == 1]
rated_movie_by1

0     1193
1      661
2      914
3     3408
4     2355
5     1197
6     1287
7     2804
8      594
9      919
10     595
11     938
12    2398
13    2918
14    1035
15    2791
16    2687
17    2018
18    3105
19    2797
20    2321
21     720
22    1270
23     527
24    2340
25      48
26    1097
27    1721
28    1545
29     745
30    2294
31    3186
32    1566
33     588
34    1907
35     783
36    1836
37    1022
38    2762
39     150
40       1
41    1961
42    1962
43    2692
44     260
45    1028
46    1029
47    1207
48    2028
49     531
50    3114
51     608
52    1246
Name: movie_id, dtype: int64

In [16]:
movies.head ()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [22]:
no_rated_movies = movies [~movies ['movie_id'].isin (rated_movie_by1)] [: 100]
no_rated_movies

Unnamed: 0,movie_id,title,genres
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
...,...,...,...
97,99,Heidi Fleiss: Hollywood Madam (1995),Documentary
98,100,City Hall (1996),Drama|Thriller
99,101,Bottle Rocket (1996),Comedy
100,102,Mr. Wrong (1996),Comedy


In [24]:
ratings.head ()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [28]:
users.head ()

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [33]:
no1_user = users [users ['user_id'] == 1]
no1_user

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067


In [34]:
movie_lens_no1 = pd.concat ([no_rated_movies, no1_user], axis = 1)
movie_lens_no1

Unnamed: 0,movie_id,title,genres,user_id,gender,age,occupation,zip
0,,,,1.0,F,1.0,10.0,48067
1,2.0,Jumanji (1995),Adventure|Children's|Fantasy,,,,,
2,3.0,Grumpier Old Men (1995),Comedy|Romance,,,,,
3,4.0,Waiting to Exhale (1995),Comedy|Drama,,,,,
4,5.0,Father of the Bride Part II (1995),Comedy,,,,,
...,...,...,...,...,...,...,...,...
97,99.0,Heidi Fleiss: Hollywood Madam (1995),Documentary,,,,,
98,100.0,City Hall (1996),Drama|Thriller,,,,,
99,101.0,Bottle Rocket (1996),Comedy,,,,,
100,102.0,Mr. Wrong (1996),Comedy,,,,,


In [None]:


rating_movie = pd.merge (ratings, movies, how = 'left', on = 'movie_id')

movie_lens = pd.merge (rating_movie, users, how = 'left', on = 'user_id')

In [13]:
df = pd.DataFrame({'countries':['US','UK','Germany','China']})
countries = ['UK','China']

df

Unnamed: 0,countries
0,US
1,UK
2,Germany
3,China


In [14]:
# pseudo-code:
df.countries.isin(countries)

0    False
1     True
2    False
3     True
Name: countries, dtype: bool

In [6]:
ratings.info ()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   user_id    1000209 non-null  int64
 1   movie_id   1000209 non-null  int64
 2   rating     1000209 non-null  int64
 3   timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


In [7]:
movies.info ()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  3883 non-null   int64 
 1   title     3883 non-null   object
 2   genres    3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


In [8]:
movies.head ()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [2]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

le.fit([1, 2, 2, 6])
LabelEncoder()

LabelEncoder()

In [3]:
le.classes_

array([1, 2, 6])

In [4]:


le.transform([1, 1, 2, 6])

array([0, 0, 1, 2])

In [5]:
le.inverse_transform([0, 0, 1, 2])

array([1, 1, 2, 6])

In [7]:
le = preprocessing.LabelEncoder ()

le.fit (['paris','paris', 'seoul', 'amsterdam'])

LabelEncoder()

In [8]:
list (le.classes_)

['amsterdam', 'paris', 'seoul']

In [9]:
le.transform (['seoul', 'seoul', 'paris'])

array([2, 2, 1])

In [10]:
list (le.inverse_transform ([2, 2, 1]))

['seoul', 'seoul', 'paris']