# 필요한 패키지 설치

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import os

In [2]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

# 데이터 불러오기

In [3]:
col_names = names=['user_id', 'movie_id', 'ratings', 'timestamp']
rating_df = pd.read_csv("../Data/MovieLens/ml-1m/ratings.dat", sep="::", names=col_names, engine='python')
rating_df.drop('timestamp', axis=1, inplace=True)

In [4]:
col_names = ["movie_id","movie_name", "genre"]
item_df = pd.read_table("../Data/MovieLens/ml-1m/movies.dat", sep="::", names=col_names, encoding = "latin-1", engine='python')
item_df['genre'] = item_df.apply(lambda row: row['genre'].split('|')[0], axis=1)
item_df['movie_year'] = item_df.apply(lambda row : int(row['movie_name'].split("(")[-1][:-1]),axis=1)
item_df.drop(['movie_name'], axis=1, inplace=True)

In [5]:
col_names = ["user_id", "gender", "age", "occupation", "zipcode"]
user_df = pd.read_table('../Data/MovieLens/ml-1m/users.dat', sep="::", names=col_names, engine='python')
user_df['gender'] = (user_df['gender'].apply(lambda x: 1 if x=='M' else 0)).astype(int)
user_df.drop(['zipcode'],axis=1, inplace=True)

# Dataset 전처리

In [6]:
# item-user cartesian product
item_df['key'] = 0
user_df['key'] = 0
user_item_df = user_df.merge(item_df, how='outer')
user_item_df.drop('key', axis=1, inplace=True)

user_item_index_df = user_item_df.iloc[:, [0,4]]
user_item_feature_df = pd.concat([user_item_df.iloc[:,[1,2,3]], user_item_df.iloc[:,5:]], axis=1)
user_item_df = pd.concat([user_item_index_df, user_item_feature_df], axis=1)

user_item_df = pd.merge(user_item_df, rating_df, how='left', on=['user_id', 'movie_id'])
user_item_df.dropna(inplace=True)

In [7]:
Scaler = StandardScaler()
user_item_df['movie_year']=Scaler.fit_transform(user_item_df['movie_year'].values.reshape(-1,1))

## Wide Component 추출

In [8]:
continous_cols = ['movie_year']
cross_cols = ['gender', 'genre', 'age']

In [9]:
continous_df = user_item_df.loc[:, continous_cols]

In [10]:
cross_df = user_item_df.loc[:, cross_cols]
cross_df = pd.get_dummies(cross_df, columns=['age'])
cross_df = pd.get_dummies(cross_df, columns=['genre'])

In [11]:
# interacion
cross_df = PolynomialFeatures(2, interaction_only=True, include_bias=False).fit_transform(cross_df)
cross_df = pd.DataFrame(cross_df)

In [12]:
continous_df.reset_index(drop=True,inplace=True)
cross_df.reset_index(drop=True, inplace=True)
wide_df = pd.concat([continous_df,cross_df],axis=1)

In [14]:
wide_df

Unnamed: 0,movie_year,0,1,2,3,4,5,6,7,8,...,341,342,343,344,345,346,347,348,349,350
0,0.578536,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.578536,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.578536,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.675879,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.439156,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,-0.188051,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000205,-0.397120,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000206,-0.954638,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000207,0.926984,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Deep Components 추출

In [15]:
deep_cols = ['user_id', 'movie_id', 'occupation', 'genre', 'gender', 'age']
deep_df =  user_item_df[deep_cols]

In [16]:
le = LabelEncoder()
genre = le.fit_transform(deep_df.loc[:,'genre'])

In [17]:
deep_df.loc[ : ,'genre'] = genre

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [18]:
deep_df.head()

Unnamed: 0,user_id,movie_id,occupation,genre,gender,age
0,1,1,10,2,0,1
47,1,48,10,2,0,1
148,1,150,10,7,0,1
257,1,260,10,0,0,1
523,1,527,10,7,0,1


## Target 만들기

In [16]:
Y = np.array(user_item_df['ratings'])
Y = np.where(Y>=3, 1, 0)

## train_test split

In [17]:
train_wide_df, test_wide_df = train_test_split(wide_df, test_size=0.3, random_state=22)
train_deep_df, test_deep_df = train_test_split(deep_df, test_size=0.3, random_state=22)
train_Y, test_Y = train_test_split(Y, test_size=0.3, random_state=22)

In [18]:
class PandasDataset(Dataset):
    
    def __init__(self, wide_df, deep_df, Y):
        """
        Args :  wide_df : Features for Wide Learning
                deep_df : Features for Deep Learning
                Y       : target
        """
        super(PandasDataset, self).__init__()
        self.X = wide_df
        self.emb_user = deep_df.iloc[:,0]
        self.emb_movie = deep_df.iloc[:,1]
        self.emb_occupation = deep_df.iloc[:,2]
        self.emb_genre = deep_df.iloc[:,3]
        self.emb_gender = deep_df.iloc[:,4]
        self.emb_age = deep_df.iloc[:,5]
        
        self.y = Y
        
        self.X_value = self.X.values
        self.emb_user_value, self.emb_movie_value, self.emb_occupation_value = self.emb_user.values, self.emb_movie.values, self.emb_occupation.values
        self.emb_genre_value, self.emb_gender_value, self.emb_age_value = self.emb_genre.values, self.emb_gender.values, self.emb_age.values
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return {
            'X' : torch.from_numpy(self.X_value)[idx],
            'emb_user' : torch.from_numpy(self.emb_user_value)[idx],
            'emb_movie' : torch.from_numpy(self.emb_movie_value)[idx],
            'emb_occupation' : torch.from_numpy(self.emb_occupation_value)[idx],
            'emb_genre' : torch.from_numpy(self.emb_genre_value)[idx],
            'emb_gender' : torch.from_numpy(self.emb_gender_value)[idx],
            'emb_age' : torch.from_numpy(self.emb_age_value)[idx],
            'y' : torch.from_numpy(self.y)[idx]
        }

In [19]:
batch_size = 5000

train_dataset = PandasDataset(train_wide_df, train_deep_df, train_Y)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [20]:
# test data
test_wide_tensor = torch.FloatTensor(test_wide_df.values)
test_emb1_tensor = torch.LongTensor(test_deep_df.iloc[:,0].values)
test_emb2_tensor = torch.LongTensor(test_deep_df.iloc[:,1].values)
test_emb3_tensor = torch.LongTensor(test_deep_df.iloc[:,2].values)
test_emb4_tensor = torch.LongTensor(test_deep_df.iloc[:,3].values)
test_emb5_tensor = torch.LongTensor(test_deep_df.iloc[:,4].values)
test_emb6_tensor = torch.LongTensor(test_deep_df.iloc[:,5].values)
test_y_tensor = torch.FloatTensor(test_Y)

# Wide&Deep Model

In [21]:
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"
torch.cuda.is_available()

True

In [22]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device = 'cpu'
print(device)

cuda:0


In [23]:
deep_df.head()

Unnamed: 0,user_id,movie_id,occupation,genre,gender,age
0,1,1,10,2,0,1
47,1,48,10,2,0,1
148,1,150,10,7,0,1
257,1,260,10,0,0,1
523,1,527,10,7,0,1


In [24]:
len(deep_df['age'].unique())

7

In [25]:
class Wide_n_Deep(nn.Module):
    
    def __init__(self):
        super(Wide_n_Deep, self).__init__()
         
        # deep components embedding
        self.embed1 = nn.Embedding(6041,32)
        self.embed2 = nn.Embedding(4000,32)
        self.embed3 = nn.Embedding(22,8)
        self.embed4 = nn.Embedding(19, 8)
        self.embed5 = nn.Embedding(2, 8)
        self.embed6 = nn.Embedding(200, 8)
        
        # deep model
        self.lin1 = nn.Linear(96, 50)
        self.lin2 = nn.Linear(50, 32)
        self.lin3 = nn.Linear(32, 16)
        
        # integrate
        self.lin4 = nn.Linear(368, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, emb_user, emb_movie, emb_occupation, emb_genre, emb_gender, emb_age):
        
        # deep components embedding
        input_emb1_out = self.embed1(emb_user)
        input_emb2_out = self.embed2(emb_movie)
        input_emb3_out = self.embed3(emb_occupation)
        input_emb4_out = self.embed4(emb_genre)
        input_emb5_out = self.embed5(emb_gender)
        input_emb6_out = self.embed6(emb_age)
        
        # deep components integrate
        concat_out = torch.cat((input_emb1_out, input_emb2_out, input_emb3_out, input_emb4_out, input_emb5_out, input_emb6_out), dim=1)
        
        # deep model learning
        lin1_out = self.lin1(concat_out)
        relu1_out = nn.ReLU()(lin1_out)
        lin2_out = self.lin2(relu1_out)
        relu2_out = nn.ReLU()(lin2_out)
        lin3_out = self.lin3(relu2_out)
        deep_out = nn.ReLU()(lin3_out)
        
        # integrate
        wide_deep_input = torch.cat([x, deep_out], dim=1)
        logits = self.lin4(wide_deep_input)
        
        # sigmoid
        output = self.sigmoid(logits)
        
        return output

In [26]:
model = Wide_n_Deep()
model = model.to(device)

In [27]:
loss_fn = nn.BCELoss()
lr = 0.005
optimizer = torch.optim.SGD(model.parameters(),lr=lr)
epochs = 200

## Learning

In [28]:
writer = SummaryWriter()

for epoch in range(epochs):
    total_loss = 0
    total_val_loss = 0
    model.train()
    for i, data in enumerate(train_loader):
        
        # forward pass
        pred = model(data['X'].float().to(device), data['emb_user'].long().to(device),
                     data['emb_movie'].long().to(device), data['emb_occupation'].long().to(device),
                    data['emb_genre'].long().to(device), data['emb_gender'].long().to(device), data['emb_age'].long().to(device))
        loss = loss_fn(pred.squeeze(), data['y'].float().to(device))
        
        # initialize
        optimizer.zero_grad()
        
        # backward
        loss.backward()
        
        # update
        optimizer.step()
        
        total_loss += loss.item()
        
    train_loss = total_loss / len(train_loader)
    
    # evaluation
    model.eval()

    pred = model(test_wide_tensor.cuda(), test_emb1_tensor.cuda(), test_emb2_tensor.cuda(), test_emb3_tensor.cuda(),
                test_emb4_tensor.cuda(),test_emb5_tensor.cuda(),test_emb6_tensor.cuda())
    auc = roc_auc_score(test_y_tensor.cpu().detach().numpy(), pred.cpu().detach().numpy())
    
    writer.add_scalar("train_loss", train_loss, epoch+1)
    writer.add_scalar("AUC", auc, epoch+1)
    
    print("epoch : {}, train_loss : {}, auc : {}".format(epoch+1, train_loss, auc))

epoch : 1, train_loss : 0.6300391783105567, auc : 0.517118057771245
epoch : 2, train_loss : 0.536460577173436, auc : 0.5186190440744675
epoch : 3, train_loss : 0.49474412519881067, auc : 0.5227202925468466
epoch : 4, train_loss : 0.47476847395829275, auc : 0.527538605570456
epoch : 5, train_loss : 0.46391459475172325, auc : 0.5326610601261419
epoch : 6, train_loss : 0.4577722238733413, auc : 0.5376816016130292
epoch : 7, train_loss : 0.45339440390573327, auc : 0.5424925929281192
epoch : 8, train_loss : 0.4508193682694266, auc : 0.5469997551078234
epoch : 9, train_loss : 0.44842425157837834, auc : 0.5513400750563598
epoch : 10, train_loss : 0.44730958321415787, auc : 0.5554157129350165
epoch : 11, train_loss : 0.4463685315551487, auc : 0.559225036232264
epoch : 12, train_loss : 0.4448931929490245, auc : 0.562751705523941
epoch : 13, train_loss : 0.44462843165330007, auc : 0.5660867717828971
epoch : 14, train_loss : 0.4437250704207319, auc : 0.5692105519850992
epoch : 15, train_loss : 0.

epoch : 118, train_loss : 0.4311088951344186, auc : 0.6355886161823093
epoch : 119, train_loss : 0.43045726080312796, auc : 0.635689370639464
epoch : 120, train_loss : 0.4301536717313401, auc : 0.6357852071149857
epoch : 121, train_loss : 0.4303376877561529, auc : 0.6358790825848127
epoch : 122, train_loss : 0.4306768639713314, auc : 0.6359727789092635
epoch : 123, train_loss : 0.43038668573325406, auc : 0.6360712078911425
epoch : 124, train_loss : 0.4307239237406575, auc : 0.6361690075268187
epoch : 125, train_loss : 0.43041371411465584, auc : 0.6362625795023461
epoch : 126, train_loss : 0.4311663102596364, auc : 0.6363585891070163
epoch : 127, train_loss : 0.4305085852636513, auc : 0.6364531566869355
epoch : 128, train_loss : 0.43062293402692103, auc : 0.6365487437329067
epoch : 129, train_loss : 0.4300853119674304, auc : 0.6366414777861192
epoch : 130, train_loss : 0.4298809513132623, auc : 0.6367301920235913
epoch : 131, train_loss : 0.43007450894261084, auc : 0.6368151968501549
ep