In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
import torch.nn as nn
import torch
from torch.utils import data
import torch.optim as optim
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
import nltk
warnings.filterwarnings(action='ignore')

In [2]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print('Using PyTorch version:', torch.__version__, ' Device:', device)

Using PyTorch version: 1.10.2  Device: cpu


In [4]:
df_game=pd.read_csv("../data/all_game.csv")
df_game_s=df_game[['title','userscore','summary']]
df_game_s

Unnamed: 0,title,userscore,summary
0,Surgeon Simulator: Experience Reality,4.40,This game is ridiculously fun once you ve plac...
1,Transformers: Rise of the Dark Spark,4.20,Fight your way through both Earth and Cybertro...
2,NBA Live 14,2.60,BounceTek Revolutionary physics based dribb...
3,WWE 2K20,1.60,F the haters This game is still fun And it ...
4,VR Karts,5.60,The checkered flag is about to drop on a new V...
...,...,...,...
14350,Road 96,6.70,Summer 1996 Today is the day You hit the ro...
14351,WRC 9 FIA World Rally Championship,6.00,The game runs nicely 60fps xbox series S The...
14352,Shadow Warrior 3,6.00,Shadow Warrior 3 launches the offbeat first pe...
14353,Blast Brigade vs. the Evil Legion of Dr. Cread,6.53,Blast Brigade vs The Evil Legion of Dr Crea...


In [5]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sj\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [7]:
stopwords=nltk.corpus.stopwords.words('english')
df_game_s['token_summary'] = df_game_s['summary'].apply(nltk.word_tokenize)
df_game_s['token_summary'] = df_game_s['token_summary'].apply(lambda x: [item for item in x if item not in stopwords])

In [8]:
positive_summary=np.hstack(df_game_s[df_game_s.userscore>=df_game_s['userscore'].describe()['25%']]['token_summary'].values)
negative_summary=np.hstack(df_game_s[df_game_s.userscore<=df_game_s['userscore'].describe()['25%']]['token_summary'].values)

summary 칼럼을 토큰화하고 하위 25프로의 유저평점과 나머지를 분리해서 단어들을 알아본다.

In [10]:
dic_pos={}
for x in np.unique(positive_summary):
    dic_pos[x]=0
for x in positive_summary:
    dic_pos[x]+=1
dic_pos = dict(sorted(dic_pos.items(),reverse=True,key=lambda item: item[1]))

In [11]:
dic_neg={}
for x in np.unique(negative_summary):
    dic_neg[x]=0
for x in negative_summary:
    dic_neg[x]+=1
dic_neg = dict(sorted(dic_neg.items(),reverse=True,key=lambda item: item[1]))

긍정적인 단어 빈도수

In [10]:
dic_pos

{'game': 8503,
 'The': 5616,
 'new': 3988,
 'I': 3873,
 'world': 2984,
 'players': 1838,
 'one': 1809,
 'time': 1687,
 'play': 1642,
 'story': 1580,
 'like': 1544,
 'It': 1441,
 'action': 1438,
 'A': 1408,
 'games': 1373,
 'In': 1346,
 'You': 1303,
 'adventure': 1266,
 'player': 1246,
 'gameplay': 1206,
 'experience': 1197,
 'This': 1187,
 'way': 1170,
 'first': 1142,
 'characters': 1022,
 '2': 1011,
 'fun': 1010,
 'take': 1002,
 'unique': 965,
 'get': 951,
 'must': 929,
 'great': 905,
 'series': 882,
 'good': 853,
 'well': 840,
 'combat': 829,
 'battle': 826,
 'make': 815,
 'back': 810,
 'even': 793,
 'best': 784,
 'also': 780,
 'set': 779,
 'ever': 767,
 'really': 759,
 'life': 754,
 'based': 740,
 'find': 722,
 'features': 719,
 'None': 700,
 'control': 699,
 'As': 664,
 'different': 659,
 'enemies': 641,
 'much': 640,
 'weapons': 640,
 'original': 639,
 'style': 627,
 'every': 615,
 'friends': 613,
 'including': 598,
 'With': 593,
 'power': 590,
 'fight': 589,
 'across': 583,
 'pla

부정적인 단어 빈도수

In [11]:
dic_neg

{'game': 3043,
 'The': 1989,
 'new': 1567,
 'I': 1504,
 'world': 1054,
 'players': 857,
 'play': 684,
 'time': 675,
 'like': 634,
 'one': 620,
 'story': 585,
 'player': 542,
 'It': 541,
 'experience': 521,
 'A': 485,
 'In': 470,
 'action': 463,
 'first': 462,
 'games': 453,
 'This': 431,
 'way': 428,
 'You': 422,
 'gameplay': 417,
 'take': 411,
 'adventure': 393,
 'fun': 377,
 'get': 360,
 'life': 356,
 'unique': 332,
 'back': 331,
 '2': 322,
 'combat': 318,
 'also': 316,
 'even': 313,
 'good': 311,
 'great': 311,
 'control': 309,
 'must': 305,
 'characters': 302,
 'features': 297,
 'battle': 295,
 'real': 285,
 'weapons': 281,
 'series': 271,
 'set': 269,
 'fight': 265,
 'make': 265,
 'well': 265,
 'best': 264,
 'online': 263,
 'really': 263,
 'every': 262,
 'ever': 261,
 'including': 260,
 'based': 258,
 'friends': 252,
 'team': 249,
 'different': 246,
 'As': 244,
 'mode': 237,
 'across': 236,
 'multiplayer': 236,
 'find': 234,
 'around': 231,
 'system': 227,
 'city': 221,
 'takes': 

테스트셋 트레이닝셋 분리

In [12]:
df_train,df_test = train_test_split(df_game_s, random_state = 1557)

In [13]:
print(df_train.shape,df_test.shape)

(10766, 4) (3589, 4)


In [13]:
vocab=set(np.hstack(df_game_s['token_summary'].values))

In [15]:
print(len(vocab))

46569


In [14]:
vocab = {tkn: i+2 for i, tkn in enumerate(vocab)}
vocab['<unk>'] = 0
vocab['<pad>'] = 1

단어임베딩구하기

In [17]:
embedding_layer = nn.Embedding(num_embeddings=len(vocab), embedding_dim=50,padding_idx=1)

In [18]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 1.3953, -1.4804,  0.5375,  ...,  0.5386,  0.3954,  0.4236],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.3729, -0.2503, -0.7438,  ...,  0.6275, -0.1872, -1.0986],
        ...,
        [ 1.2598,  1.0251,  0.9962,  ...,  0.1087, -0.1900,  0.0367],
        [ 1.0677, -0.3866,  0.1556,  ...,  0.6795,  0.8937,  0.7976],
        [ 0.6807, -1.1566, -0.6548,  ...,  1.7627,  0.4633, -1.3812]],
       requires_grad=True)


vocab_train을 통해 df_train의 summary를 정수인코딩

In [15]:
df_train['encoding_summary'] = df_train['token_summary'].apply(lambda x: [vocab[item] for item in x])
df_test['encoding_summary'] = df_test['token_summary'].apply(lambda x: [vocab[item] for item in x])

단어 개수에 비해 로우가 적음(데이터수의 부족)

In [16]:
mlen=0
for x in df_game_s.token_summary.values:
    mlen=max(mlen,len(x))
def mypad(x):
  if(len(x)<mlen):
    for y in range(len(x),mlen):
      x.append(1)
  return x
xtrain=df_train['encoding_summary'].apply(mypad)
xtest=df_test['encoding_summary'].apply(mypad)
xtrain=torch.tensor(list(df_train['encoding_summary'].values),dtype=torch.float32,requires_grad=True)
xtest=torch.tensor(list(df_test['encoding_summary'].values),dtype=torch.float32,requires_grad=True)

In [21]:
ytrain=torch.tensor(list(df_train['userscore'].values),dtype=torch.float32,requires_grad=True)
ytest=torch.tensor(list(df_test['userscore'].values),dtype=torch.float32,requires_grad=True)

모델input을 쓰기위해 크기 통일

In [22]:
print(xtest.shape)
print(xtrain.shape)
print(ytest.shape)
print(ytrain.shape)

torch.Size([3589, 509])
torch.Size([10766, 509])
torch.Size([3589])
torch.Size([10766])


In [23]:
class word_dataset(data.Dataset):
    def __init__(self, wdata, label):
        self.wdata = wdata
        self.label = label.reshape(label.shape[0],1)
        
    def __len__(self):
        return len(self.wdata)
    
    def __getitem__(self,index):
        return self.wdata[index],self.label[index]
testd=word_dataset(xtest,ytest)
traind=word_dataset(xtrain,ytrain)

In [24]:
train_loader = torch.utils.data.DataLoader(dataset=traind, batch_size=10, shuffle=False)
test_loader = torch.utils.data.DataLoader(dataset=testd, batch_size=10, shuffle=False)

CNN모델만들기

In [31]:
class Model_1(nn.Module):
    def __init__(self):
        super(Model_1, self).__init__()      
        self.conv1 = nn.Conv2d(1, 9, kernel_size=20, padding=0)
        self.conv2 = nn.Conv2d(9, 16, kernel_size=20, padding=0)
        self.conv3 = nn.Conv2d(16, 2, kernel_size=20, padding=0)
        self.fc1 = nn.Linear(1710, 300)
        self.fc2 = nn.Linear(300,50)
        self.fc3=nn.Linear(50,1)
        self.embedding = nn.Embedding(len(vocab),300)
        self.relu = torch.nn.ReLU()
        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.kaiming_uniform_(m.weight.data, nonlinearity='relu')
            if isinstance(m,nn.Conv2d):
                torch.nn.init.kaiming_uniform_(m.weight.data, nonlinearity='relu')
            if isinstance(m,nn.Embedding):
                torch.nn.init.kaiming_uniform_(m.weight.data, nonlinearity='relu')
    def forward(self,x):
        x=self.embedding(x.long())
        x=x.unsqueeze(1)
        x = self.relu(self.conv1(x))
        x = torch.nn.functional.avg_pool2d(x, kernel_size=4,stride=2)
        x=self.relu(self.conv2(x))
        x= torch.nn.functional.avg_pool2d(x,kernel_size=5,stride=2)
        x=self.relu(self.conv3(x))
        x= torch.nn.functional.avg_pool2d(x,kernel_size=3,stride=2)
        x=torch.flatten(x,1)
        x=self.relu(self.fc1(x))
        x=self.relu(self.fc2(x))
        x=self.relu(self.fc3(x))
        return x

In [26]:
def training_epoch(train_loader, network, loss_func, optimizer, epoch):
    train_losses = []
    train_correct = 0
    log_interval = 300
    ret=[]
    for batch_idx, (wdata, label) in enumerate(train_loader):
        wdata, label = wdata.to(device), label.to(device)
        # 미분값의 초기화
        optimizer.zero_grad()
        # Forward propagration 계산하기.
        outputs = network(wdata)
        # Cross_entropy 함수를 적용하여 loss를 구하고 저장하기
        loss = loss_func(outputs,label)
        train_losses.append(loss.item())
        # Gradinet 구하기
        loss.backward()
        # weight값 update 하
        optimizer.step()
        # 학습 상황 출력
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.2f}%)]\tLoss: {:.6f}'
                  .format(epoch, batch_idx * len(label), len(train_loader.dataset),100. * batch_idx / len(train_loader),
                          loss.item()))
            
    return train_losses,ret

In [28]:
def test_epoch(test_loader, network, loss_func, val = False):
    correct = 0
    ret=[]
    test_losses = []
    
    with torch.no_grad():
        for batch_idx, (wdata, label) in enumerate(test_loader):
            wdata, label = wdata.to(device), label.to(device)

            # Forward propagration 계산하기.
            outputs = network(wdata)
            # Cross_entropy 함수를 적용하여 loss를 구하기
            loss = loss_func(outputs,label)
            test_losses.append(loss.item())
            print('Test Loss: {:.6f}'.format(loss.item()))
    return test_losses,ret


In [29]:
def training(network, learning_rate = 0.001):
    
    epoches = 15
    
    cls_loss = nn.MSELoss()
    optimizer = torch.optim.Adam(network.parameters(), lr=learning_rate)
    
    train_losses_per_epoch = []
    test_losses_per_epoch = []
    
    train_accuracies = []
    test_accuracies = []
    ret=[]
    
    for epoch in range(epoches):
                
        # 모델를 학습 중이라고 선언하기
        network.train()
        
        train_losses,temp = training_epoch(train_loader,network,cls_loss,optimizer, epoch)
        ret=temp
        # epoch 별로 loss 평균값, 정확도 구하기
        average_loss = np.mean(train_losses)
        train_losses_per_epoch.append(average_loss)
        
        ### 학습 중에  test 결과 보기
        
        # 모델 test 중인 것을 선언하기
    network.eval()
    with torch.no_grad():
        test_losses,temp2= test_epoch(test_loader, network, cls_loss, False)
        ret.extend(temp2)
    return train_losses_per_epoch, ret

In [32]:
network = Model_1().to(device)
rlt_const,ret = training(network)

Test Loss: 3.563887
Test Loss: 0.978539
Test Loss: 1.660845
Test Loss: 4.074283
Test Loss: 0.660261
Test Loss: 1.104236
Test Loss: 0.953768
Test Loss: 1.434016
Test Loss: 1.329500
Test Loss: 1.043593
Test Loss: 2.200508
Test Loss: 1.507666
Test Loss: 1.423650
Test Loss: 1.263549
Test Loss: 1.283376
Test Loss: 2.360460
Test Loss: 0.467073
Test Loss: 4.464883
Test Loss: 3.623060
Test Loss: 2.951789
Test Loss: 0.853498
Test Loss: 1.966871
Test Loss: 6.160370
Test Loss: 0.122162
Test Loss: 1.306352
Test Loss: 1.157467
Test Loss: 3.364812
Test Loss: 1.485745
Test Loss: 2.888636
Test Loss: 2.000520
Test Loss: 2.559095
Test Loss: 1.517628
Test Loss: 1.466278
Test Loss: 1.605185
Test Loss: 1.269633
Test Loss: 1.654809
Test Loss: 3.698386
Test Loss: 2.643437
Test Loss: 1.916873
Test Loss: 1.093114
Test Loss: 1.817584
Test Loss: 2.934255
Test Loss: 1.660047
Test Loss: 2.420427
Test Loss: 2.012316
Test Loss: 0.534747
Test Loss: 2.114081
Test Loss: 0.484211
Test Loss: 3.484934
Test Loss: 0.622303
