In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from Preprocessing.to_embedding import WordEmbedding
from data_format_phase3 import formatting
from Preprocessing.helper_functions import import_embedding, embedding_matrix_word2vec
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
import keras

config = tf.ConfigProto( device_count = {'GPU': 1 , 'CPU': 8} ) 
config.gpu_options.allow_growth = True
sess = tf.Session(config=config) 
keras.backend.set_session(sess)

keras.backend.tensorflow_backend._get_available_gpus()
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 13983973283299293097
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 4977721344
locality {
  bus_id: 1
  links {
  }
}
incarnation: 8874858971972510762
physical_device_desc: "device: 0, name: GeForce RTX 2070, pci bus id: 0000:01:00.0, compute capability: 7.5"
]


In [3]:
file_path = open("fomatted_data",'rb')
data = pickle.load(file_path)

In [4]:
data = data.drop("year", axis=1)

#data = data[:round(len(data)*.2)]
y = pd.get_dummies(data['polarity'])
X_train, X_dev, y_train, y_dev = train_test_split(data, y, test_size = 0.20, random_state=42)

X_train_nlp, X_dev_nlp = X_train['reviewText'], X_dev['reviewText']

X_train_meta, X_dev_meta = X_train.iloc[:,3:], X_dev.iloc[:,3:]
embedding_size = 300 #number of feature weights in embeddings
max_len = 400

In [5]:
#Basic Vectorization of data
#Review data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_nlp)
word_index = tokenizer.word_index

def vectorize(data, tokenizer ,max_len):
    sequences = tokenizer.texts_to_sequences(data)
    padding = pad_sequences(sequences, maxlen = max_len)
    
    return padding

X_train_nlp = vectorize(X_train_nlp, tokenizer , max_len)
X_dev_nlp = vectorize(X_dev_nlp, tokenizer, max_len)

print('Found %s unique tokens.' % len(word_index))
print('Shape of train tensor', X_train_nlp.shape)
print('Shape of dev tensor', X_dev_nlp.shape)

Found 38806 unique tokens.
Shape of train tensor (72000, 400)
Shape of dev tensor (18000, 400)


## Game Data

In [6]:
game_df = pd.read_csv("../phase1_video_games-test-hidden.csv")
game_labels = pd.read_csv("../true_labels/true_game_labels.txt", header=None)
merged = pd.concat([game_df, game_labels], axis=1).drop('polarity', axis=1).rename(columns={0: "polarity"})



# get a list of columns

cols = list(merged)
# move the column to head of list using index, pop and insert
cols.insert(0, cols.pop(cols.index('polarity')))
cols

# use ix to reorder
merged = merged.ix[:, cols]
merged.to_csv('merged_df.csv', index=False)



from data_format_phase3 import formatting
game_data = formatting("merged_df.csv", test=True)

game_data = game_data.drop('year', axis=1)
game_data["noText"] = ""

game_y = pd.get_dummies(game_data['polarity'])




"""HVIS DU ÆNDRE HVOR MANGE FEATURES DER ER, SKAL DU ÆNDRE INDEXERING HER:"""
game_train_meta = game_data.iloc[:,3:-1]


game_train_nlp = game_data['noText']

game_train_nlp = vectorize(game_train_nlp, tokenizer , max_len)
game_sets = [game_train_nlp, game_train_meta]



.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app


Adding topic features
Test registered, writing topics to dataframe
Opening topic file
Adding test topics
['bad', 'funny', 'new', 'old', 'worth', 'long', 'worst', 'big', 'real', 'little', 'original', 'boring', 'excellent', 'acting', 'good', 'interesting', 'set', 'quality', 'special', 'great', 'better', 'best', 'like']


In [7]:
game_data.info()

#game_data.drop('affin_score', axis=1, inplace=True)





<class 'pandas.core.frame.DataFrame'>
Int64Index: 21142 entries, 0 to 21141
Data columns (total 28 columns):
polarity       21142 non-null object
summary        21142 non-null object
reviewText     21142 non-null object
bad            21142 non-null int64
funny          21142 non-null int64
new            21142 non-null int64
old            21142 non-null int64
worth          21142 non-null int64
long           21142 non-null int64
worst          21142 non-null int64
big            21142 non-null int64
real           21142 non-null int64
little         21142 non-null int64
original       21142 non-null int64
boring         21142 non-null int64
excellent      21142 non-null int64
acting         21142 non-null int64
good           21142 non-null int64
interesting    21142 non-null int64
set            21142 non-null int64
quality        21142 non-null int64
special        21142 non-null int64
great          21142 non-null int64
better         21142 non-null int64
best           21142 non

## Movie Data

In [8]:
movie_df = pd.read_csv("../phase1_movie_reviews-test-hidden.csv")
movie_labels = pd.read_csv("../true_labels/true_movie_labels.txt", header=None)
merged = pd.concat([movie_df, movie_labels], axis=1).drop('polarity', axis=1).rename(columns={0: "polarity"})
# get a list of columns
cols = list(merged)
# move the column to head of list using index, pop and insert
cols.insert(0, cols.pop(cols.index('polarity')))
cols

# use ix to reorder
merged = merged.ix[:, cols]
merged.to_csv('merged_df.csv', index=False)
from data_format_phase3 import formatting
movie_data = formatting("merged_df.csv", test=True)

movie_data = movie_data.drop('year', axis=1)
movie_data["noText"] = ""

movie_y = pd.get_dummies(movie_data['polarity'])



"""HVIS DU ÆNDRE HVOR MANGE FEATURES DER ER, SKAL DU ÆNDRE INDEXERING HER:"""
movie_train_meta = movie_data.iloc[:,3:-1]


movie_train_nlp = movie_data['noText']

movie_train_nlp = vectorize(movie_train_nlp, tokenizer , max_len)

movie_sets = [movie_train_nlp, movie_train_meta]







.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  # This is added back by InteractiveShellApp.init_path()


Adding topic features
Test registered, writing topics to dataframe
Opening topic file
Adding test topics
['bad', 'funny', 'new', 'old', 'worth', 'long', 'worst', 'big', 'real', 'little', 'original', 'boring', 'excellent', 'acting', 'good', 'interesting', 'set', 'quality', 'special', 'great', 'better', 'best', 'like']


In [9]:
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 28 columns):
polarity       10000 non-null object
summary        10000 non-null object
reviewText     10000 non-null object
bad            10000 non-null int64
funny          10000 non-null int64
new            10000 non-null int64
old            10000 non-null int64
worth          10000 non-null int64
long           10000 non-null int64
worst          10000 non-null int64
big            10000 non-null int64
real           10000 non-null int64
little         10000 non-null int64
original       10000 non-null int64
boring         10000 non-null int64
excellent      10000 non-null int64
acting         10000 non-null int64
good           10000 non-null int64
interesting    10000 non-null int64
set            10000 non-null int64
quality        10000 non-null int64
special        10000 non-null int64
great          10000 non-null int64
better         10000 non-null int64
best           10000 non-

In [10]:
from keras.models import load_model
# Returns a compiled model identical to the previous one
model = load_model('funct_GRU_model_ablation_no_text.h5')


In [11]:
predicted_game = model.predict(x=game_sets,batch_size=200, verbose=1)


#ARGMAX PREDICTIONS GAME
for i in predicted_game:
    max_idx = np.argmax(i)
    if max_idx == 0:
        i[0] = 1
        i[1] = 0
    else:
        i[1] = 1
        i[0] = 0
        
        


predicted_movie = model.predict(x=movie_sets,batch_size=200, verbose=1)
        
#ARGMAX PREDICTIONS MOVIE
for i in predicted_movie:
    max_idx = np.argmax(i)
    if max_idx == 0:
        i[0] = 1
        i[1] = 0
    else:
        i[1] = 1
        i[0] = 0
        




In [12]:
game_y = game_y.as_matrix()
print(game_y)
print(predicted_game)

print(type(game_y))
print(type(predicted_game))

[[0 1]
 [0 1]
 [0 1]
 ...
 [1 0]
 [1 0]
 [1 0]]
[[0. 1.]
 [0. 1.]
 [1. 0.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


  """Entry point for launching an IPython kernel.


In [13]:
movie_y = movie_y.as_matrix()
print(movie_y)
print(predicted_movie)

print(type(game_y))
print(type(predicted_movie))

[[0 1]
 [0 1]
 [0 1]
 ...
 [1 0]
 [1 0]
 [1 0]]
[[0. 1.]
 [1. 0.]
 [0. 1.]
 ...
 [1. 0.]
 [0. 1.]
 [1. 0.]]
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


  """Entry point for launching an IPython kernel.


## Accuracy score Game data: 


In [14]:
from sklearn.metrics import accuracy_score

In [15]:
y_pred_game = predicted_game
y_true_game = game_y
accuracy_score(y_true_game, y_pred_game)

0.7456248226279444

## Accuracy score movie data: 

In [16]:
y_pred_movie = predicted_movie
y_true_movie = movie_y
accuracy_score(y_true_movie, y_pred_movie)

0.7436

## Precision, Recall, f1-score for Game Data

In [17]:
from sklearn.metrics import classification_report

In [18]:
target_names = ['class 0', 'class 1']
print(classification_report(game_y, predicted_game, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.77      0.71      0.74     10571
     class 1       0.73      0.78      0.75     10571

   micro avg       0.75      0.75      0.75     21142
   macro avg       0.75      0.75      0.75     21142
weighted avg       0.75      0.75      0.75     21142
 samples avg       0.75      0.75      0.75     21142



## Precision, Recall, f1-score for Movie Data

In [19]:
target_names = ['class 0', 'class 1']
print(classification_report(movie_y, predicted_movie, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.74      0.74      0.74      4973
     class 1       0.74      0.75      0.75      5027

   micro avg       0.74      0.74      0.74     10000
   macro avg       0.74      0.74      0.74     10000
weighted avg       0.74      0.74      0.74     10000
 samples avg       0.74      0.74      0.74     10000

