In [None]:
# Mount Google Drive so the notebook can read and write files under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Base folder on the mounted Drive; every input and output path in this notebook is built from it
gdrive_path = '/content/drive/MyDrive/mydata/'

In [None]:
pip install corextopic



In [None]:
import numpy as np
import scipy.sparse as ss
import pickle
from corextopic import corextopic as ct
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from ast import literal_eval

## Aspect Extraction using CorEx

In [None]:
# Load the cleaned review sentences
data_anchored_corex = pd.read_csv(f'{gdrive_path}PlayStation_Game_Reviews_Cleaned.csv')

# Load the restructured aspects whose 'Keys' column holds the anchor words
train_topics = pd.read_excel(f'{gdrive_path}PS Game Extracted Topics.xlsx')

# Empty frame that a later cell fills with the extracted topics
df_anchored_corex = pd.DataFrame()

# Each 'Clean' cell is the string form of a token list: parse it and join the
# tokens back into one space-separated document per review sentence
data_words_anchored_corex = [
    ' '.join(literal_eval(token_repr))
    for token_repr in data_anchored_corex['Clean']
]

# Each 'Keys' cell is parsed the same way into the anchor words of one topic
keywords = [literal_eval(key_repr) for key_repr in train_topics['Keys']]

In [None]:
# Binary bag-of-words over at most 20000 features, English stop words removed
vectorizer_anchored_corex = CountVectorizer(
    stop_words='english',
    max_features=20000,
    binary=True,
)

# Document-term matrix, stored as a SciPy CSR sparse matrix
doc_word_anchored_corex = ss.csr_matrix(
    vectorizer_anchored_corex.fit_transform(data_words_anchored_corex)
)

# Vocabulary as a list, in the vectorizer's feature order
feature_names = vectorizer_anchored_corex.get_feature_names_out()
words = list(np.asarray(feature_names))

In [None]:
# Train the CorEx topic model with 6 topics, anchored on the `keywords` lists
topic_model_anchored_corex = ct.Corex(n_hidden=6, words=words, max_iter=1000, verbose=False, seed=2022)
topic_model_anchored_corex.fit(doc_word_anchored_corex, words=words, anchors=keywords, anchor_strength=3);

# Save the model. The context manager closes the file handle deterministically,
# so the pickle is completely written before a later cell reads it back.
with open(f'{gdrive_path}Assignment 1/CorEx_Model/Anchored_CorEx_Train_model.sav', 'wb') as model_file:
    pickle.dump(topic_model_anchored_corex, model_file)

# Keep the extracted topics for the export cell
topic_list_anchored_corex = topic_model_anchored_corex.get_topics()

In [None]:
# Put the extracted topics into a single 'Topics' column and export it to Excel
topics_xlsx_path = f'{gdrive_path}Assignment 1/CorEx_Model/Anchored_CorEx_Topics.xlsx'
df_anchored_corex['Topics'] = topic_list_anchored_corex
df_anchored_corex.to_excel(topics_xlsx_path)

In [None]:
# Show every CorEx topic as "<index>: <comma-separated words>"
anchored_corex_topics = topic_model_anchored_corex.get_topics()
for topic_idx, topic_entries in enumerate(anchored_corex_topics):
    # Each entry is a 3-tuple; only its first field (the word) is printed
    topic_words, _unused_a, _unused_b = zip(*topic_entries)
    print(f'{topic_idx}: ' + ', '.join(topic_words))

0: story, character, action, combat, soul, mission, war, boss, fighting, dark
1: great, good, fun, love, amazing, awesome, game, enjoy, entertain, exciting
2: excellent, easy, perfect, difficult, incredible, repetitive, solid, smooth, bug, replay
3: buy, recommend, worth, price, money, highly, value, definitely, sale, cost
4: world, open, music, soundtrack, make, feel, player, enemy, level, little
5: graphic, pretty, beautiful, visual, stunning, wonderful, scene, cinematic, gameplay, compelling


## Label PlayStation Game Review Using CorEx

In [None]:
# Reload the cleaned reviews; each 'Clean' cell is the string form of a token list
data = pd.read_csv(f'{gdrive_path}PlayStation_Game_Reviews_Cleaned.csv')

# One set of tokens per review sentence, so topic membership is a set intersection
words_set = [set(literal_eval(token_repr)) for token_repr in data['Clean']]

# Load the anchored CorEx model saved by the training cell.
# NOTE: pickle.load can execute code stored in the file; only load model files
# that this project produced itself.
with open(f'{gdrive_path}Assignment 1/CorEx_Model/Anchored_CorEx_Train_model.sav', 'rb') as model_file:
    model = pickle.load(model_file)

# Per topic, keep the words whose second tuple field is positive
# (presumably the word's mutual-information score; confirm against corextopic)
topic_list = [
    {entry[0] for entry in topic_entries if entry[1] > 0}
    for topic_entries in model.get_topics()
]

# aspect_list[i][j] is 1 when review j shares at least one word with topic i, else 0.
# The loop variables deliberately avoid the name `words`: that global holds the
# vectorizer vocabulary and must stay intact for re-runs of the training cell.
aspect_list = [
    [1 if review_words & topic_words else 0 for review_words in words_set]
    for topic_words in topic_list
]

# One 'Topic <i>' column per topic actually returned by the model
for i, topic_flags in enumerate(aspect_list):
    data['Topic ' + str(i)] = topic_flags

data.to_excel(f'{gdrive_path}Assignment 1/PlayStation_Game_Reviews_Labelled_Aspect.xlsx', index=False)

## Sentiment Analysis using BiLSTM-Skip Gram

In [None]:
from keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load biLSTM model from the file
loaded_model = load_model(f'{gdrive_path}Assignment 1/Sentiment_bilstm_skipgram.sav')

# Load the pickled tokenizer (presumably the one fitted when the model was
# trained; confirm). The context manager closes the file handle after reading.
# NOTE: pickle.load can execute code stored in the file; only load trusted files.
with open(f'{gdrive_path}Assignment 1/lstm_w2v_tokenizer.pkl', 'rb') as tokenizer_file:
    loaded_tokenizer = pickle.load(tokenizer_file)



In [None]:
# Raw review sentences as an array of strings
review_sentence = data['review'].astype(str).values

# Encode with the vocabulary already stored in the loaded tokenizer.
# Do NOT call fit_on_texts here: refitting updates the word counts and rebuilds
# word_index, so token ids would stop matching the ids the model was trained on
# and the predictions would be computed from the wrong embeddings.
# maxlen=300 presumably matches the sequence length used in training; TODO confirm.
padded_text = pad_sequences(loaded_tokenizer.texts_to_sequences(review_sentence), maxlen=300)

In [None]:
# Run the loaded model on the padded sequences; the next cell thresholds these scores at 0.5
score = loaded_model.predict(padded_text)



In [None]:
# Threshold the predicted scores: below 0.5 -> 'negative', otherwise 'positive'
below_threshold = np.asarray(score).ravel() < 0.5
label = np.where(below_threshold, 'negative', 'positive').tolist()

# Attach the raw score and its label to the review table
data['Score'] = score
data['Sentiment'] = label

data

Unnamed: 0,review_id,product,review,Clean,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Score,Sentiment
0,0,Assassin's Creed Origins Standard Edition,it's a great game for who likes a single player.,"['great', 'game', 'like', 'single', 'player']",0,1,0,0,1,0,0.705584,positive
1,0,Assassin's Creed Origins Standard Edition,uncountables missions and come back the old st...,"['uncountable', 'mission', 'come', 'back', 'ol...",1,0,0,0,0,0,0.331598,negative
2,1,Assassin's Creed Origins Standard Edition,i ordered a new copy and received a previously...,"['order', 'new', 'copy', 'receive', 'previousl...",0,0,0,0,1,0,0.257791,negative
3,1,Assassin's Creed Origins Standard Edition,the plastic of the case was broken.,"['plastic', 'case', 'break']",0,0,0,0,0,0,0.341962,negative
4,2,Assassin's Creed Origins Standard Edition,i have lots of fun playing this game.,"['lot', 'fun', 'playing', 'game']",0,1,0,0,0,0,0.331739,negative
...,...,...,...,...,...,...,...,...,...,...,...,...
120470,52446,Tony Hawk's Pro Skater 1 + 2,"accepting the steep learning curve, i would de...","['accept', 'steep', 'learning', 'curve', 'defi...",0,1,0,1,0,0,0.441534,negative
120471,52446,Tony Hawk's Pro Skater 1 + 2,others can probably give more details about ho...,"['other', 'probably', 'give', 'detail', 'eleme...",0,1,0,0,0,0,0.346834,negative
120472,52446,Tony Hawk's Pro Skater 1 + 2,i can't speak to that.,['speak'],0,0,0,0,0,0,0.265817,negative
120473,52446,Tony Hawk's Pro Skater 1 + 2,what i can say is that as a person who always ...,"['say', 'person', 'always', 'pass', 'interest'...",0,1,0,0,0,0,0.075322,negative


In [None]:
# Save the review table, including the topic, Score and Sentiment columns, to Excel without the index
data.to_excel(f'{gdrive_path}Assignment 1/PlayStation_Game_Reviews_Final.xlsx', index=False)

## Rank Top 5 Products With Highest Positive Sentiment for Each Topic

In [None]:
# Remove, in place, every row whose sentiment label is 'negative'
negative_mask = (data['Sentiment'] == 'negative').to_numpy()
negative_rows = data.index[negative_mask]
data.drop(index=negative_rows, inplace=True)

In [None]:
topics = ['Topic 0', 'Topic 1', 'Topic 2', 'Topic 3', 'Topic 4', 'Topic 5']

# Weight every topic flag by the sentence's sentiment score.
# The flag is re-derived as "non-zero" on each run, so executing this cell a
# second time gives the same values instead of multiplying by Score again.
for topic in topics:
    data[topic] = data[topic].ne(0).astype(int) * data['Score']

In [None]:
# Display names for the product column and the six topic columns
column_mapping = {'product': 'PlayStation Game',
                  'Topic 0': 'Genres',
                  'Topic 1': 'Mood',
                  'Topic 2': 'Game Experience',
                  'Topic 3': 'Price',
                  'Topic 4': 'Soundtrack',
                  'Topic 5': 'Graphics'}

# Topic columns in mapping order (everything except the grouping key)
topic_columns = [col for col in column_mapping if col != 'product']

# Mean of each topic column per product, then apply the display names
sentiment_topics = (
    data.groupby('product')[topic_columns]
    .mean()
    .reset_index()
    .rename(columns=column_mapping)
)
sentiment_topics

Unnamed: 0,PlayStation Game,Genres,Mood,Game Experience,Price,Soundtrack,Graphics
0,Assassin's Creed Origins Standard Edition,0.128742,0.442413,0.040628,0.050935,0.088607,0.105529
1,Batman-Return to Arkham Standard Edition,0.079191,0.438686,0.044327,0.089264,0.04481,0.100866
2,Call of Duty-Black Ops Cold War Standard Edition,0.065884,0.429765,0.040979,0.053145,0.067773,0.10388
3,Call of Duty-Modern Warfare II Standard Edition,0.068386,0.426535,0.043776,0.056888,0.055975,0.098342
4,DOOM Eternal Standard Edition,0.109752,0.407997,0.054125,0.052884,0.118373,0.086114
5,DRAGON BALL FighterZ Standard Edition,0.143866,0.482066,0.075896,0.061822,0.066765,0.100645
6,Days Gone,0.161514,0.460581,0.057766,0.058198,0.108703,0.109279
7,Death Stranding Standard Edition,0.164726,0.415096,0.032409,0.067648,0.109477,0.137443
8,Demon's Souls Standard Edition,0.098845,0.434421,0.065715,0.066837,0.077079,0.132813
9,Elden Ring Standard Edition,0.130373,0.43138,0.057593,0.057623,0.135585,0.073487


In [None]:
labelled_topics = ['Genres', 'Mood', 'Game Experience', 'Price', 'Soundtrack', 'Graphics']

# For each aspect, list the names of the five products with the largest mean
# value in that column; row 0 is the highest-ranked product
result_df = pd.DataFrame({
    f'Top 5 based on {aspect}': sentiment_topics.nlargest(5, aspect).reset_index()['PlayStation Game']
    for aspect in labelled_topics
})

result_df

Unnamed: 0,Top 5 based on Genres,Top 5 based on Mood,Top 5 based on Game Experience,Top 5 based on Price,Top 5 based on Soundtrack,Top 5 based on Graphics
0,Final Fantasy XVI Standard Edition,Little Big Planet 3,DRAGON BALL FighterZ Standard Edition,Grand Theft Auto V Premium Edition,Elden Ring Standard Edition,Ghost of Tsushima Director's Cut
1,Star Wars Jedi-Survivor Standard Edition,Sackboy-A Big Adventure Standard,Sekiro-Shadows Die Twice Game of the Year Edition,God of War III Remastered Standard Edition,Sackboy-A Big Adventure Standard,Horizon Forbidden West Launch Edition
2,Sekiro-Shadows Die Twice Game of the Year Edition,DRAGON BALL FighterZ Standard Edition,Marvel's Avengers,Batman-Return to Arkham Standard Edition,Tony Hawk's Pro Skater 1 + 2,Marvel's Spider-Man Miles Morales Standard Lau...
3,Marvel's Avengers,Marvel's Spider-Man Game of the Year Edition,Demon's Souls Standard Edition,Far Cry 6 Standard Edition,DOOM Eternal Standard Edition,Death Stranding Standard Edition
4,Ghost of Tsushima Director's Cut,Marvel's Spider-Man Miles Morales Standard Lau...,Star Wars Jedi-Survivor Standard Edition,Marvel's Spider-Man Game of the Year Edition,Hogwarts Legacy Standard Edition,Demon's Souls Standard Edition


In [None]:
# Export the per-aspect top-5 table to Excel without the index column
result_df.to_excel(f'{gdrive_path}Assignment 1/PlayStation_Game_Top_5.xlsx', index=False)