# SETUP

In [13]:
import pandas as pd
import numpy as np
import nltk
import time
import spacy
import pickle
sp = spacy.load('en_core_web_md')
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

# Load data


In [15]:
# wdir="steam/"
# wdir="metacritic/"
wdir="AM_videogames/"
df = pd.read_pickle(wdir+"1_df.pkl")
print(df.shape)
print(df.groupby('game_title').review_id.nunique())
df = df.sample(frac=1).reset_index(drop=True) # random sampling and resetting index

df.head(50)

(2585, 4)
game_title
God of War III                    601
Mario Kart                        472
StarCraft II: Wings of Liberty    370
The Last of Us                    549
diablo III                        593
Name: review_id, dtype: int64


Unnamed: 0,review_id,game_title,review_text,score
0,59292,diablo III,Perfect,5.0
1,63575,Mario Kart,Tons of fun!,5.0
2,51574,StarCraft II: Wings of Liberty,Great game but too short. Great deal for fift...,5.0
3,50465,Mario Kart,This game is not nearly as fun as Mario Kart D...,3.0
4,65493,God of War III,Let me start by saying that I have played the ...,5.0
5,64934,God of War III,Been wanting this forever and it was worth the...,5.0
6,215219,The Last of Us,"Such a great game, I just started playing it a...",5.0
7,72845,diablo III,Done in a few hard core nights.\n\n1) single p...,3.0
8,59315,diablo III,"A fun game, but you'll want the expansions to ...",4.0
9,215349,The Last of Us,Finally get to finish this fantastic game. It'...,5.0


# Preprocessing reviews

## split reviews in sentences + compute sentence polarities

In [5]:
from nltk import tokenize # del
from sentiment_polarity import * 
# nltk.download('punkt') # doesnt re download is exists ) 
from stopwords import *

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\basil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# %%timeit -n 1 -r 1 
sentences_df = extract_sentences(df)


sentences_df['vader_polarity'] = sentences_df["sentence"].apply(getVaderPolarity)


print(sentences_df.shape)
sentences_df.to_pickle(wdir+"2_df.pkl")

sentences_df.head(20)

2585
100% processed     
(15252, 4)


Unnamed: 0,review_id,game_title,sentence,vader_polarity
0,59292,diablo III,Perfect,0.5719
1,63575,Mario Kart,Tons of fun!,0.5562
2,51574,StarCraft II: Wings of Liberty,Great game but too short.,0.3716
3,51574,StarCraft II: Wings of Liberty,Great deal for fifteen dollars.,0.6249
4,51574,StarCraft II: Wings of Liberty,Wasn't that hard to log on to Blizzard.,0.0762
5,51574,StarCraft II: Wings of Liberty,Highly recommend.,0.4201
6,50465,Mario Kart,This game is not nearly as fun as Mario Kart D...,-0.7493
7,50465,Mario Kart,Another thing I don't like is that there are n...,-0.5122
8,50465,Mario Kart,"The online play is the game's saving grace, ot...",0.6369
9,50465,Mario Kart,"Online play is well executed, no noticeable la...",-0.2361


## Stop words


In [7]:
# %%timeit -n 1 -r 1 # number of loop
sentences_df = pd.read_pickle(wdir+"2_df.pkl")

games_list=df['game_title'].unique()

proc_sent_df = remove_stop_prefixes(sentences_df,games_list)
print(proc_sent_df.shape)


proc_sent_df.to_pickle(wdir+"3_df.pkl")
proc_sent_df.head()
games_list

100% processed       % processed    % processed    % processed
(15252, 5)


array(['Mario Kart', 'The Last of Us', 'diablo III', 'God of War III',
       'StarCraft II: Wings of Liberty'], dtype=object)

## POS + Lemmatization
TB and WN, 30 sec for 20k sentences
SP is way longer, 60x longer (30 min )




In [8]:
from POSLemma import *

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\basil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\basil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\basil\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [9]:
# # %%timeit -n 1 -r 1 # number of loop
# POSLemmaChoice ="SP" # WN TB SP
# proc_sent_df = pd.read_pickle(wdir+"3_df.pkl")


# POSLemma_sent_df = POSLemma_extractor(proc_sent_df,POSLemmaChoice)


# ## Save POSLemma Dataframe
# POSLemma_sent_df.to_pickle(wdir+"4_"+POSLemmaChoice+"_df.pkl")
# print(POSLemma_sent_df.head(5))
# print(POSLemma_sent_df.shape)

# Create FOPs and compute importance

## FOPs extraction

In [10]:
from FOPs import * 

In [11]:
# %%timeit -n 1 -r 1 # number of loop
"""
input: Dataframe of sentences with POS tags, chosen POS method and game title. 
Output: a dictionary of FOPs with their occurence and importance(occurence*sentence polarity) , structured as: 
        {"feature1": 
            {"opinion1": 
                { "count": value, 
                "importance":value } }
            {"opinion2": 
                { "count": value, 
                "importance":value } }
"""

POSLemmaChoice="SP"
FOPs_df = pd.read_pickle(wdir+"4_"+POSLemmaChoice+"_df.pkl")
# print(FOPs_df.groupby('game_title').review_id.nunique())
games_list=df['game_title'].unique()


print(games_list)
chosen_game="diablo III"


FOPs_game_dic = extract_all_FOPs(FOPs_df, chosen_game)

save_FOPs_dic_csv(FOPs_game_dic, wdir,chosen_game )


['Mario Kart' 'The Last of Us' 'diablo III' 'God of War III'
 'StarCraft II: Wings of Liberty']
4864  sentences for the game  diablo III
99  % processed

In [18]:
# FOPs_game_dic

## count and importance FOPs results
Can also select only negatives or only positive, select specific feature(s)

In [25]:
FOPs_df = pd.read_csv(wdir+chosen_game+"_FOPs.csv")  # Creation of the dataframe 

FOPs_df["count"]=FOPs_df["count"]/2
FOPs_df["importance"]=FOPs_df["importance"]/2
FOPs_df["polarity"] = FOPs_df["importance"]/FOPs_df["count"]
FOPs_df["importance"]= abs(FOPs_df["importance"])


FOPs_df = FOPs_df.sort_values('importance', ascending=False)



FOPs_df=FOPs_df.reset_index(drop=True)
# maxCount = FOPs_df.iloc[0]["count"]
# FOPs_df = FOPs_df.loc[FOPs_df["count"] > maxCount/20]
# FOPs_df = FOPs_df.sort_values('importance', ascending=False)
print(FOPs_df.shape)
FOPs_df.head(20)

(23005, 5)


Unnamed: 0,feature,opinion,count,importance,polarity
0,items,lettered,22.0,12.6732,0.576055
1,items,better,11.0,8.1235,0.7385
2,auction,good,10.0,5.7823,0.57823
3,house,good,10.0,5.7823,0.57823
4,time,good,8.0,5.7815,0.722688
5,farm,shotted,6.0,5.724,-0.954
6,fun,shotted,6.0,5.724,-0.954
7,graphics,good,10.0,5.672,0.5672
8,gameplay,terrible,6.0,5.478,0.913
9,bonuses,set,6.0,5.4252,0.9042


In [None]:
sortir les 20-40 graphiques selon l'importance, 
tous les sauver  blabla 



In [None]:
information retrieval 



In [11]:
FOPs_df = FOPs_df.sort_values('count', ascending=False)
FOPs_df=FOPs_df.reset_index(drop=True)

FOPs_df.head(5)

Unnamed: 0,feature,opinion,count,importance,intensity,polarity
0,player,single,102,8.7512,0.085796,negative
1,money,real,78,0.452,0.005795,negative
2,auction,real,56,3.0648,0.054729,negative
3,house,real,52,2.7552,0.052985,negative
4,items,lettered,44,25.3464,0.576055,positive
