# SETUP

In [1]:
import pandas as pd
import numpy as np
import nltk
import time
import spacy
import pickle
sp = spacy.load('en_core_web_md')
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

# Load data


In [2]:
# Dataset directory: keep exactly one `wdir` assignment active.
# wdir="steam/"
# wdir="metacritic/"
wdir="AM_videogames/"

# Fixed seed so the shuffle below is identical on every "Restart & Run All";
# without it the row order (and every pickle derived from it) changes per run.
RANDOM_SEED = 42

# NOTE(review): read_pickle can execute arbitrary code on load; only open
# pickles that were produced by this project.
df = pd.read_pickle(wdir+"1_df.pkl")
print(df.shape)
# Number of distinct reviews per game.
print(df.groupby('game_title').review_id.nunique())
# Shuffle all rows (frac=1 keeps every row) and rebuild a fresh positional index.
df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

df.head(50)

(2585, 4)
game_title
God of War III                    601
Mario Kart                        472
StarCraft II: Wings of Liberty    370
The Last of Us                    549
diablo III                        593
Name: review_id, dtype: int64


Unnamed: 0,review_id,game_title,review_text,score
0,215323,The Last of Us,I got this as a gift for myself and I like it.,5.0
1,63636,Mario Kart,My 6 year old loves this game. Very kid freind...,5.0
2,50502,Mario Kart,I've been a LONG TIME Mario Kart fan since SNE...,5.0
3,51671,God of War III,Great game.,5.0
4,51595,StarCraft II: Wings of Liberty,Fantastic game! The game play wonderfully and...,5.0
5,215562,The Last of Us,Very happy with this game. It's extremely ente...,5.0
6,50386,Mario Kart,I was a big fan of Mario Kart for the Super NE...,5.0
7,65050,God of War III,EXCELLENT...,5.0
8,215206,The Last of Us,Really great game love it so much story is aw...,5.0
9,65302,God of War III,This is a good game but I find it difficult to...,3.0


# Preprocessing reviews

## Split reviews into sentences and compute sentence polarities

In [3]:
from nltk import tokenize # del
from sentiment_polarity import * 
# nltk.download('punkt')  # does not re-download if the package already exists
from stopwords import *

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\basil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# %%timeit -n 1 -r 1
# Turn the review-level frame into a sentence-level frame.
sentences_df = extract_sentences(df)

# Attach one polarity value per sentence, computed by getVaderPolarity.
sentences_df = sentences_df.assign(
    vader_polarity=sentences_df["sentence"].apply(getVaderPolarity)
)

print(sentences_df.shape)
# Persist this stage so the next cell can reload it from disk.
sentences_df.to_pickle(f"{wdir}2_df.pkl")

sentences_df.head(20)

2585
100% processed     
(15252, 4)


Unnamed: 0,review_id,game_title,sentence,vader_polarity
0,215323,The Last of Us,I got this as a gift for myself and I like it.,0.6597
1,63636,Mario Kart,My 6 year old loves this game.,0.5719
2,63636,Mario Kart,Very kid freindly and lots of fun... Lots of t...,0.8313
3,50502,Mario Kart,I've been a LONG TIME Mario Kart fan since SNE...,0.8854
4,50502,Mario Kart,It's about darn time Nintendo!!,0.0
5,50502,Mario Kart,The Mii feature is a very fun aspect of the Wi...,0.764
6,50502,Mario Kart,Another great feature is the ONLINE PLAY - whi...,0.9619
7,50502,Mario Kart,Being able to race strangers worldwide is very...,0.9459
8,50502,Mario Kart,One poor feature for me and my friends is the ...,0.2112
9,50502,Mario Kart,The remote and nunchuck are surely the way to ...,0.6486


## Stop words


In [5]:
# %%timeit -n 1 -r 1 # number of loop
# Reload the sentence-level frame written by the previous stage.
sentences_df = pd.read_pickle(f"{wdir}2_df.pkl")

# Distinct game titles found in the review frame.
games_list = df.game_title.unique()

# The game titles are handed to the helper together with the sentences.
proc_sent_df = remove_stop_prefixes(sentences_df, games_list)
print(proc_sent_df.shape)

proc_sent_df.to_pickle(f"{wdir}3_df.pkl")
# Only a cell's last expression is rendered, so this head() is evaluated but not shown.
proc_sent_df.head()
games_list

100% processed       % processed    % processed    % processed
(15252, 5)


array(['The Last of Us', 'Mario Kart', 'God of War III',
       'StarCraft II: Wings of Liberty', 'diablo III'], dtype=object)

## POS + Lemmatization
TB and WN take about 30 seconds for 20k sentences.
SP is much slower: roughly 60x longer (about 30 minutes).




In [6]:
from POSLemma import *

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\basil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\basil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\basil\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [7]:
# %%timeit -n 1 -r 1 # number of loop
# POS/lemmatization method code; the options listed by the author are WN, TB and SP.
POSLemmaChoice = "SP"
proc_sent_df = pd.read_pickle(f"{wdir}3_df.pkl")

# Run the extractor with the selected method on the stop-word-filtered sentences.
POSLemma_sent_df = POSLemma_extractor(proc_sent_df, POSLemmaChoice)

# Save the POSLemma dataframe; the file name records which method produced it.
POSLemma_sent_df.to_pickle(f"{wdir}4_{POSLemmaChoice}_df.pkl")
print(POSLemma_sent_df.head(5))
print(POSLemma_sent_df.shape)

7  % processed

KeyboardInterrupt: 

# Create FOPs and compute importance

## FOPs extraction

In [9]:
from FOPs import * 

In [12]:
# Reload the POS/lemma frame saved for the currently selected method and preview it.
FOPs_df = pd.read_pickle(f"{wdir}4_{POSLemmaChoice}_df.pkl")
FOPs_df.head()

Unnamed: 0,review_id,game_title,sentence,vader_polarity,processed_sentence,tags
0,59465,diablo III,It's impossible to decide where to begin.,0.0,impossible decide begin,"[(impossible, a), (decide, n), (begin, v)]"
1,59465,diablo III,The only reason this game sold so well is beca...,0.9465,reason sell ii love brilliant gaming,"[(reason, n), (sold, v), (ii, None), (loved, v..."
2,59465,diablo III,The developers keep using this as some ridicul...,0.296,developer ridiculous sign success,"[(developers, n), (ridiculous, a), (sign, n), ..."
3,59465,diablo III,Diablo III is garbage.,0.0,iii garbage,"[(iii, None), (garbage, n)]"
4,59465,diablo III,We only bought Diablo III because Diablo II wa...,0.5859,buy iii ii brilliant decent,"[(bought, v), (iii, n), (ii, None), (brilliant..."


In [None]:
# %%timeit -n 1 -r 1 # number of loop
"""
Build the FOPs dictionary for one game and save it.

Input: the dataframe of sentences with POS tags (loaded from the pickle written
for the chosen POS method) and the game title.
Output: a dictionary of FOPs with their occurrence and importance
(occurrence * sentence polarity), structured as:
    {"feature1": {
        "opinion1": {"count": value, "importance": value},
        "opinion2": {"count": value, "importance": value}}}
"""

POSLemmaChoice = "SP"
FOPs_df = pd.read_pickle(f"{wdir}4_{POSLemmaChoice}_df.pkl")
# print(FOPs_df.groupby('game_title').review_id.nunique())
games_list = df.game_title.unique()

# Show the available titles; chosen_game below is set by hand.
print(games_list)
chosen_game = "diablo III"

FOPs_game_dic = extract_all_FOPs(FOPs_df, chosen_game)

# Hand the dictionary to the CSV writer together with the output directory and title.
save_FOPs_dic_csv(FOPs_game_dic, wdir, chosen_game)


In [None]:
# FOPs_game_dic

## FOPs count and importance results
It is also possible to select only negative or only positive FOPs, or to select specific feature(s).

In [None]:
# Load the FOPs CSV for the chosen game and derive the ranking columns in one chain.
FOPs_df = (
    pd.read_csv(f"{wdir}{chosen_game}_FOPs.csv")
    # Both raw totals are halved before anything else is computed.
    .assign(
        count=lambda frame: frame["count"] / 2,
        importance=lambda frame: frame["importance"] / 2,
    )
    # Polarity is the signed importance per occurrence, taken before the sign is dropped.
    .assign(polarity=lambda frame: frame["importance"] / frame["count"])
    # From here on importance is a magnitude only.
    .assign(importance=lambda frame: frame["importance"].abs())
    .sort_values(by="importance", ascending=False)
    .reset_index(drop=True)
)

# Optional filter kept from the original cell (disabled):
# maxCount = FOPs_df.iloc[0]["count"]
# FOPs_df = FOPs_df.loc[FOPs_df["count"] > maxCount/20]
# FOPs_df = FOPs_df.sort_values('importance', ascending=False)
print(FOPs_df.shape)
FOPs_df.head(20)

In [None]:
# TODO: produce the 20-40 charts ranked by importance,
# and save all of them.



In [None]:
# TODO: information retrieval



In [None]:
# Re-rank the FOPs by occurrence count (largest first), renumber the rows, and preview five.
FOPs_df = (
    FOPs_df
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
)

FOPs_df.head(5)