In [1]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import  TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models, similarities, matutils

In [2]:
digital_videogames_df = pd.read_pickle('digital_videogames_df.pkl')

In [4]:
digital_videogames_df.star_rating.value_counts(normalize=True)

5    0.554379
1    0.171692
4    0.140460
3    0.080077
2    0.053391
Name: star_rating, dtype: float64

In [5]:
# removing 3 star ratings:
digital_videogames_df = digital_videogames_df[digital_videogames_df.star_rating!=3]

In [6]:
# creating sentiment analysis
digital_videogames_df['sentiment'] = np.where(digital_videogames_df['star_rating'] >= 4, 'positive', 'negative')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  digital_videogames_df['sentiment'] = np.where(digital_videogames_df['star_rating'] >= 4, 'positive', 'negative')


In [7]:
digital_videogames_df.sentiment.value_counts(normalize=True)

positive    0.755324
negative    0.244676
Name: sentiment, dtype: float64

In [8]:
# dropping all rows that are determined to be non-english
digital_videogames_df = digital_videogames_df.loc[digital_videogames_df['review_language']=='en'].reset_index(drop=True)

In [9]:
digital_videogames_df[digital_videogames_df['product_title'].str.contains("Battlefield")]

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,review_body_clean,review_language,sentiment
193,US,38322383,R14Y5ZK1QWIH85,B00KSLNI82,135925655,Battlefield Hardline,Digital_Video_Games,5,0,0,N,N,I totally disliked the beta of this game,I totally disliked the beta of this game. I or...,2015-08-29,i totally disliked the beta of this game i or...,en,positive
278,US,16180464,R1RH1XFEGV9C0E,B00480OTRS,196465830,Battlefield 2 Complete Collection,Digital_Video_Games,2,0,0,N,Y,"Weak, you can pass this up, save your ...","Weak,you can pass this up,save your money,and ...",2015-08-28,weak you can pass this up save your money and ...,en,negative
412,US,39593156,RAB4CHAGDGYH7,B00452VG02,959055547,Battlefield: Bad Company 2,Digital_Video_Games,5,0,0,N,Y,Five Stars,"Awesome, Beautiful game",2015-08-25,awesome beautiful game,en,positive
420,US,39593156,R1GEG2UVZI1911,B00BXONG7G,240448759,Battlefield 4,Digital_Video_Games,5,0,0,N,Y,Five Stars,"Awesome, Beautiful game",2015-08-25,awesome beautiful game,en,positive
457,US,39593156,R1RWDWUYZD7ZIA,B0087STJLS,296282987,Battlefield 3: Premium Season Pass,Digital_Video_Games,5,0,0,N,Y,Five Stars,"Awesome, Beautiful game",2015-08-25,awesome beautiful game,en,positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119413,US,40733899,RVTS3F5TY1C0Q,B00452VG02,959055547,Battlefield: Bad Company 2,Digital_Video_Games,5,1,1,N,Y,Works Fine,"Game Works Fine Both Online Multi,(Which the B...",2010-11-29,game works fine both online multi which the b...,en,positive
119478,US,29027478,R2O3VNQ360CBJS,B00452VG02,959055547,Battlefield: Bad Company 2,Digital_Video_Games,5,0,0,N,Y,peeka choo,When I played the multiplayer I was worried wh...,2010-11-23,when i played the multiplayer i was worried wh...,en,positive
119510,US,51605174,R2VXJAA20ZMV3K,B00452VG02,959055547,Battlefield: Bad Company 2,Digital_Video_Games,5,7,9,N,Y,This is a Black Friday's bargain,I do not know why the cd-key does not work for...,2010-11-19,i do not know why the cd key does not work for...,en,positive
119524,US,39599508,RXXIM062Z3V14,B00452VG02,959055547,Battlefield: Bad Company 2,Digital_Video_Games,1,5,16,N,N,Never buy used!!!,This game crashes and the code is not transfer...,2010-11-18,this game crashes and the code is not transfer...,en,negative


In [10]:
digital_videogames_df[digital_videogames_df['product_title'].str.contains("Call of Duty")]

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,review_body_clean,review_language,sentiment
168,US,41199441,R2I060426U0BSZ,B00GGUVS5Y,898333217,Call of Duty: Ghosts,Digital_Video_Games,1,0,0,N,Y,One Star,"sucks, sorry I bought it",2015-08-29,sucks sorry i bought it,en,negative
191,US,2004884,R2VT1DHNI32I15,B00JJY2DKI,156164859,Call of Duty: Black Ops II Weaponized 115 Pack...,Digital_Video_Games,4,3,4,N,N,Four Stars,4🌟for the pack content but -1 for PC download ...,2015-08-29,🌟for the pack content but for pc download ...,en,positive
300,US,134399,RUID994N4KYDX,B00ATF5YYI,559945646,Call of Duty: World at War,Digital_Video_Games,5,0,0,N,Y,GET CUSTOM ZOMBIE MAPS,This game provides an endless amount of fun wh...,2015-08-27,this game provides an endless amount of fun wh...,en,positive
372,US,12521601,RUWT4NRBSZ9QD,B00GGUHB32,503156433,Call of Duty Black Ops II: Apocalypse DLC,Digital_Video_Games,5,0,0,N,Y,Five Stars,Download was smooth.Works well.,2015-08-26,download was smooth works well,en,positive
400,US,12521601,R2LN07V4FTN8IB,B00GGUHLXW,473331713,Call of Duty Black Ops II: Vengeance DLC,Digital_Video_Games,5,1,1,N,Y,Five Stars,Download was smooth.Works well.,2015-08-26,download was smooth works well,en,positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115762,US,37846542,RAIPQ4NXAQB68,B004YNEEG0,670797969,Call of Duty 4: Modern Warfare,Digital_Video_Games,5,1,1,N,Y,Fun!,"The nultiplayer is great, if you have a 2010-2...",2011-08-02,the nultiplayer is great if you have a ma...,en,positive
116177,US,37841042,R4YBZ47SN2WI7,B004YNIJ7U,485302177,Call of Duty 2,Digital_Video_Games,5,0,0,N,Y,Great Game,This game is one of the best games I have play...,2011-07-07,this game is one of the best games i have play...,en,positive
116491,US,11404837,R2WUONSNLYISC5,B004YNEEG0,670797969,Call of Duty 4: Modern Warfare,Digital_Video_Games,2,0,6,N,Y,"Graphics excellent, but the game is not exciti...","I downloaded this game from AMAZON, it took ab...",2011-06-18,i downloaded this game from amazon it took ab...,en,negative
116528,US,12161817,R2674EM3JMY0A2,B004YNEEG0,670797969,Call of Duty 4: Modern Warfare,Digital_Video_Games,4,3,3,N,Y,Great Maps and Multiplayer,I've spent a lot of time playing the PC versio...,2011-06-17,i ve spent a lot of time playing the pc versio...,en,positive


In [11]:
digital_videogames_df[~digital_videogames_df['product_title'].str.contains("Card")].product_title.value_counts()

Xbox Live Subscription                                                       6462
Playstation Plus Subscription                                                3688
SimCity - Limited Edition                                                    3190
Battlefield 4                                                                1208
Final Fantasy XIV: A Realm Reborn                                            1081
                                                                             ... 
Ice Doodle Game! [Download]                                                     1
Hoard [Online Game Code]                                                        1
PlayStation Now Subscription Twister Parent                                     1
Strange Adventures Collector’s Edition Vol 1 (Mystery Masters) [Download]       1
SpaceChem                                                                       1
Name: product_title, Length: 6515, dtype: int64

In [12]:
# dropping all reviews below 20 characters
digital_videogames_df = digital_videogames_df[digital_videogames_df['review_body_clean'].map(len) > 20].reset_index(drop=True)

In [None]:
digital_videogames_df

In [13]:
# no null values in review_body column
digital_videogames_df.review_body.isnull().values.any()

False

In [14]:
# no null values in review_body_clean column
digital_videogames_df.review_body_clean.isnull().values.any()

False

In [15]:
digital_videogames_df.review_body_clean.sample(1000)

52864     theres not to much to say about this it is wha...
43573     what can i say  i got this on sale for    whic...
108989    dark void is a game made by some of the same p...
55546     dont download unless you have a really good co...
28238                    great vidyah game at a good price 
                                ...                        
13380                       geat value when on sale        
31400                              great game    recomended
30781                     the code doesn t work in my xbox 
26579               excellent service  with the right price
70261     its a game i sit down to play with a few frien...
Name: review_body_clean, Length: 1000, dtype: object

In [None]:
digital_videogames_df.review_body_clean[0]

In [31]:
stopwords = nltk.corpus.stopwords.words('english')
blockerwords = ['br', 'like', 'really', 'also', 'much', 'well', 'recommend', 'good', 'many', 'plus', 
                'hours', 'love', 'lot', 'well', 'great', 'first', 'two', 'ever', 'thing', 'one', 
               'xbox', 'live', 'psn', 'time', 'new', 'even', 'still', 'way', 'could', 'go', 'back',
               'even', 'want', 'game', 'games', 'would', 'better', 'play', 'played', 'playing', 'best'
               'get', 'got', 'little', 'make', 'find', 'product', 'amazon', 'awesome', 'far']
stopwords.extend(blockerwords)

In [32]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# document term matrix with CountVectorizer

# vectorizer = CountVectorizer(max_df = 0.95, min_df=3, stop_words=stopwords, ngram_range=(1,1))

vectorizer = TfidfVectorizer(max_df=0.95, min_df=3, stop_words = stopwords, ngram_range=(1,2))

doc_word = vectorizer.fit_transform(digital_videogames_df.review_body_clean)

In [33]:
pd.DataFrame(doc_word.toarray(), index=digital_videogames_df.review_body_clean, columns = vectorizer.get_feature_names()).head(10)

Unnamed: 0_level_0,aa,aa af,aa forced,aaa,aaa developer,aaa examples,aaa gaming,aaa graphics,aaa indie,aaa quality,...,zuma type,zune,zune music,zune pass,zwei,zynga,ítem,über,재미,재미 thumbs
review_body_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
i keep buying madden every year hoping they get back to football this years version is a little better than last years but that s not saying much the game looks great the only thing wrong with the animation is the way the players are always tripping on each other br br the gameplay is still slowed down by the bloated pre play controls what used to take two buttons is now a giant pita to get done before an opponent snaps the ball or the play clock runs out br br the turbo button is back but the player movement is still slow and awkward if you liked last years version i m guessing you ll like this too i haven t had a chance to play anything other than training and a few online games so i m crossing my fingers and hoping the rest is better br br the one thing i can recommend is not to buy the madden bundle the game comes as a download so if you hate it there s no trading it in at gamestop,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
if you are prepping for the end of the world this is one of those things that you should have installed on your end of the world proof pc hail to the great yuri,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
i like the new skills like herbalism in this and camping is fun i also like all the new build mode items,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
excellent fast and secure,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
as has been written by so many others i quickly lost interest in this game i am still playing civ and love it it s a shame because i m ready for an expanded version of civ and have waited for about a decade for a better version of it civ was not an evolution but a total rewrite and it lost all that was good in civ i really hope that when civ comes out they use civ as the starting point and forget civ ever happened failing that there is a place in the market for a strategy game that involves building a civilisation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
probably the best game for learning aspects of real estate available hipsoft really hit the ball out of the park with this one with high educational value as well as an entertaining game in terms of leading you through the basics of real estate development even though this is several years old know the availability of downloads and apps for this game means this is still a must have for budding real estate moguls of tomorrow,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cool but it lages alot of the time,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lames purchase i almost never made,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pretty good but not as good as the first brink of consciousness game dorian br gray,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
what can i say xbox live,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
vocab = vectorizer.get_feature_names()

In [35]:
len(vocab)

206497

## LSA

In [36]:
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [37]:
# lsa with 8 topics
lsa = TruncatedSVD(10)
doc_topic = lsa.fit_transform(doc_word)

In [38]:
lsa.explained_variance_ratio_

array([0.00106129, 0.00295595, 0.00216917, 0.00186776, 0.00179997,
       0.00161389, 0.00160104, 0.00151235, 0.00143621, 0.00140637])

In [39]:
topic_word = pd.DataFrame(lsa.components_.round(3), index=list(range(1,11)), columns=vectorizer.get_feature_names())

In [40]:
topic_word

Unnamed: 0,aa,aa af,aa forced,aaa,aaa developer,aaa examples,aaa gaming,aaa graphics,aaa indie,aaa quality,...,zuma type,zune,zune music,zune pass,zwei,zynga,ítem,über,재미,재미 thumbs
1,0.001,0.0,0.0,0.002,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.001,-0.0,-0.0,-0.001,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,...,0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0
3,0.0,0.0,0.0,0.001,-0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0
4,-0.001,-0.0,-0.0,-0.002,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
5,-0.001,-0.0,-0.0,-0.002,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,...,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
6,-0.001,-0.0,-0.0,-0.001,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,...,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0
7,-0.001,-0.0,-0.0,0.001,-0.0,0.0,0.0,-0.0,0.0,0.0,...,0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0
8,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,...,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0
9,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,...,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0
10,0.0,0.0,0.0,0.001,-0.0,-0.0,0.0,0.0,-0.0,-0.0,...,-0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0


In [41]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [42]:
display_topics(lsa, vectorizer.get_feature_names(), 10)


Topic  0
get, code, fun, buy, download, easy, work, money, card, price

Topic  1
code, card, worked, easy, fast, buy, store, get code, purchase, online

Topic  2
fun, easy, code, fast, best, story, price, card, lots, graphics

Topic  3
download, easy, fun, fast, fast easy, quick, easy download, quick easy, easy use, install

Topic  4
fun, code, work, worked, code work, code worked, lots, lots fun, computer, windows

Topic  5
money, get, card, waste, work, buy, waste money, fun, easy, credit

Topic  6
price, buy, fun, money, worth, waste, waste money, deal, best, worth money

Topic  7
card, download, buy, credit, credit card, gift, fun, gift card, store, account

Topic  8
get, work, price, fun, best, get code, get work, card, download, steam

Topic  9
best, work, buy, windows, code work, work windows, fast, worked, computer, works


## NMF

In [67]:
nmf_model = NMF(8)

doc_topic = nmf_model.fit_transform(doc_word)



In [68]:
topic_word = pd.DataFrame(nmf_model.components_.round(3), 
                         index=list(range(1,9)), 
                         columns = vectorizer.get_feature_names())

topic_word

Unnamed: 0,aa,aa af,aa forced,aaa,aaa developer,aaa examples,aaa gaming,aaa graphics,aaa indie,aaa quality,...,zuma type,zune,zune music,zune pass,zwei,zynga,ítem,über,재미,재미 thumbs
1,0.013,0.001,0.0,0.018,0.001,0.0,0.0,0.001,0.0,0.001,...,0.0,0.001,0.0,0.0,0.0,0.001,0.0,0.0,0.001,0.001
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0
5,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.002,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0
7,0.001,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.001,0.0,...,0.001,0.001,0.0,0.0,0.001,0.0,0.001,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


much better results!

In [69]:
display_topics(nmf_model, vectorizer.get_feature_names(), 10)


Topic  0
get, story, graphics, people, pc, gameplay, pretty, different, player, think

Topic  1
code, card, worked, online, buy, get code, store, purchase, get, instantly

Topic  2
download, steam, install, computer, origin, downloaded, easy download, able, downloading, tried

Topic  3
easy, fast, fast easy, quick, use, quick easy, easy use, buy, easy buy, easy fast

Topic  4
fun, lots, lots fun, challenging, fun challenging, friends, fun fun, enjoy, worth, levels

Topic  5
money, waste, waste money, buy, worth, worth money, get money, save, save money, wasted

Topic  6
price, best, deal, worth, buy, sale, steam, worth price, bundle, pack

Topic  7
work, windows, get, get work, computer, work windows, code work, work computer, tried, downloaded


### Topic Interpretation:

0. Gameplay/Graphics
1. Product Code Delivery
2. Game Download / Installation
3. Ease/Speed of Transaction
4. Game Challenge
5. Waste of Money
6. Good Price
7. Compatibility

In [71]:
doc_topic[24]

array([0.00373978, 0.        , 0.        , 0.        , 0.05446607,
       0.        , 0.        , 0.        ])

In [73]:
digital_videogames_df['Topic'] = doc_topic.argmax(axis=1)

names = {0:'Gameplay/Graphics', 1:'Product Delivery', 2:'Game Download/Installation', 3:'Ease/Speed of Transaction', 4:'Game Challenge', 5:'Waste of Money', 6:'Good Price', 7:'Compatibility'}

digital_videogames_df['Topic_Name'] = digital_videogames_df['Topic'].map(names)

In [76]:
digital_videogames_df[['review_body_clean', 'Topic_Name']].head(20)

Unnamed: 0,review_body_clean,Topic_Name
0,i keep buying madden every year hoping they ge...,Gameplay/Graphics
1,if you are prepping for the end of the world t...,Gameplay/Graphics
2,i like the new skills like herbalism in this ...,Game Challenge
3,excellent fast and secure,Ease/Speed of Transaction
4,as has been written by so many others i quick...,Gameplay/Graphics
5,probably the best game for learning aspects of...,Gameplay/Graphics
6,cool but it lages alot of the time,Gameplay/Graphics
7,lames purchase i almost never made,Gameplay/Graphics
8,pretty good but not as good as the first brink...,Gameplay/Graphics
9,what can i say xbox live,Gameplay/Graphics


In [78]:
digital_videogames_df['Topic_Name'].value_counts()

Gameplay/Graphics             45111
Product Delivery              17145
Game Download/Installation    11760
Game Challenge                11505
Good Price                    10834
Compatibility                  7122
Ease/Speed of Transaction      6390
Waste of Money                 6228
Name: Topic_Name, dtype: int64

In [79]:
digital_videogames_df.to_pickle("digital_videogames_df_topics.pkl")

Ignore Below (Possible Future Work)

In [None]:
from gensim.utils import simple_preprocess

In [None]:
def word_list(sentences):
    for sentence in sentences:
        yield(simple_preprocess(str(sentence), deacc=True))

In [None]:
def clean_text(text):
    text_list = simple_preprocess(str(text), deacc=True)
    return " ".join([word for word in text_list if word not in blocker_words and word not in stop_words])
#     text = " ".join([word for word in text if word not in stop_words])
#     return text

In [None]:
data = list(digital_videogames_df['review_body_clean'].values)

data_words = list(word_list(data))

In [None]:
data_words

In [None]:
stop_words = nltk.corpus.stopwords.words('english')
blocker_words = ['br']

In [None]:
clean_text(test_string)

In [None]:
digital_videogames_df['review_body_clean'] = digital_videogames_df['review_body_clean'].apply(lambda x: clean_text(x))

In [None]:
digital_videogames_df

In [None]:
corpus = digital_videogames_df['review_body_clean']

In [None]:
tfidf = TfidfVectorizer()

review_word_matrix = tfidf.fit_transform(corpus)

vocab = tfidf.get_feature_names()

In [None]:
len(vocab)

In [None]:
vocab

In [None]:
nmf = NMF(n_components=5)
nmf.fit(review_word_matrix)

### Review/Topic Matrix

In [None]:
review_topic_matrix = nmf.transform(review_word_matrix)

In [None]:
review_topic_matrix_df = pd.DataFrame(review_topic_matrix).add_prefix('topic_')

review_topic_matrix_df[['review_body', 'review_body_clean']] = digital_videogames_df[['review_body', 'review_body_clean']]
# review_topic_matrix_df[['review_body_clean']] = digital_videogames_df[['review_body_clean']]

In [None]:
review_topic_matrix_df.head()

### Word/Topic Matrix

In [None]:
word_topic_matrix_df = pd.DataFrame(nmf.components_, columns=vocab).T.add_prefix('topic_')
word_topic_matrix_df.head()

### Topic Interpretation

In [None]:
for review in review_topic_matrix_df.sort_values(by='topic_3', ascending=False).head(10)['review_body_clean'].values:
    print(review, '\n')

In [None]:
word_topic_matrix_df.sort_values(by='topic_4', ascending=False).head(10)

In [None]:
def top_reviews(review_topic_matrix_df, topic, n_reviews):
    return (review_topic_matrix_df.sort_values(by=topic, ascending=False).head(n_reviews)['review_body_clean'].values)

def top_words(word_topic_matrix_df, topic, n_words):
    return (word_topic_matrix_df.sort_values(by=topic, ascending=False).head(n_words))[topic]

1. decide on minimum word count
2. if len(string) is less than 20
3. compare different products (games) - what are the reviews of one product about, vs the other
4. remove filler words

In [None]:
for review in top_reviews(review_topic_matrix_df, 'topic_1', 15):
    print(review)

In [None]:
for word in top_words(word_topic_matrix_df, 'topic_1', 10):
    print(word)

In [None]:
X = digital_videogames_df.review_body
y = digital_videogames_df.sentiment

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, 
#                                                    random_state=24)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
cv = CountVectorizer(stop_words='english')
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)


# pd.DataFrame(X_train_cv.toarray(), columns=cv.get_feature_names()).head()
nb = MultinomialNB()
nb.fit(X_train_cv, y_train)
nb.score(X_test_cv, y_test)

In [None]:
X_train_cv.shape

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_cv, y_train)
y_pred_cv = lr.predict(X_test_cv)

lr.score(X_test_cv, y_test)

In [None]:
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import  TfidfVectorizer 

In [None]:
tfidf = TfidfVectorizer(stop_words='english')

pca = PCA()

X_train_tfidf = tfidf.fit_transform(X_train)
X_train_tfidf = pca.fit_transform(X_train_tfidf.toarray())


# X_test_tfidf = tfidf.transform(X_test)

# nb.fit(X_train_tfidf, y_train)
# nb.score(X_test_tfidf, y_test)

In [None]:
def show_variance_explained_plots(pca):
    
    var_exp_array = pca.explained_variance_ratio_
    n_comps = var_exp_array.shape[0] 
    
    fig, ax = plt.subplots(1,2,figsize=(10,4))
    
    ax[0].fill_between(range(n_comps), var_exp_array)
    ax[0].set_title('Variance Explained by Nth Principal Component')
    
    ax[1].fill_between(range(n_comps), np.cumsum(var_exp_array))
    ax[1].set_title('Cumulative Variance Explained by N Components')
    
    plt.show()

In [None]:
show_variance_explained_plots(pca)