## import library requirements

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Pickle text data for generating text
import pickle

# polarity and subjectivity sentiment
from textblob import TextBlob

# create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

# topic modeling (LDA) with gensim
from gensim import matutils, models
import scipy.sparse

# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict

import random

# text cleaning techniques
import re
import string

# stopword cleaning and tokenizing
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# pull out nouns from a string of text
from nltk import pos_tag

# corpus download
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# column width Visual adjustment
pd.set_option('max_colwidth',150)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Preview Initial Data

In [2]:
df = pd.read_csv("/content/gamestop_product_reviews_dataset_sample.csv")
df.head()


FileNotFoundError: ignored

# preview data


In [None]:

print("number of reviews", df.shape[0])
print("number of columns", df.shape[1])

In [None]:
df.columns

In [None]:
# keep only the columns used by the rest of the analysis
selected_columns = [
    'name', 'brand', 'review_title', 'review_description',
    'recommended_review', 'rating', 'average_rating', 'reviews_count',
]
review_data = df[selected_columns]
# show which features were retained
print("selected review features for data acquisition", list(review_data.columns))


format observation

In [None]:
# check brand column
print("number of brands", len(review_data.brand.unique()))
review_data.brand.unique()

In [None]:
# check column recomended review format 
review_data.recommended_review.unique()

In [None]:
# check rating column format
review_data.rating.unique()

In [None]:
review_data.columns

# Data acquisition

* Binarize the recommended review column into yes/no indicator columns

* Aggregate the sum of the recommended boolean values and calculate a positive recommended sentiment

* Aggregate the remaining columns by name (average rating, review count and brand)

* Aggregate text columns by name from the main data source

* Concat datasets and save acquired data

> Binarize the recommended review column

In [None]:

    
# order the reviews alphabetically by product name and index them by it
sorting_data = review_data.sort_values('name', ascending = True).set_index('name')

# keep only the text before the first ':' of each recommendation
# (presumably the yes/no answer; verify against the raw column values)
temp = sorting_data['recommended_review'].str.split(':', expand=True)[[0]]
# name the new dataframe column
temp.columns = ['recommended_review']
# preview
temp.head()

In [None]:
# One-hot encode the recommendation answer: one indicator column per distinct
# value, named 'recommended_review_<value>'.
# The former `temp.applymap(hot_encode)` line was removed: `hot_encode` is not
# defined anywhere in this notebook (NameError on a fresh kernel) and its result
# was never used.
one_hot = pd.get_dummies(temp, drop_first=False)
# preview
one_hot.head()

In [None]:
# swap the original text column for its indicator (dummy) columns
without_answer = sorting_data.drop(columns = 'recommended_review')
acquire_recomended_review = pd.concat([without_answer, one_hot], axis =1)
# preview cleaning data
acquire_recomended_review.head(9)

> Aggregate the sum of the recommended boolean values and calculate a positive recommended sentiment


In [None]:
# Per product, total the "No" and the "yes" indicator columns separately
per_product = acquire_recomended_review.groupby('name')
grouped_not_recommended = per_product['recommended_review_No'].sum().to_frame().sort_values('name', ascending = True)
grouped_recommended = per_product['recommended_review_yes'].sum().to_frame().sort_values('name', ascending = True)
# put both totals side by side in one frame, ordered by product name
summed_recommended = pd.concat(
    [grouped_not_recommended, grouped_recommended.recommended_review_yes],
    axis=1,
).sort_values('name', ascending = True)

# preview new data
summed_recommended.head()

In [None]:
# Share of positive recommendations per product: yes / (no + yes).
# Computed column-wise and by column label. The previous row loop indexed each
# row by integer position (row[0], row[1]), which is deprecated for a Series
# with string labels and breaks silently if the column order changes.
no_votes = summed_recommended['recommended_review_No']
yes_votes = summed_recommended['recommended_review_yes']
summed_recommended['recommended_sentiment'] = yes_votes / (no_votes + yes_votes)
# preview
summed_recommended.head()

Aggregate text columns by name from the main data source

In [None]:
# Combine every review of a product into one document, separated by spaces.
# Summing the strings glued the last word of one review onto the first word of
# the next, and the later ' '.join ran over a one-element row so it added no
# separator. Missing review texts are skipped.
aggregated_reviews = (
    df.groupby('name')['review_description']
      .agg(lambda reviews: ' '.join(reviews.dropna().astype(str)))
      .to_frame('review')
)
# preview frame
aggregated_reviews.head()

Aggregate remaining columns by name average rating, review count and brand 

In [None]:
# One row per product (indexed by name) with its brand, average rating and
# review count.
# Assumes these three values are constant within a product (the previous code
# required exactly one distinct value per product) -- TODO confirm.
# The first non-null value per product is taken in a single groupby. This
# replaces a per-product scan of the whole frame, the cast of 1-element arrays
# to float/int, and the stripping of "['brand']" array reprs through
# str.replace("[", ..., regex=True), where "[" is not a valid regular
# expression and which also removed apostrophes inside brand names.
new_data = (
    review_data
    .groupby('name')[['brand', 'average_rating', 'reviews_count']]
    .first()
    .sort_index()
    .astype({'average_rating': float, 'reviews_count': int})
)
# preview new data
new_data.head()

> Concat datasets and save acquired data

In [None]:
# concat new_data to aggregated_reviews
acquired_data = pd.concat([aggregated_reviews, new_data], join = "inner", axis =1)
# concat recomended dataset
completed_acquisition = pd.concat([acquired_data, summed_recommended], join = "inner", axis =1)

# save acquired dataset
completed_acquisition.to_csv('/content/drive/MyDrive/Data files/GamestopFile/completed_acquisition.csv')

# preview data
completed_acquisition.shape

> save raw text corpus

In [None]:
# save raw text for future generating reviews
raw_text = completed_acquisition.reset_index()
raw_text

raw_text = raw_text[['name','review']].set_index('name')
raw_text.to_pickle('/content/drive/MyDrive/Data files/GamestopFile/rawText_corpus.pkl')
raw_text

# Visually analyse recommended sentiment relative to the number of reviews

In [None]:
# consider reviews with more then 100 review count
sufficient_reviews = completed_acquisition[completed_acquisition.reviews_count >= 100]
print("number of products with more then 100 reviews", len(sufficient_reviews))
# Sort new list by number of reviews
most_reviews = sufficient_reviews.sort_values("reviews_count", ascending = False)
pos_recomendation = sufficient_reviews.sort_values("recommended_sentiment", ascending = False)
sufficient_reviews.to_csv("/content/drive/MyDrive/Data files/GamestopFile/sufficient_reviews.csv")

In [None]:
print("top 5 positively recomended")
print(pos_recomendation["recommended_sentiment"].head())

In [None]:
print("top 5 most review count")
print(most_reviews["reviews_count"].head())

In [None]:
# plot a bar graph of the positive-recommendation share per product
rBar = pos_recomendation[["recommended_sentiment"]]
sns.barplot(x = rBar.index, y = rBar.recommended_sentiment)

labels = rBar.index.tolist()
plt.gcf().set_size_inches(15, 7)

# plot aesthetics
plt.title('Positive recommendation sentiment', fontsize = 20)
plt.xlabel('Most popular products', fontsize = 15)
plt.ylabel('Sentiment', fontsize = 15)

# rotation must be a number (or 'vertical'/'horizontal'); the string '90' is
# rejected by current matplotlib
plt.xticks(ticks = range(len(rBar)), labels = labels, rotation = 90)
plt.show()

# Text Preperation

* Create cleaning functions: make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

* Apply text cleaning methods

* Select text data and pickle the text corpus

* Remove stop words and pickle stopwords




In [None]:
# # text cleaning techniques
# import re
# import string

# text cleaning 1
def clean_text_round1(text):
    '''First cleaning pass over one review string.

    Lowercases the text, deletes anything enclosed in square brackets,
    replaces every punctuation character with a space, and deletes every
    word that contains a digit. Returns the cleaned string.
    '''
    text = text.lower()
    # Raw strings: '\[' , '\w' and '\d' are invalid escape sequences in a normal
    # string literal and trigger a SyntaxWarning on recent Python versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# text cleaning 2
def clean_text_round2(text):
    '''Second cleaning pass: delete curly quotes, ellipsis characters and newlines.'''
    without_quotes = re.sub('[‘’“”…]', '', text)
    return re.sub('\n', '', without_quotes)

round1 = lambda x: clean_text_round1(x)
round2 = lambda x: clean_text_round2(x)

Apply text cleaning methods


In [None]:
# apply cleaning step 1
# Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.
clean_text = pd.DataFrame(most_reviews.review.apply(round1))
# apply cleaning step 2
# Get rid of some additional punctuation and non-sensical text that was missed the first time around.
clean_text = pd.DataFrame(clean_text.review.apply(round2))

select text data and save a corpus

In [None]:
# copy data for clean text column
text_cleaning = most_reviews.copy()
text_cleaning = text_cleaning.drop(columns = 'review')
text_cleaning['review'] = clean_text.review


# shift column to first position
shiftCol= text_cleaning.pop('review')
# insert column to the front
text_cleaning.insert(0, 'review', shiftCol)

In [None]:
# Save cleaned text corpus
text_cleaning['review'].to_pickle('/content/drive/MyDrive/Data files/GamestopFile/clean_text_corpus.pkl')

Remove stop words and save stopwords

In [None]:
# import nltk
# from nltk.tokenize import sent_tokenize, word_tokenize
# from nltk.corpus import stopwords
# nltk.download('stopwords')
text = text_cleaning['review'].to_frame()
stop = set(stopwords.words('english'))
# record stopwords; the context manager closes the file handle, which the
# previous bare open() never did
with open("/content/drive/MyDrive/Data files/GamestopFile/stopwords.pkl", "wb") as stop_file:
    pickle.dump(stop, stop_file)

# drop every whitespace-separated word that is an NLTK English stop word
text['review_without_stopwords'] = text_cleaning['review'].apply(
    lambda review: ' '.join(word for word in review.split() if word not in stop)
)

In [None]:
textClean = text.drop(columns = ('review')).rename(columns = {'review_without_stopwords':'review'})
textClean.to_pickle("/content/drive/MyDrive/Data files/GamestopFile/clean_text_no_stop_corpus.pkl")

# Product Review Sentiment
> Vectorizer

> Format vectorized words into a document-term matrix according to the product names and word occurrences.

> Transpose the list of words to the index.

> Stem the transposed list and replace the index with the new stemmed words.

> Filter more words according to repetition for word cloud visualisation.

> Save clean data as csv

> Evaluate sentiment analysis on product titles.

> Plot a word cloud as a visual representation of significant words in product reviews




Document term matrix

In [None]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

# drop stop words
cv = CountVectorizer(stop_words='english')
dataCV = cv.fit_transform(textClean.review)

# get_feature_names() was removed from scikit-learn; use its replacement when
# available and fall back for old installations
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# one row per product, one column per vocabulary word
dataDTM = pd.DataFrame(dataCV.toarray(), columns=feature_names, index=textClean.index)

# NOTE(review): this stores the fitted vectorizer in stopwords.pkl, the same
# file name the NLTK stop-word set is written to in the stop-word removal step,
# so that set is overwritten -- confirm this is intended.
with open("/content/drive/MyDrive/Data files/GamestopFile/stopwords.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)

In [None]:
# transpose text 
data = dataDTM.transpose()
print(" number of words", data.shape[0])
# preview transposed dataset
data.head()

# Preview and choose a Stemming and lemmatizing method


Textblob stemming and lemmatizing with pos tags

In [None]:
# # Lemmatize with POS Tag
# from nltk.corpus import wordnet
# from nltk.stem.porter import PorterStemmer
# from nltk.stem import 	WordNetLemmatizer
# # nltk.download('wordnet')

# porter_stemmer  = PorterStemmer()
# wordnet_lemmatizer = WordNetLemmatizer()

# def get_wordnet_pos(word):
#     """Map POS tag to first character lemmatize() accepts"""
#     tag = nltk.pos_tag([word])[0][1][0].upper()
#     tag_dict = {"J": wordnet.ADJ,
#                 "N": wordnet.NOUN,
#                 "V": wordnet.VERB,
#                 "R": wordnet.ADV}

#     return tag_dict.get(tag, wordnet.NOUN)


# # 1. Init Lemmatizer
# lemmatizer = WordNetLemmatizer()

# # 2. Lemmatize Single Word with the appropriate POS tag
# words = [] 
# for word in data.index:
#   words.append(lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(porter_stemmer.stem(word)))

# # replace the data index with proccesed list of words
# data.index = words 
# print("the number of words in main data is ", len(data.index), "the number of unique words after processing is ",  len(pd.DataFrame(words).drop_duplicates()))

Does well, but POS tags are a further step forward: 7108 unique words

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import 	WordNetLemmatizer
nltk.download('wordnet')

porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

# stem every vocabulary word, lemmatize the stem, and use the results as the new index
lList = [wordnet_lemmatizer.lemmatize(porter_stemmer.stem(term)) for term in data.index]
data.index = lList

spaCy takes the longest to process: 7108 unique words

In [None]:
# import spacy

# # Initialize spacy 'en' model, keeping only tagger component needed for lemmatization
# nlp = spacy.load('en', disable=['parser', 'ner'])

# words3 = []
# for w in data.index:
#   doc = nlp(porter_stemmer.stem(w))
#   words3.append([token.lemma_ for token in doc])


# len(pd.DataFrame(words2).drop_duplicates())

# Preview of commonly used words accross products
* gather new list of the most commonly used words across products

In [None]:
def _top_words(counts, limit=30):
    '''Return the `limit` highest entries of a word-count Series as (word, count) pairs.'''
    ranked = counts.sort_values(ascending=False).head(limit)
    return list(zip(ranked.index, ranked.values))

# For every column of the term matrix, keep its 30 most frequent words
top_dict = {column: _top_words(data[column]) for column in data.columns}

In [None]:
# Print the top 15 words said by each review
for review, top_words in top_dict.items():
    print(review)
    # [:15] keeps fifteen words; the previous [0:14] slice printed only fourteen
    print(', '.join(word for word, count in top_words[:15]))
    print('---')

**NOTE:** At this point, we could go on and create word clouds. However, by looking at these top words, you can see that some of them have very little meaning and could be added to a stop words list, so we collect those words next.





In [None]:
# Look at the most common top words --> add them to the stop word list
from collections import Counter

# Let's first pull out the top 30 words for each review
words = []
for review in data.columns:
    top = [word for (word, count) in top_dict[review]]
    for t in top:
        words.append(t)   
# aggregate the list and identify the most common words by how many times they occur
Counter(words).most_common()

In [None]:
# If more than X occurences, exclude it from the list
add_stop_words = [word for word, count in Counter(words).most_common() if count > 15]
add_stop_words

## Update dataset with combined new stop words

In [None]:
# Let's update our document-term matrix with the new list of stop words
# (`text` is re-imported here on purpose: an earlier cell rebinds the name
# `text` to a DataFrame)
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

# Read in cleaned data
data_clean = pd.read_pickle('/content/drive/MyDrive/Data files/GamestopFile/clean_text_corpus.pkl').to_frame()
# Built-in English stop words plus the frequent words in add_stop_words
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate document-term matrix.
# The vectorizer gets a list: current scikit-learn validates `stop_words` and
# rejects the frozenset returned by union().
cv = CountVectorizer(stop_words=list(stop_words))
data_cv = cv.fit_transform(data_clean.review)
# get_feature_names() was removed from scikit-learn; fall back only on old versions
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
data_stop = pd.DataFrame(data_cv.toarray(), columns=feature_names, index=data_clean.index)

# Pickle it for later use; the context manager closes the file handle
with open("/content/drive/MyDrive/Data files/GamestopFile/cv_stop.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)
data_stop.to_pickle("/content/drive/MyDrive/Data files/GamestopFile/nameDTM.pkl")

data_stop.head()

# Add sentiment and polarity columns

In [None]:
pol = lambda x: TextBlob(x).sentiment.polarity
sub = lambda x: TextBlob(x).sentiment.subjectivity

data_clean['polarity'] = data_clean['review'].apply(pol)
data_clean['subjectivity'] = data_clean['review'].apply(sub)
data_clean.head()

# Sentiment max and min

In [None]:
print('Most Positive product', data_clean.polarity.idxmax(), "=", data_clean.polarity.max())
print('Most negative product', data_clean.polarity.idxmin(), "=", data_clean.polarity.min() )

In [None]:
observePolarity = data_clean.reset_index()
observePolarity[["name","polarity"]].sort_values("polarity", ascending = False).head(10)

In [None]:
# Let's plot the results
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = [10, 10]

for index, product in enumerate(data_clean.index):
    x = data_clean.polarity.loc[product]
    y = data_clean.subjectivity.loc[product]
    plt.scatter(x, y, color='blue')
    plt.text(x+.001, y+.001, index, fontsize=10)
    # plt.xlim(-.01, .12) 
    
plt.title('Sentiment Analysis', fontsize=20)
plt.xlabel('<-- Negative -------- Positive -->', fontsize=15)
plt.ylabel('<-- Facts -------- Opinions -->', fontsize=15)

plt.show()

# wordcloud preview

In [None]:
# word clouds!
# Terminal / Anaconda Prompt: conda install -c conda-forge wordcloud
from wordcloud import WordCloud

wc = WordCloud(stopwords=stop_words, background_color="white", colormap="Dark2",
               max_font_size=150, random_state=42)

In [None]:
n = 23
# `n` is a row position. The Series is indexed by product name, so select with
# .iloc instead of relying on the deprecated integer-key fallback of review[n].
wc.generate(data_clean.review.iloc[n])

plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title(data_clean.index[n])
plt.show()

In [None]:
data = pd.read_csv('/content/drive/MyDrive/Data files/GamestopFile/sufficient_reviews.csv')

# Attach the text sentiment scores by product name. Matching on the key replaces
# sorting both frames and copying the columns over as plain lists, which pairs
# rows purely by position and mislabels every product if the two frames ever
# differ in content.
sentiment_scores = (
    data_clean.reset_index()[['name', 'polarity', 'subjectivity']]
    .rename(columns={'polarity': 'review_polarity',
                     'subjectivity': 'review_subjectivity'})
)
data = (
    data.merge(sentiment_scores, on='name', how='left', validate='one_to_one')
        .sort_values('name', ascending = False)
)
data.to_csv("/content/drive/MyDrive/Data files/GamestopFile/final.csv")

Plot a bar chart comparing the sentiment columns. Recommended sentiment was halved for visual analysis.


In [None]:
data.columns

In [None]:
data = data.sort_values('review_polarity', ascending = False)

In [None]:
import numpy as np 
import matplotlib.pyplot as plt 

X = data.name
Y = data.review_polarity
Z = data.recommended_sentiment/2
  
X_axis = np.arange(len(X))
  
plt.bar(X_axis - 0.2, Y, 0.4, label = 'review_polarity')
plt.bar(X_axis + 0.2, Z, 0.4, label = 'recommended_sentiment')
  
plt.xticks(X_axis, X, rotation= 90)
plt.xlabel("Groups")
plt.ylabel("Sentiment")
plt.title("Product")
plt.legend()
plt.show()

# Summary of product sentiment analisys

> Because the number of words differs so much between product reviews, the results varied, with a wider spread relative to the word count.
* Products with fewer than 100 reviews were removed during data acquisition
* Non-game products whose functionality meets the recommended requirements therefore stand out in the visual analysis
* The focus of the sentiment analysis is to find positive or negative features in games

> Also, some strong sentiment words were removed as stop words after lemmatizing and dumping into the stop-word file; stop words should be removed and saved before lemmatizing.



# Topic Modeling

> An attempt at analyzing the topics of one negative product review, which in this case is the Samsung 49-in Super Ultra-Wide Dual QHD

In [None]:
# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Let's create a function to pull out nouns from a string of text
def nouns_adj(text):
    '''Tokenize `text` and return its nouns and adjectives joined by single spaces.

    A token is kept when its POS tag starts with 'NN' or 'JJ'.
    '''
    kept_words = []
    for word, tag in pos_tag(word_tokenize(text)):
        if tag[:2] in ('NN', 'JJ'):
            kept_words.append(word)
    return ' '.join(kept_words)

def nouns(text):
    '''Tokenize `text` and return its nouns joined by single spaces.

    A token is kept when its POS tag starts with 'NN'.
    '''
    tagged_tokens = pos_tag(word_tokenize(text))
    return ' '.join(word for word, tag in tagged_tokens if tag[:2] == 'NN')

def _print_lda_topics(label, documents, stop_words, num_topics, passes, random_state=None):
    '''Vectorize `documents`, fit an LDA model on the counts, then print `label` and each topic.'''
    cv = CountVectorizer(stop_words=stop_words)
    counts = cv.fit_transform(documents)
    # Sparse2Corpus expects one column per document, hence the transpose of the
    # (documents x terms) count matrix
    corpus = matutils.Sparse2Corpus(counts.transpose())
    # gensim needs the mapping from column position to term
    id2word = {position: term for term, position in cv.vocabulary_.items()}
    lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics,
                          passes=passes, random_state=random_state)
    print(label)
    for topic in lda.print_topics():
        print(topic)


def setup(product_data, num_topics, passes, random_state=None):
    '''Print LDA topics for three views of the reviews: raw, nouns only, nouns and adjectives.

    Parameters
    ----------
    product_data : DataFrame with a `review` column of text.
    num_topics : number of topics passed to gensim's LdaModel.
    passes : number of training passes passed to gensim's LdaModel.
    random_state : optional seed forwarded to LdaModel so a run can be
        reproduced; None (the default) keeps the previous unseeded behaviour.

    Differences from the previous implementation:
    * the "Nouns and adjectives" view is built from the original reviews; it
      used to be built from the already nouns-only text, so it could never
      contain an adjective;
    * the fitted vectorizer is used directly instead of being pickled to and
      re-read from stopwords.pkl, which overwrote that file on every pass;
    * the extended stop words are passed as a list, because current
      scikit-learn rejects the frozenset returned by union();
    * the document-term matrix stays sparse (no get_feature_names() call,
      which no longer exists in scikit-learn).
    '''
    add_stop_words = ['play', 'game', 'great', 'love', 'like', 'good', 'really', 'fun']
    extended_stop_words = list(text.ENGLISH_STOP_WORDS.union(add_stop_words))

    reviews = product_data.review
    views = [
        ("raw", reviews, 'english'),
        ("Nouns", reviews.apply(nouns), extended_stop_words),
        ("Nouns and adjectives", reviews.apply(nouns_adj), extended_stop_words),
    ]
    for label, documents, stop_words in views:
        _print_lda_topics(label, documents, stop_words, num_topics, passes,
                          random_state=random_state)
  

I picked out 4 games to observe review topics

In [None]:
ndata= pd.read_pickle("/content/drive/MyDrive/Data files/GamestopFile/rawText_corpus.pkl")
MK = pd.DataFrame([["MortalKombat",ndata.review.loc["Mortal Kombat Vs. DC Universe - PlayStation 3"]]],columns = ("name","review"))
Zelda = pd.DataFrame([["Zelda",ndata.review.loc["The Legend of Zelda: A Link Between Worlds - Nintendo 3DS"]]],columns = ("name","review"))
Forza = pd.DataFrame([["Forza",ndata.review.loc["Forza Horizon 4 - Xbox One"]]],columns = ("name","review"))
Yoshi = pd.DataFrame([["Yoshi",ndata.review.loc["Yoshi's Crafted World - Nintendo Switch"]]],columns = ("name","review")).set_index("name")
Thief = pd.DataFrame([["Thief",ndata.review.loc["Thief - Xbox One"]]],columns = ("name","review")).set_index("name")

raw topics looks best
* topic 0: Fun and easy
* topic 1: slightly challenging levels 

In [None]:
#explore num topics
setup(Yoshi, 2, 200)

nouns
* topic 0: Zelda fans
* topic 1: Game design 

In [None]:
#explore num topics
setup(Zelda, 2, 200)

I suppose just not enough reviews

In [None]:
#explore num topics
setup(Forza, 1, 200)

2 topics doesnt look good


In [None]:
#explore num topics
setup(MK, 2, 200)

In [None]:
#explore num topics
setup(Thief, 2, 20)

# Text Generation: generate a review according to this product....

In [None]:
# Load the saved raw-text corpus (one `review` document per product name) and
# read from the frame loaded here, rather than from `ndata`, a variable left
# over from the topic-modeling section.
data = pd.read_pickle('/content/drive/MyDrive/Data files/GamestopFile/rawText_corpus.pkl')
gMK = data.review.loc["Mortal Kombat Vs. DC Universe - PlayStation 3"]
gZelda = data.review.loc["The Legend of Zelda: A Link Between Worlds - Nintendo 3DS"]
gForza = data.review.loc["Forza Horizon 4 - Xbox One"]
gYoshi = data.review.loc["Yoshi's Crafted World - Nintendo Switch"]

## Build a Markov Chain Function

We are going to build a simple Markov chain function that creates a dictionary:

> The keys should be all of the words in the corpus

> The values should be a list of the words that follow the keys

In [None]:
def markov_chain(text):
    '''Build a first-order Markov chain from a string of text.

    Returns a plain dict mapping each word to the list of words that
    directly follow it in `text`, in order of appearance and with
    repetitions kept (so frequent successors are drawn more often).
    The last word only appears as a key if it also occurs earlier.
    '''
    # Split on any run of whitespace. split(' ') produced empty-string "words"
    # for consecutive spaces and kept words joined across newlines.
    # Punctuation stays attached to the words.
    words = text.split()

    m_dict = defaultdict(list)
    # Pair every word with its successor
    for current_word, next_word in zip(words, words[1:]):
        m_dict[current_word].append(next_word)

    # Hand back a regular dict so a missing key raises instead of being created
    return dict(m_dict)

In [None]:
# Build one Markov-chain dictionary per selected product
YoshiDict, ZeldaDict, ForzaDict, MKDict = (
    markov_chain(corpus) for corpus in (gYoshi, gZelda, gForza, gMK)
)

## Create a Text Generator

We're going to create a function that generates sentences. It will take two things as inputs:
* The dictionary you just created
* The number of words you want generated

Here are some examples of generated sentences:

>'Shape right turn– I also takes so that she’s got women all know that snail-trail.'

>'Optimum level of early retirement, and be sure all the following Tuesday… because it’s too.'

In [None]:
def generate_sentence(chain, count=20):
    '''Generate a pseudo-sentence by a random walk over a Markov chain.

    Parameters
    ----------
    chain : dict mapping a word to the list of words that may follow it.
    count : number of words in the generated sentence (default 20).

    Returns the words joined by spaces, with the first word capitalized and
    a period appended. Raises IndexError if `chain` is empty.
    '''
    start_words = list(chain.keys())

    # Capitalize the first word
    word1 = random.choice(start_words)
    sentence = word1.capitalize()

    # Repeatedly step from the current word to one of its recorded successors
    for _ in range(count-1):
        followers = chain.get(word1)
        if followers:
            word1 = random.choice(followers)
        else:
            # Dead end: the word has no recorded successor (e.g. the final word
            # of the source text). Restart from a random key instead of raising
            # KeyError.
            word1 = random.choice(start_words)
        sentence += ' ' + word1

    # End it with a period
    sentence += '.'
    return sentence

In [None]:
generate_sentence(ForzaDict)

All the punctuation is removed at the cost of cleaner Markov pairing, but a new cleaning stage may prove useful.