# MONOPOLY COMMENT TEXT ANALYSIS

# Packages and dataset import

In [None]:
#import libraries
import numpy as np
import pandas as pd
import statistics

#plot packages
import matplotlib.pyplot as plt

# for parsing XML
import requests
import xml.etree.ElementTree as ET   
import xmltodict, json
import pprint

#text mining package
import re
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
import spacy
nlp = spacy.load('en_core_web_sm')
from langdetect import detect
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.tokenize import word_tokenize

#import function used to create the dataframe
from ipynb.fs.full.Functions_dataset import extract_film_info
from ipynb.fs.full.Functions_dataset import extract_comment_rating
from ipynb.fs.full.Functions_dataset import clean_data
from ipynb.fs.full.Functions_dataset import return_dataset

In [None]:
# Load the comments dataset from its CSV export and preview the first rows.
dataset_path = r'C:\Users\ASUS\Desktop\DSE\2. Text mining and sentiment analysis\project\monopoly_set.csv'
df = pd.read_csv(dataset_path)
df.head()

# Text cleaning

In [None]:
# Remove emoticons from comments.
# The pattern is compiled once, when the cell runs, and reused by every call.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags
                            u"\U00002702-\U000027B0"
                            # NOTE: this last range is very wide (U+24C2..U+1F251) and
                            # therefore also matches many non-emoji characters that fall
                            # between those code points (e.g. CJK ideographs).
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)


def remove_emoji(text):
    """Return *text* with every run of characters matched by the emoji pattern deleted."""
    return _EMOJI_PATTERN.sub('', text)

In [None]:
# Remove non-English words from a comment (the filter works word by word,
# it does not discard whole comments).
words = set(nltk.corpus.words.words())


def remove_non_english_comment(comment):
    """Return *comment* rebuilt from the tokens that pass the English filter.

    The comment is split with ``nltk.wordpunct_tokenize``. A token is kept when
    it is not purely alphabetic (numbers, punctuation, mixed tokens) or when
    its lower-cased form is in ``words``. Kept tokens are joined with single
    spaces.
    """
    kept_tokens = []
    for token in nltk.wordpunct_tokenize(comment):
        if not token.isalpha() or token.lower() in words:
            kept_tokens.append(token)
    return " ".join(kept_tokens)

In [None]:
def comment_cleaning(column):
    """Clean a pandas Series of raw comment strings.

    Steps, applied in order to every element:

    1. delete every character that is neither a word character nor whitespace;
    2. delete emoji (``remove_emoji``);
    3. drop words that fail the English filter (``remove_non_english_comment``);
    4. lower-case the text and drop English stopwords.

    Parameters
    ----------
    column : pandas.Series
        Series of strings (no missing values).

    Returns
    -------
    pandas.Series
        Cleaned, lower-cased, space-separated comments.
    """
    # regex=True must be explicit: from pandas 2.0 the default is a literal
    # match, which would leave all punctuation in place. The raw string avoids
    # invalid-escape warnings for \w and \s.
    column = column.str.replace(r'[^\w\s]', '', regex=True)
    column = column.apply(remove_emoji)
    column = column.apply(remove_non_english_comment)

    # Set membership is O(1); the list is otherwise scanned for every word.
    stop = set(stopwords.words('english'))

    def drop_stopwords(text):
        # Lower-case BEFORE the stopword test: NLTK's list is lower-case, so
        # testing the raw token let capitalised stopwords ("The", "And") through.
        return " ".join(word for word in text.lower().split() if word not in stop)

    return column.apply(drop_stopwords)

In [None]:
# Build the cleaned 'comment' column used by the rest of the analysis.
# Rows without a comment must be dropped BEFORE the cast: astype(str) turns NaN
# into the literal string 'nan', so a dropna() after cleaning never removes them.
df = df.dropna(subset=['value']).copy()
df['value_txt'] = df['value'].astype(str)
df['comment'] = comment_cleaning(df['value_txt'])
df = df.drop('value', axis = 1)


In [None]:
#lemmatization
import nltk
#nltk.download('wordnet')

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Split *text* on whitespace and return the list of WordNet-lemmatized tokens."""
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

# Write the lemmas back into 'comment' as a space-separated string.
# The result used to be stored in a misspelled column ('commnent') that nothing
# read, so the lemmatization had no effect on the analysis. The value is kept
# as a string because the following cells join and split 'comment' as text.
df['comment'] = df['comment'].apply(lambda text: ' '.join(lemmatize_text(text)))

# Some text understanding and data visualization

In [None]:
# Word cloud: a quick look at the most used words in the cleaned comments.
from wordcloud import WordCloud

# Concatenate all cleaned comments into one comma-separated string.
long_string = ','.join(df['comment'].tolist())

# Configure the cloud: white background, at most 300 words.
wordcloud = WordCloud(
    background_color="white",
    max_words=300,
    contour_width=3,
    contour_color='steelblue',
)
# Build the cloud from the text, then render it as an image.
wordcloud.generate(long_string)
image = wordcloud.to_image()

#wordcloud.to_file('C:/Users/ASUS/Desktop/DSE/2. Text mining and sentiment analysis/project/plot/word_cloud.jpeg')


In [None]:
#frequencies of words
# Keep the first comment of every user. The chain returns a new frame, so the
# column added below is not written onto a slice of df (the in-place
# drop_duplicates on the slice triggered SettingWithCopyWarning).
df_hist = (
    df[["@username", "comment"]]
    .drop_duplicates(subset=["@username"], keep='first')
    .reset_index()
)

# Split the text into a list of words
df_hist['histogram'] = df_hist['comment'].str.split()

# Get the frequency of each word: one row per word, then count.
# This replaces a per-comment value_counts() that materialised a
# (comments x vocabulary) frame just to sum it.
word_counts = df_hist['histogram'].explode().value_counts()
# Get the 15 most frequent words (the variable name is kept for compatibility)
top_20_words = word_counts.nlargest(15)

# Plot the histogram
top_20_words.plot.bar(figsize=(10,8))

# Add labels and show the plot
plt.xlabel('Words', fontsize = 16)
plt.ylabel('Frequency', fontsize = 16)
plt.title('Frequency of 15 Most Common Words', fontsize = 18)
plt.xticks(rotation=25, fontsize=10)
plt.yticks(fontsize=10)

#plt.savefig('C:/Users/ASUS/Desktop/DSE/2. Text mining and sentiment analysis/project/plot/frequent_words.jpeg')

plt.show()



In [None]:
#frequencies of bigrams
# Keep the first comment of every user. Built as one chain so that the result
# is a new frame rather than a slice of df mutated in place.
df_hist_bi = (
    df[["@username", "comment"]]
    .drop_duplicates(subset=["@username"], keep='first')
    .reset_index()
)

# Tokenize the text in the column
df_hist_bi['tokens'] = df_hist_bi['comment'].apply(word_tokenize)

# Find the bigrams
finder = BigramCollocationFinder.from_documents(df_hist_bi['tokens'])

# Get the frequency of each bigram as (bigram tuple, count) pairs
bigram_counts = finder.ngram_fd.items()

# Convert to pandas dataframe, most frequent first
bigram_df = pd.DataFrame(bigram_counts, columns=['bigram', 'count']).sort_values(by='count',ascending=False)
# The 15 most frequent bigrams (the variable name is kept for compatibility)
top_20_bigrams = bigram_df.nlargest(15, 'count')

# Helper that turns an n-gram tuple into a printable label
def convert_to_string(x):
    """Return the items of *x* (an iterable of strings) joined by single spaces."""
    separator = ' '
    return separator.join(x)
# Turn every bigram tuple into a space-separated label for the x axis
top_20_bigrams['sentence'] = top_20_bigrams['bigram'].map(convert_to_string)

# Bar chart of the most frequent bigrams
plt.figure(figsize=(10, 6))

bigram_labels = top_20_bigrams['sentence']
bigram_frequencies = top_20_bigrams['count']
plt.bar(bigram_labels, bigram_frequencies, color='green')
plt.title('Frequency of 15 Most Common Bigrams', fontsize = 18)
plt.ylabel('Frequency', fontsize = 14)
plt.xticks(rotation=25, fontsize = 10)

#plt.savefig('C:/Users/ASUS/Desktop/DSE/2. Text mining and sentiment analysis/project/plot/frequent_bigrams.jpeg')

plt.show()

In [None]:
#trigrams
# Keep the first comment of every user. Built as one chain so that the result
# is a new frame rather than a slice of df mutated in place.
df_hist_tri = (
    df[["@username", "comment"]]
    .drop_duplicates(subset=["@username"], keep='first')
    .reset_index()
)

from nltk.collocations import TrigramAssocMeasures, TrigramCollocationFinder

# Tokenize the text in the column
df_hist_tri['tokens'] = df_hist_tri['comment'].apply(word_tokenize)

# Find the trigrams
finder = TrigramCollocationFinder.from_documents(df_hist_tri['tokens'])

# Get the frequency of each trigram as (trigram tuple, count) pairs
trigram_counts = finder.ngram_fd.items()

# Convert to pandas dataframe, most frequent first
trigram_df = pd.DataFrame(trigram_counts, columns=['trigram', 'count']).sort_values(by='count',ascending=False)

# The 15 most frequent trigrams (the variable name is kept for compatibility)
top_20_trigrams = trigram_df.nlargest(15, 'count')

# Helper that turns an n-gram tuple into a printable label
def convert_to_string(x):
    """Return the items of *x* (an iterable of strings) joined by single spaces."""
    separator = ' '
    return separator.join(x)
# Turn every trigram tuple into a space-separated label for the x axis
top_20_trigrams['sentence'] = top_20_trigrams['trigram'].map(convert_to_string)

# Bar chart of the most frequent trigrams
plt.figure(figsize=(10, 6))

trigram_labels = top_20_trigrams['sentence']
trigram_frequencies = top_20_trigrams['count']
plt.bar(trigram_labels, trigram_frequencies, color='red')
plt.title('Frequency of 15 Most Common Trigrams', fontsize = 18)
plt.xlabel('Trigrams',fontsize = 14)
plt.ylabel('Frequency',fontsize = 14)
plt.xticks(rotation=25, ha='right', fontsize = 10)
plt.yticks(fontsize = 11)

#plt.savefig('C:/Users/ASUS/Desktop/DSE/2. Text mining and sentiment analysis/project/plot/frequent_trigrams.jpeg')

plt.show()

In [None]:
# One rating per user (first occurrence). Built as one chain so that the columns
# added below are not written onto a slice of df.
df_rating = (
    df[["@username", "rating"]]
    .drop_duplicates(subset=["@username"], keep='first')
    .reset_index()
)

#plot distribution of stars

# astype(int) truncates fractional ratings toward zero and raises on missing values
df_rating["int_rating"] = df_rating["rating"].astype(int)
df_rating["rating_txt"] = df_rating['int_rating'].astype(str)

# Count on the integer column and sort by rating so the bars run in rating
# order; counting the text column ordered the bars by frequency instead.
value_counts = df_rating['int_rating'].value_counts().sort_index()

plt.figure(figsize=(10, 6))

value_counts.plot(kind='bar', color = "orange")

plt.xlabel('Rating',fontsize = 14)
plt.ylabel('Count',fontsize = 14)
plt.title('Histogram of Rating frequency', fontsize = 16)
plt.xticks(fontsize = 11)
plt.yticks(fontsize = 11)

#plt.savefig('C:/Users/ASUS/Desktop/DSE/2. Text mining and sentiment analysis/project/plot/rating_frequency.jpeg')

plt.show()


In [None]:
# Summary statistics of the integer ratings
df_rating["int_rating"].describe()

In [None]:
# Number of rows in df_rating
len(df_rating)

# Aspect-Based Sentiment Analysis

## Simple model: extract most frequent words and compute the mean of ratings of comments in which those words are present

In [None]:
#pd.options.display.max_rows = len(df)
# Keep only the columns used by the aspect-based analysis
analysis_columns = [
    "@username",
    "rating",
    "boardgame_title",
    "boardgamecategory",
    "boardgamemechanic",
    "comment",
]
df = df[analysis_columns]

In [None]:
# Keep only the first row of every user and renumber the rows from 0.
# The non-inplace form avoids mutating a slice of the original frame
# (SettingWithCopyWarning) and yields the same result.
df = df.drop_duplicates(subset=["@username"], keep='first').reset_index()

In [None]:
# Collect [adjectives, noun] pairs: for every token that is the nominal subject
# of its clause and is tagged NOUN or PROPN, keep the token together with the
# list of its direct children tagged ADJ (pairs without adjectives are skipped).
lista = []
# nlp.pipe processes the comments as a stream, which is faster than one nlp()
# call per row and does not rely on the frame having a 0..n-1 index.
for doc in nlp.pipe(df["comment"]):
    for token in doc:
        if token.dep_ == 'nsubj' and token.pos_ in ("NOUN", "PROPN"):
            adjectives = [child for child in token.children if child.pos_ == "ADJ"]
            if adjectives:
                lista.append([adjectives, token])

In [None]:
# Build the (adjectives, noun) frame in one step.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# call; constructing from lists works on every version and is linear.
# 'Adj' holds the list of adjective tokens; 'text_name' holds the noun as plain
# text (str() of the token), replacing the intermediate 'Noun' column.
view = pd.DataFrame({
    'Adj': [adjectives for adjectives, _ in lista],
    'text_name': [str(noun) for _, noun in lista],
})

In [None]:
# Select the most used subject words: they define the aspects users comment on.
# rename_axis names the index 'word' before reset_index, so the column is called
# 'word' on every pandas version (renaming an 'index' column is a no-op on
# pandas >= 2.0, where the column comes out as 'text_name').
freq = (
    view['text_name']
    .value_counts()
    .rename_axis('word')
    .reset_index(name='count')
)
freq = freq.head(15)

#list of the most frequent names
col_list = freq['word'].tolist()

In [None]:
# Back on the main frame: for every comment, list the frequent words it
# contains (in order of appearance, repetitions kept).
def _frequent_words_in(comment):
    return [token for token in comment.split() if token in col_list]

df['found_word'] = df['comment'].apply(_frequent_words_in)

df_analisi = df[["rating", "comment", "found_word"]]

df_analisi.head(15)

In [None]:
# One row per (comment, frequent word) pair, each carrying the comment's rating
df_analisi = df_analisi.explode("found_word")
# Despite its name, 'rating_int' holds the rating cast to float
df_analisi["rating_int"] = df_analisi["rating"].astype(float)
df_analisi = df_analisi.reset_index()

# Dictionary: frequent word -> list of the ratings of the comments containing it,
# restricted to the words in col_list
ratings_per_word = df_analisi.groupby("found_word")["rating_int"].apply(list)
result_dict = {
    word: ratings
    for word, ratings in ratings_per_word.to_dict().items()
    if word in col_list
}

In [None]:
# Box plot of the rating distribution for each frequent word.
# One list of ratings per word
grouped_data = df_analisi.groupby('found_word')['rating_int'].apply(list)

fig, ax = plt.subplots(figsize=(10,8))
bp = ax.boxplot(grouped_data.values, labels=grouped_data.index, patch_artist = True)

# Colour the parts of every box
part_colors = {'whiskers': 'blue', 'medians': 'red', 'boxes': 'lightblue'}
for part, color in part_colors.items():
    for artist in bp[part]:
        artist.set_color(color)

ax.set_title("Box Plot with Subject on the X-axis and Rating Distribution on Y-axis", fontsize = 16)
ax.set_xlabel("Subjects", fontsize = 14)
ax.set_ylabel("Rating", fontsize = 14)

# Rotate x-axis labels
plt.xticks(rotation=45, fontsize=10)
plt.yticks(fontsize=10)

#plt.savefig('C:/Users/ASUS/Desktop/DSE/2. Text mining and sentiment analysis/project/plot/Word_rating_distribution.jpeg')

plt.show()

# These plots do not give much information.

## Sentiment word approach: for each subject in the comments, extract its adjectives and compute their polarity. The count-weighted mean polarity of each subject is then used to understand the sentiment associated with that word.

In [None]:
# Data frame with one row per frequent word and the mean of its ratings
results = [
    [word, statistics.mean(ratings)]
    for word, ratings in result_dict.items()
]

df_rating = pd.DataFrame(results, columns=['word', 'Mean Rating'])

In [None]:
# Left join: put the frequency of every word next to its mean rating
result = freq.merge(df_rating, how='left', on='word')

# Sort the joined frame by mean rating (ascending)
by_rating = result.sort_values('Mean Rating')

In [None]:
# One row per (word, adjective) pair: attach the adjective lists to every
# frequent word, then explode the lists into separate rows
result1 = by_rating.merge(view, how='left', left_on='word', right_on='text_name')
result1 = result1.explode("Adj")

# Convert the adjective objects to plain text
result1["text_adj"] = result1['Adj'].astype(str)

In [None]:
# Copy of result1 sorted by ascending mean rating
result_ordered = result1.sort_values(by='Mean Rating')

In [None]:
# Count how many times every (noun, adjective) combination occurs
def _sort_by_count_desc(group):
    return group.sort_values('count', ascending=False)

pair_sizes = result1.pivot_table(index=['text_name', 'text_adj'], aggfunc='size')
df_solution = pair_sizes.reset_index(name='count')
# Within every noun, order the adjectives from most to least frequent
df_grouped = df_solution.groupby('text_name').apply(_sort_by_count_desc)
# rename(index=...) only affects index labels equal to 'text_name'
df_grouped_2 = df_grouped.rename(index={'text_name': 'index'})
# Final ordering: all pairs from most to least frequent
df_analisi_sent = df_grouped_2.sort_values(by='count', ascending=False)

In [None]:
from textblob import TextBlob

#Create a function to get the polarity
def getPolarity(text):
    """Return the TextBlob sentiment polarity of *text*."""
    return TextBlob(text).sentiment.polarity

# Polarity of every adjective, and the same value weighted by how many times
# the (noun, adjective) pair occurs
df_analisi_sent['polarity'] =  df_analisi_sent['text_adj'].apply(getPolarity)
df_analisi_sent['tot_polarity'] = df_analisi_sent['polarity'] * df_analisi_sent['count']

df_analisi_sent.reset_index(drop=True, inplace=True)
# Select the two columns with a list: selecting with a tuple
# (['count', 'tot_polarity'] without the inner brackets) raises on pandas >= 2.0.
df2 = df_analisi_sent.groupby('text_name')[['count', 'tot_polarity']].sum()

# Count-weighted mean polarity of every noun
df2['mean_polarity'] = df2['tot_polarity'] / df2['count']
print(df2)


# Same table with 'text_name' as a regular column
df3 = df2.reset_index()
print(df3)

In [None]:
# Bar chart of the mean polarity of every subject
df_plot_mean = df3[["text_name", "mean_polarity"]]

plt.figure(figsize=(10, 8))
x, y = df_plot_mean["text_name"], df_plot_mean["mean_polarity"]
plt.bar(x, y)

plt.title("Bar Plot of Mean Polarity by Subjects", fontsize = 16)
plt.xlabel("Subjects", fontsize=14)
plt.ylabel("Polarity", fontsize=14)

# Rotate x-axis labels
plt.xticks(rotation=25,  fontsize=12)
plt.yticks(fontsize=12)

#plt.savefig('C:/Users/ASUS/Desktop/DSE/2. Text mining and sentiment analysis/project/plot/mean_polarity_by_subject.jpeg')

plt.show()

In [None]:
# Box plot of the adjective polarities of every subject.
# One list of polarity values per subject
grouped_data_2 = df_analisi_sent.groupby('text_name')['polarity'].apply(list)

fig, ax = plt.subplots(figsize=(10,8))
bp = ax.boxplot(grouped_data_2.values, labels=grouped_data_2.index, patch_artist = True)

# Colour the parts of every box
part_colors = {'whiskers': 'blue', 'medians': 'red', 'boxes': 'lightblue'}
for part, color in part_colors.items():
    for artist in bp[part]:
        artist.set_color(color)

ax.set_title("Box Plot with Subjects on X-axis and Polarity on Y-axis", fontsize = 16)
ax.set_xlabel("Subjects", fontsize = 14)
ax.set_ylabel("Polarity", fontsize = 14)
# Rotate x-axis labels
plt.xticks(rotation=25, fontsize=12)
plt.yticks(fontsize=12)

#plt.savefig('C:/Users/ASUS/Desktop/DSE/2. Text mining and sentiment analysis/project/plot/polarity_distribution_by_subjects.jpeg')

plt.show()

## Find aspects for the most used words: use pretrained word vectors to embed the words, then cluster them to find possible aspects of the board game

In [None]:
from gensim.models import Word2Vec

In [None]:
# Pretrained word vectors
import gensim.downloader
# Load the 'glove-twitter-25' model through gensim's downloader API;
# the cells below treat every vector as 25-dimensional
glove_vectors = gensim.downloader.load('glove-twitter-25')

In [None]:
# Embedding matrix: one row per subject word, one column per vector dimension.
# The shape is derived from the data instead of a hard-coded (15, 25): with
# fewer than 15 subjects the fixed shape left all-zero rows that were clustered
# as if they were words and made the later label assignment fail on length.
# A word missing from the pretrained vocabulary still raises KeyError.
start = np.zeros((len(df3), glove_vectors.vector_size))
for row, word in enumerate(df3['text_name']):
    start[row, :] = glove_vectors[word]

In [None]:
#SEARCH TOPICS USING UNSUPERVISED LEARNING (K-MEANS)

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Try different values of k.
# silhouette_score needs between 2 and n_samples - 1 clusters, so the upper
# bound follows the number of embedded words (still 10 with 15 words).
max_k = min(10, len(start) - 1)
for k in range(2, max_k + 1):
    # Fixed seed (the same one used for the final model) so that the scores
    # are reproducible between runs.
    kmeans = KMeans(n_clusters=k, random_state=15)
    labels = kmeans.fit_predict(start)
    score = silhouette_score(start, labels)
    print(f'k = {k}, silhouette score = {score}')

In [None]:
# Final clustering: 6 clusters with a fixed random_state
import random
random.seed(106)
kmeans = KMeans(n_clusters=6, random_state = 15)

# Fit on the embedding matrix, then read back the cluster of every row
labels = kmeans.fit(start).predict(start)

In [None]:
# Attach the cluster id of every word, both as a number and as text
df3 = df3.assign(label=labels)
df3 = df3.assign(text_label=df3['label'].astype(str))
df3.head(15)

In [None]:
from sklearn.decomposition import PCA

# Project the word vectors onto their first two principal components
pca = PCA(n_components=2)
reduced = pca.fit_transform(start)

# First and second component become the plot coordinates of every word
x, y = reduced[:, 0], reduced[:, 1]

df3['x'] = x
df3['y'] = y

In [None]:
# Hand-written mapping from cluster id to aspect name
mapping = {
    0: 'technical_aspects',
    1: 'action_aspects',
    2: 'action_aspects',
    3: 'general_game_aspects',
    4: 'action_aspects',
    5: 'monopoly_aspects',
}

# Add the 'Aspect' column by translating every cluster id
aspect_names = df3['label'].map(mapping)
df3['Aspect'] = aspect_names
df3.head(15)

In [None]:
# Scatter plot of the words in the PCA plane, one colour/legend entry per aspect
plt.figure(figsize=(10, 6))
for aspect_name, aspect_rows in df3.groupby('Aspect'):
    plt.scatter(aspect_rows['x'], aspect_rows['y'], s=150, label=aspect_name)
plt.legend()

plt.title("Show the association words-aspect", fontsize = 16)
plt.xlabel("First PCA dimension", fontsize=12)
plt.ylabel("Second PCA dimension", fontsize=12)

#plt.savefig('C:/Users/ASUS/Desktop/DSE/2. Text mining and sentiment analysis/project/plot/PCA_words_repr.jpeg')

plt.show()

In [None]:
# Unweighted mean of the per-word mean polarity inside every aspect
mean_by_group = df3['mean_polarity'].groupby(df3['Aspect']).mean()

print(mean_by_group)

In [None]:
# Attach the aspect of every noun to the (noun, adjective) rows.
# Both frames have 'count' and 'tot_polarity' columns, so the merge suffixes the
# left-hand (per-pair) versions with '_x'; those are the ones kept here.
merged_pairs = df_analisi_sent.merge(df3, how='left', on='text_name')
aspect_columns = ["tot_polarity_x", "count_x", "Aspect"]
df_analisi_sent_aspect = merged_pairs[aspect_columns + ["polarity"]]
df_analisi_sent2 = merged_pairs[aspect_columns]
df_analisi_sent2.head()

In [None]:
# Total count and total weighted polarity of every aspect.
# The columns are selected with a list: selecting with a tuple
# (['count_x', 'tot_polarity_x'] without the inner brackets) raises on pandas >= 2.0.
df4 = df_analisi_sent2.groupby('Aspect')[['count_x', 'tot_polarity_x']].sum()

# Count-weighted mean polarity of every aspect
df4['mean_polarity_aspect'] = df4['tot_polarity_x'] / df4['count_x']
print(df4)

# Turn the 'Aspect' index back into a regular column
df5 = df4.reset_index()
print(df5)

In [None]:
# Box plot of the adjective polarities of every aspect.
# One list of polarity values per aspect
grouped_data_3 = df_analisi_sent_aspect.groupby('Aspect')['polarity'].apply(list)

fig, ax = plt.subplots(figsize=(10,8))
bx = ax.boxplot(grouped_data_3.values, labels=grouped_data_3.index, patch_artist = True)

# Colour the parts of every box
part_colors = {'boxes': 'lightblue', 'whiskers': 'blue', 'medians': 'red'}
for part, color in part_colors.items():
    for artist in bx[part]:
        artist.set_color(color)

ax.set_title("Box Plot with Aspects on X-axis and Polarity on Y-axis", fontsize = 16)
ax.set_xlabel("Aspects", fontsize = 14)
ax.set_ylabel("Polarity", fontsize = 14)
# Rotate x-axis labels
plt.xticks(rotation=75, fontsize = 10)
plt.yticks(fontsize = 10)

#plt.savefig('C:/Users/ASUS/Desktop/DSE/2. Text mining and sentiment analysis/project/plot/polarity distribution_by_aspect.jpeg')

plt.show()

In [None]:
# Bar chart of the count-weighted mean polarity of every aspect
df_plot_mean2 = df5[["Aspect", "mean_polarity_aspect"]]

plt.figure(figsize=(9, 7))
x, y = df_plot_mean2["Aspect"], df_plot_mean2["mean_polarity_aspect"]
plt.bar(x, y)

plt.title("Bar Plot of Mean Polarity by Aspects", fontsize = 16)
plt.xlabel("Aspects", fontsize=14)
plt.ylabel("Mean Polarity", fontsize=14)

# Rotate x-axis labels
plt.xticks(rotation=10,  fontsize=11)
plt.yticks(fontsize=11)

#plt.savefig('C:/Users/ASUS/Desktop/DSE/2. Text mining and sentiment analysis/project/plot/mean_polarity_by_aspect.jpeg')

plt.show()