# Sentiment Analysis of the Starcraft 2 New Patch

## Collecting reviews from the Blizzard forum

In [1]:
import requests
from bs4 import BeautifulSoup
import os

In [2]:
# Forum thread whose posts are treated as reviews.
URL = 'https://us.forums.blizzard.com/en/starcraft/t/new-ladder-season-incoming/2577'
# Local file holding a saved copy of the thread's HTML.
FILE_R_PATH = 'r.txt'

In [3]:
# Load the thread HTML from the local cache file when it exists; otherwise
# download it and write it to the cache so later runs skip the network.
if os.path.exists(FILE_R_PATH):
    with open(FILE_R_PATH, 'r', encoding='utf-8') as f:
        r_text = f.read()
else:
    r = requests.get(URL, timeout=30)
    print(r.status_code)
    # Stop here on an HTTP error so an error page is never cached or parsed.
    r.raise_for_status()
    r_text = r.text
    with open(FILE_R_PATH, 'w', encoding='utf-8') as f:
        f.write(r_text)

In [4]:
# Parse the raw HTML with Python's built-in parser.
soup = BeautifulSoup(markup=r_text, features='html.parser')

In [5]:
# Every element carrying the CSS class "post" (presumably one per forum post).
# find_all is the supported name; findAll is a deprecated alias.
divs = soup.find_all(class_='post')

In [6]:
# One review string per post: the text of its <p> tags joined by newlines.
# A post without any <p> tag yields an empty string.
reviews = [
    '\n'.join(p.text for p in div.find_all('p'))
    for div in divs
]

## Analyzing reviews

In [7]:
import pandas as pd
import numpy as np

In [8]:
# One row per review. Building the frame straight from the list avoids a
# NumPy round trip and keeps the column non-numeric even when `reviews` is empty.
df = pd.DataFrame({'review': reviews})

Let's see how many words we have.

In [9]:
# Number of whitespace-separated tokens in each review.
df['word_count'] = df['review'].apply(lambda review: len(review.split()))

How many characters do we have?

In [10]:
# Length of each review in characters (whitespace included).
df['char_count'] = df['review'].apply(len)

Cleaning our data: drop the reviews that contain no text.

In [11]:
# Remove, in place, the rows whose review has zero characters.
df.drop(index=df.index[df['char_count'] == 0], inplace=True)

What is the average word length?

In [12]:
def average_words(x):
    """Return the mean length, in characters, of the words in ``x``.

    Words are the whitespace-separated tokens of ``x``. When ``x`` contains
    no words the integer 0 is returned instead of dividing by zero.
    """
    tokens = x.split()
    if not tokens:
        return 0
    total_chars = 0
    for token in tokens:
        total_chars += len(token)
    return total_chars / len(tokens)

In [13]:
# Mean word length of each review, computed by average_words.
df['average_word_length'] = df['review'].apply(average_words)

In [14]:
from nltk.corpus import stopwords

In [16]:
import nltk
# Fetch NLTK's stop-word corpus into the local nltk_data directory.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/joseph/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [17]:
# English stop words from NLTK's stopwords corpus.
stop_words = stopwords.words('english')

In [19]:
# How many English stop words were loaded.
len(stop_words)

179

In [24]:
# Count how many tokens of each review are English stop words (case-insensitive).
# Tokens keep any attached punctuation, so e.g. "the," is not counted.
# A set gives constant-time lookups instead of scanning the list for every token.
stop_word_set = set(stop_words)
df['stopwords_count'] = df['review'].apply(
    lambda review: sum(word.lower() in stop_word_set for word in review.split())
)

In [27]:
# Share of each review's tokens that are stop words.
df['stopword_rate'] = df['stopwords_count'].div(df['word_count'])

In [29]:
# Show the reviews ordered from the lowest to the highest stop-word share.
df.sort_values('stopword_rate')

Unnamed: 0,review,word_count,char_count,average_word_length,stopwords_count,stopword_rate
16,ASL12 maps GOGOGO!!!,3,20,6.0,0,0.0
6,Cydra go Marry me!,4,19,3.75,0,0.0
14,Super genial. Desde Huancayo Perú estamos supe...,14,84,5.071429,1,0.071429
18,Nice. Very good. Saludos desde Argentina,6,40,5.833333,1,0.166667
4,very happy happy happy,4,22,4.75,1,0.25
7,What abour 2vs2 ranked??,4,24,5.25,1,0.25
3,Would be interested in chatting with someone r...,14,107,6.714286,4,0.285714
17,Beautiful news!! peruvian fans was expecting a...,12,73,5.083333,4,0.333333
12,Please use the ASL map pool. You can maybe add...,33,189,4.757576,11,0.333333
10,Big thanks!! A thought maybe more map in map p...,26,134,4.192308,9,0.346154


In [30]:
# Summary statistics for the numeric columns computed so far.
df.describe()

Unnamed: 0,word_count,char_count,average_word_length,stopwords_count,stopword_rate
count,19.0,19.0,19.0,19.0,19.0
mean,21.894737,121.263158,4.724242,8.421053,0.314848
std,31.867599,177.105067,0.809818,14.492587,0.157337
min,3.0,19.0,3.5,0.0,0.0
25%,5.0,29.5,4.221154,1.0,0.25
50%,11.0,61.0,4.625,4.0,0.346154
75%,27.0,144.0,5.077381,10.0,0.436508
max,142.0,796.0,6.714286,64.0,0.5


## Clean text based data for NLP

In [34]:
# Lower-case every token and collapse any run of whitespace to a single space.
df['lowercase'] = df['review'].apply(
    lambda review: ' '.join(map(str.lower, review.split()))
)

In [38]:
# Remove every character that is neither a word character nor whitespace.
# The pattern is a raw string because \w and \s are regex escapes, not valid
# Python string escapes; a plain literal triggers an invalid-escape warning.
df['punctuation'] = df['lowercase'].str.replace(r'[^\w\s]', '', regex=True)

In [45]:
# Repeats the import and stop-word assignment made earlier in the notebook,
# so this section can also run without those earlier cells.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [47]:
# Drop English stop words from the lower-cased, punctuation-free text.
# NOTE(review): punctuation is stripped before this step, so contractions
# arrive without their apostrophe (e.g. "dont"); presumably that is why such
# tokens survive this filter - confirm whether that is intended.
# A set gives constant-time lookups instead of scanning the list for every token.
stop_word_set = set(stop_words)
df['stopwords'] = df['punctuation'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_word_set)
)

In [54]:
# The thirty most frequent remaining words across all reviews.
(
    pd.Series(' '.join(df['stopwords']).split())
    .value_counts()
    .head(30)
)

map        7
pool       6
like       5
season     4
ladder     4
scr        4
team       3
happy      3
maps       3
thank      3
10         3
longer     2
maybe      2
would      2
thanks     2
use        2
asl        2
time       2
good       2
go         2
nice       2
big        2
someone    2
desde      2
working    2
saludos    2
please     2
dont       2
see        2
well       2
dtype: int64

In [68]:
# Extra, corpus-specific words to remove: game jargon, numbers, place names and
# non-English words that carry no sentiment.
# 'scr' replaces the original 'src', which matched nothing: the token that
# appears in the frequency table is 'scr'. The duplicated 'go' is listed once.
other_stop_words = [
    'scr', 'maps', '10', 'use', 'asl', 'go', 'desde', 'asl12', 'gogogo',
    'argentina', '2v2', 'cydra', '3', '2vs2', 'ranked', 'huancayo', 'perú',
    'estamos', 'map', 'pool', 'starcraft',
]

In [82]:
# Remove the corpus-specific stop words listed in other_stop_words.
df['cleaned_review'] = df['stopwords'].apply(
    lambda text: ' '.join(
        token for token in text.split() if token not in other_stop_words
    )
)

In [83]:
# Tokens left after cleaning, and the fraction of the original tokens they represent.
df['cleaned_review_word_count'] = df['cleaned_review'].apply(
    lambda text: len(text.split())
)
df['clean_rate'] = df['cleaned_review_word_count'].div(df['word_count'])

In [86]:
# Preview the first rows with every intermediate cleaning column.
df.head()

Unnamed: 0,review,word_count,char_count,average_word_length,stopwords_count,stopword_rate,lowercase,punctuation,stopwords,clean_review,cleanreview,cleaned_review,cleaned_review_word_count,clean_rate
0,"Greetings,\nLadder Season 10 is coming for Sta...",44,236,4.386364,16,0.363636,"greetings, ladder season 10 is coming for star...",greetings ladder season 10 is coming for starc...,greetings ladder season 10 coming starcraft re...,greetings ladder season coming starcraft remas...,greetings ladder season coming remastered next...,greetings ladder season coming remastered next...,22,0.5
2,Thank you guys for such a quick reaction! <3,9,44,4.0,4,0.444444,thank you guys for such a quick reaction! <3,thank you guys for such a quick reaction 3,thank guys quick reaction 3,thank guys quick reaction 3,thank guys quick reaction,thank guys quick reaction,4,0.444444
3,Would be interested in chatting with someone r...,14,107,6.714286,4,0.285714,would be interested in chatting with someone r...,would be interested in chatting with someone r...,would interested chatting someone regarding fu...,would interested chatting someone regarding fu...,would interested chatting someone regarding fu...,would interested chatting someone regarding fu...,8,0.571429
4,very happy happy happy,4,22,4.75,1,0.25,very happy happy happy,very happy happy happy,happy happy happy,happy happy happy,happy happy happy,happy happy happy,3,0.75
5,Thank you for doing this. It’s really importan...,11,61,4.636364,5,0.454545,thank you for doing this. it’s really importan...,thank you for doing this its really important ...,thank really important scene,thank really important scene,thank really important scene,thank really important scene,4,0.363636


## Lemmatize text

In [98]:
from textblob import Word
import nltk
# Fetch the WordNet corpora; presumably required by Word.lemmatize, which is
# used in the lemmatization step below - TODO confirm.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /home/joseph/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/joseph/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [96]:
# NOTE(review): 'wordnet' is also downloaded by the previous cell, and nothing in
# the visible notebook references 'wordnet_ic' - confirm whether this cell is needed.
nltk.download('wordnet')
nltk.download('wordnet_ic')

[nltk_data] Downloading package wordnet to /home/joseph/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet_ic to
[nltk_data]     /home/joseph/nltk_data...
[nltk_data]   Unzipping corpora/wordnet_ic.zip.


True

In [99]:
# Replace each remaining token with its lemma using TextBlob's Word.lemmatize.
df['lemmatized'] = df['cleaned_review'].apply(
    lambda text: ' '.join(Word(token).lemmatize() for token in text.split())
)