In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from langdetect import detect
import morfeusz2
import unidecode

In [19]:
df = pd.read_excel("game_reviews.xlsx", index_col=0)

In [20]:
df

Unnamed: 0,review,voted_up
0,all,True
1,Battle Chasers: League of Legends 2.0,True
2,Ukończyłem już ponad połowę gry i jestem bardz...,True
3,Amazing. One of the best games I've played thi...,True
4,<3,True
...,...,...
74928,Spoko (=,True
74929,nice i alpha\n,True
74930,narazie bez błedów.\ngrafika obłedna\ngra sie ...,True
74931,8/10,True


In [10]:
class PreprocessingSteamReviews():
    def __init__(self, df_reviews):
        self.df_reviews = df_reviews
        
        self.remove_reviews_under_99_chars(99)
        self.df_reviews['review'] = self.df_reviews['review'].apply(self.remove_newlines_tabs)
        self.df_reviews['review'] = self.df_reviews['review'].apply(self.strip_html_tags)
        self.df_reviews['review'] = self.df_reviews['review'].apply(self.remove_whitespace)
        self.df_reviews['review'] = self.df_reviews['review'].apply(self.remove_non_alphanumeric_chracters)
        self.df_reviews['review'] = self.df_reviews['review'].apply(self.remove_links)
        self.remove_reviews_with_no_alphanumeric_items()
        self.remove_non_polish_reviews()
#         self.remove_reviews_under_99_chars(99)
        self.lowercase_all()
        self.tokenize_all()
        self.remove_reviews_under_n_words(20)
        self.df_reviews['review'] = self.df_reviews['review'].apply(self.remove_polish_stopwords)
        
        self.morf = morfeusz2.Morfeusz()
        self.df_reviews['review'] = self.df_reviews['review'].apply(self.lemmatisation)
        
        self.df_reviews['review'] = self.df_reviews['review'].apply(" ".join)
#         self.df_reviews['review'] = self.df_reviews['review'].apply(self.accented_characters_removal)
        self.df_reviews.drop(['len'], inplace=True, axis=1)
        self.df_reviews.drop(['len2'], inplace=True, axis=1)
        
        
    def remove_reviews_under_99_chars(self, n):
        self.df_reviews['len'] = self.df_reviews['review'].str.len()
        self.df_reviews = self.df_reviews[self.df_reviews['len'] > n]
        
    def remove_reviews_under_n_words(self, n):
        self.df_reviews['len2'] = self.df_reviews['review'].str.len()
        self.df_reviews = self.df_reviews[self.df_reviews['len2'] > n]
    
    def remove_newlines_tabs(self, text):
        # Replacing all the occurrences of \n,\\n,\t,\\ with a space.
        Formatted_text = text.replace('\\n', ' ').replace('\n', ' ').replace('\t',' ').replace('\\', ' ')
        return Formatted_text

    def strip_html_tags(self, text):
        # Initiating BeautifulSoup object soup.
        soup = BeautifulSoup(text, "html.parser")
        # Get all the text other than html tags.
        stripped_text = soup.get_text(separator=" ")
        return stripped_text

    def remove_whitespace(self, text):
        pattern = re.compile(r'\s+') 
        Without_whitespace = re.sub(pattern, ' ', text)
        # There are some instances where there is no space after '?' & ')', 
        # So I am replacing these with one space so that It will not consider two words as one token.
        text = Without_whitespace.replace('?', ' ? ').replace(')', ') ')
        return text

    def remove_non_alphanumeric_chracters(self, text):
        regex = re.compile('[^a-zA-ZAaĄąBbCcĆćDdEeĘęFfGgHhIiJjKkLlŁłMmNnŃńOoÓóPpRrSsŚśTtUuWwYyZzŹźŻż ]')
        text = regex.sub('', text)
        return text
    
    def remove_reviews_with_no_alphanumeric_items(self):
        for row, data in self.df_reviews.T.iteritems():
            if not any(c.isalpha() for c in data['review']):
                self.df_reviews.drop([row], inplace=True)
                
    def remove_non_polish_reviews(self):
        for row, data in self.df_reviews.T.iteritems():
            if detect(data['review']) != 'pl':
                self.df_reviews.drop([row], inplace=True)
                
    def lowercase_all(self):
        self.df_reviews['review'] = self.df_reviews['review'].str.lower()
        
    def tokenize_all(self):
        self.df_reviews['review'] = self.df_reviews['review'].str.split()
    
    def remove_polish_stopwords(self, text):
        stopwords = []
        with open("polish.stopwords.txt", encoding = 'utf-8') as f:
            for line in f:
                stripped_line = line.strip()
                stopwords.append(stripped_line)
        words = [word for word in text if word.lower() not in stopwords]
        return words
    
    def lemmatisation(self, text):
        res = []
        for i in text:
            analysis = self.morf.analyse(i)
            x = analysis[0][2][1]
            x = x.split(':')[0].lower()
            res.append(x)
        return res
    
    def remove_links(self, text):
        remove_https = re.sub(r'http\S+', '', text)
        remove_com = re.sub(r"\ [A-Za-z]*\.com", " ", remove_https)
        remove_pl = re.sub(r"\ [A-Za-z]*\.pl", " ", remove_com)
        return remove_pl

In [11]:
pre = PreprocessingSteamReviews(df)
df_preprocessed = pre.df_reviews

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df_reviews['review'] = self.df_reviews['review'].apply(self.remove_newlines_tabs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df_reviews['review'] = self.df_reviews['review'].apply(self.strip_html_tags)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df_reviews['review'] = self.df_

In [12]:
df_preprocessed

Unnamed: 0,review,voted_up
2,ukończyć połowa gra bardzo zadowolić zakup poc...,True
12,gram league of legends lato miło koniec pograć...,True
16,witać wstęp powiedzieć nie fan league of legen...,True
19,super gierka zarówno fan lolo chcieć dowiedzie...,True
21,ciekawy rozgrywka gra nie dopracować pare bugo...,True
...,...,...
74911,witać grać wcześnie lost arka serwer ru opinia...,True
74920,gra wyglądać niesamowicie grafika muzyka bajka...,True
74922,gierka spoko dobry diablo iii sporo akcja sześ...,True
74926,halo polski scena growa gra pogrom yesieni pog...,True


In [13]:
df_preprocessed[df_preprocessed['voted_up'] == False]

Unnamed: 0,review,voted_up
53,polska nie polecać gra raczy obsługa amazon gr...,False
60,darmowy gra mało bug system kooperacja leż dan...,False
61,gra przyjemny człowiek chcieć przespacerować p...,False
66,gra wciągnąć godzina każdy aktualizacja tracić...,False
68,nie polecać gra główny konto zostać zbanować n...,False
...,...,...
74797,grać człowiek wojownik kapłan asmodeusa sam lo...,False
74803,bardzo dobry rpg zalecać wstrzymać zakup pewie...,False
74822,chcieć podkreślić pierwsza przygoda pathfinder...,False
74839,bugi dopracować gra nie dokończyć tytuł napraw...,False


In [14]:
df1 = df_preprocessed[df_preprocessed['voted_up'] == False]
len_f = len(df_preprocessed[df_preprocessed['voted_up'] == False])
print(len_f)

1833


In [15]:
df2 = df_preprocessed[df_preprocessed['voted_up'] == True][:len_f]

In [16]:
df_preprocessed = pd.concat([df1,df2])

In [17]:
df_preprocessed = df_preprocessed.sample(frac=1)

In [18]:
df_preprocessed.to_excel("game_reviews_preprocessed_.xlsx")