In [3]:
from sqlalchemy import create_engine
from langdetect import detect
import pandas as pd


In [4]:
'''
1. Load the Steam review data into Pandas dataframe
'''

engine = create_engine('mysql://root:@localhost:3306/steam')
steam_data_query = """SELECT url AS reviewid, content, CAST(recommend AS SIGNED) AS recommend, hours_all, compensation
  FROM latest_review 
	WHERE content IS NOT NULL 
	AND content != ''
	AND content != ' ';
"""

df = pd.read_sql(steam_data_query, engine)


In [5]:
# We remove non-english reviews, as it could bias our results
sentences = df['content']

# Create a language column
df['lang'] = 'null'

# We only want English reviews
empty_reviews = []
non_english_reviews = []
for ind, sentence in sentences.items():
    
    if sentence == None:
        empty_reviews.append(df['reviewid'].iloc[ind])
        continue;
        
    if sentence == '':
        empty_reviews.append(df['reviewid'].iloc[ind])
        continue;
        
    if sentence == ' ':
        empty_reviews.append(df['reviewid'].iloc[ind])
        continue;
        
    try:
        if detect(sentence) == 'en':
            language = detect(sentence)
            df.at[ind, 'lang'] = language
            
        else:
            non_english_reviews.append(df['reviewid'].iloc[ind])
    except:
        non_english_reviews.append(df['reviewid'].iloc[ind])

        


In [14]:
data = df[df['lang'] == 'en']
print(data.head(5))

                                             reviewid  \
4   http://steamcommunity.com/id/--u/recommended/3...   
5   http://steamcommunity.com/id/-7656119804976061...   
7   http://steamcommunity.com/id/-Andrealphus-/rec...   
9   http://steamcommunity.com/id/-asymmetry/recomm...   
10  http://steamcommunity.com/id/-bigboy/recommend...   

                                              content  recommend  hours_all  \
4   It's subjectively a compilation of mediocre mi...          0        1.1   
5   This game made me lose hope,I played this game...          0      308.0   
7          101/101 out of doge coins from Tits Mcguee          1       77.4   
9   Extremely visually pleasing with an enticing s...          1        4.4   
10                  good game, but with a lot of bugs          1      265.9   

   compensation lang  
4       b'\x00'   en  
5       b'\x00'   en  
7       b'\x00'   en  
9       b'\x00'   en  
10      b'\x00'   en  


In [19]:
def convert_to_int(compensation):
    return int.from_bytes(compensation, 'big');

In [21]:
data2 = data.copy(deep = True)
data2['compensation'] = data2['compensation'].map(convert_to_int)
print(data2.head(20)['compensation'])

4     0
5     0
7     0
9     0
10    0
12    0
14    0
15    0
16    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    1
27    0
29    0
32    0
Name: compensation, dtype: int64


In [29]:
# print(data2.head(10)['content'])
big_text = ""
for sentence in data2['content'].values.tolist():
    big_text += sentence + " "
big_text = big_text.strip()
# big_text = [' '.join(sentence) for sentence in data2['content'].values.tolist()]
with open('big_review.txt', 'w') as f:
    f.write(big_text)

In [10]:
import enchant
import numpy as np
import csv
import math, collections
import pandas as pd
import re
import itertools
from nltk import tokenize
import nltk.data


class Sentence_Corrector :
    def __init__(self, training_file) :
        self.laplaceUnigramCounts = collections.defaultdict(lambda: 0)
        self.laplaceBigramCounts = collections.defaultdict(lambda: 0)
        self.total = 0
        self.sentences = []
        self.importantKeywords = set()
        self.d = enchant.Dict("en_US")
        self.tokenize_file(training_file)
        self.train()

    def tokenize_file(self, file) :
        # """
        #   Read the file, tokenize and build a list of sentences
        # """
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        f = open(file)
        content = f.read()
        for sentence in tokenizer.tokenize(content):
            sentence_clean = [i.lower() for i in re.split('[^a-zA-Z]+', sentence) if i]
            self.sentences.append(sentence_clean)


    def train(self):
        # """
        #   Train unigram and bigram
        # """
        for sentence in self.sentences:
            sentence.insert(0, '<s>')
            sentence.append('</s>')
            for i in range(len(sentence) - 1):
                token1 = sentence[i]
                token2 = sentence[i + 1]
                self.laplaceUnigramCounts[token1] += 1
                self.laplaceBigramCounts[(token1, token2)] += 1
                self.total += 1
            self.total += 1
            self.laplaceUnigramCounts[sentence[-1]] += 1


    def candidate_word(self, word):
        # """
        # Generate similar word for a given word
        # """
        suggests = []
        for candidate in self.importantKeywords:
            if candidate.startswith(word):
                suggests.append(candidate)
        suggests.append(word)

        if len(suggests) == 1:
            suggests = self.d.suggest(word)
            suggests = [suggest.lower() for suggest in suggests][:4]
            suggests.append(word)
            suggests = list(set(suggests))

        return suggests, len(suggests)

    def candidate_sentence(self, sentence):
        # """
        # Takes one sentence, and return all the possible sentences, and also return a dictionary of word : suggested number of words
        # """
        candidate_sentences = []
        words_count = {}
        for word in sentence:
            candidate_sentences.append(self.candidate_word(word)[0])
            words_count[word] = self.candidate_word(word)[1]

        candidate_sentences = list(itertools.product(*candidate_sentences))
        return candidate_sentences, words_count

    def correction_score(self, words_count, old_sentence, new_sentence) :
        # """
        #   Take a old sentence and a new sentence, for each words in the new sentence, if it's same as the orginal sentence, assign 0.95 prob
        #   If it's not same as original sentence, give 0.05 / (count(similarword) - 1)
        # """
        score = 1
        for i in range(len(new_sentence)) :
            if new_sentence[i] in words_count :
                score *= 0.90
            else :
                score *= (0.05 / (words_count[old_sentence[i]] - 1))
        return math.log(score)

    def score(self, sentence):
        # """
        #     Takes a list of strings as argument and returns the log-probability of the
        #     sentence using the stupid backoff language model.
        #     Use laplace smoothing to avoid new words with 0 probability
        # """
        score = 0.0
        for i in range(len(sentence) - 1):
            if self.laplaceBigramCounts[(sentence[i],sentence[i + 1])] > 0:
                score += math.log(self.laplaceBigramCounts[(sentence[i],sentence[i + 1])])
                score -= math.log(self.laplaceUnigramCounts[sentence[i]])
            else:
                score += (math.log(self.laplaceUnigramCounts[sentence[i + 1]] + 1) + math.log(0.4))
                score -= math.log(self.total + len(self.laplaceUnigramCounts))
        return score

    def return_best_sentence(self, old_sentence) :
        # """
        #   Generate all candiate sentences and
        #   Calculate the prob of each one and return the one with highest probability
        #   Probability involves two part 1. correct probability and 2. language model prob
        #   correct prob : p(c | w)
        #   language model prob : use stupid backoff algorithm
        # """
        bestScore = float('-inf')
        bestSentence = []
        old_sentence = [word.lower() for word in old_sentence.split()]
        sentences, word_count = self.candidate_sentence(old_sentence)
        for new_sentence in sentences:
            new_sentence = list(new_sentence)
            score = self.correction_score(word_count, new_sentence, old_sentence)
            new_sentence.insert(0, '<s>')
            new_sentence.append('</s>')
            score += self.score(new_sentence)
            if score >= bestScore:
                bestScore = score
                bestSentence = new_sentence
        bestSentence = ' '.join(bestSentence[1:-1])
        return bestSentence, bestScore

In [11]:
corrector = Sentence_Corrector('big.txt')

In [15]:
corrector.return_best_sentence('this is wron spallin word')
# corrector.return_best_sentence('aoccdrning to a resarch at cmabridge university')
# corrector.return_best_sentence('it does not mttaer in waht oredr the ltteers')
# corrector.return_best_sentence('the olny important tihng is taht')
# corrector.return_best_sentence('hell world')
# corrector.return_best_sentence('This used to belong to thew queen')

('this is wrong appalling world', -38.20485572828627)