## Read first 10000 rows from json file

In [None]:
import json
import pandas as pd
def read_json_nrows(nrows,filename):
    """Load the first ``nrows`` records of a JSON-lines file into a DataFrame.

    Parameters
    ----------
    nrows : int
        Maximum number of lines to read.
    filename : str
        Path to a file that holds one JSON object per line.

    Returns
    -------
    pandas.DataFrame
        One row per record, indexed 0..k-1 in file order, where k is the
        smaller of ``nrows`` and the number of lines in the file.
    """
    records = []
    with open(filename) as f:
        # zip() stops as soon as either side runs out, so a file shorter
        # than ``nrows`` simply ends the loop instead of handing an empty
        # string to json.loads.
        for _, line in zip(range(nrows), f):
            records.append(json.loads(line.rstrip()))
    # Build the frame once: growing it row by row copies the whole frame on
    # every step, and DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records, index=range(len(records)))
    

In [None]:
train_10000 = read_json_nrows(10000,'review_train.json')

In [None]:
train_10000.head(5)

## Cleaning data

In [None]:
from collections import Counter
from nltk.util import ngrams
import re
import numpy as np 
from nltk.corpus import words
from nltk.corpus import brown
from nltk.corpus import wordnet
from nltk import wordpunct_tokenize

### Languages

When I first tried to apply the *detect* function I got a 'No features in text' error. So I have to find out which review is empty.

In [None]:
train_10000.loc[6687]

This review contains no words at all, only an emoticon.

In [None]:
def not_language(text):
    """Tell whether ``text`` has no word characters left once emoticons are removed.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    bool
        True when removing the common emoticons and then every non-word
        character leaves an empty string, False otherwise.
    """
    # Raw strings keep the regex escapes (\), \(, \W) away from Python's own
    # string-escape processing, which warns on unknown escapes.
    # First delete all common emoticons.
    without_emoticons = re.sub(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', '', text)
    return re.sub(r'[\W]+', '', without_emoticons) == ''

For simplicity, just consider emoticons as English.

In [None]:
not_lang = train_10000[train_10000.text.apply(not_language)].index.values

In [None]:
# Emoticon-only reviews contain no words to vote with.  Label them with the
# code 'en' (the value the English filter further down compares against) and
# keep the stop-word vote below from overwriting that label.
train_10000.loc[not_lang,'lang_type'] = 'en'

from nltk.corpus import stopwords
from nltk import wordpunct_tokenize
# Load each language's stop-word list once instead of once per review.
stopword_sets = {language: set(stopwords.words(language))
                 for language in stopwords.fileids()}
emoticon_only = set(not_lang)
languages_ratios = {}
for i, review in train_10000.text.items():
    if i in emoticon_only:
        continue
    words_set = {token.lower() for token in wordpunct_tokenize(review)}
    # Score of a language = number of its stop words that occur in the review.
    languages_ratios = {language: len(words_set & stopword_set)
                        for language, stopword_set in stopword_sets.items()}
    most_rated_language = max(languages_ratios, key=languages_ratios.get)
    train_10000.loc[i,'lang_type'] = most_rated_language

In [None]:
from langdetect import detect
from langdetect import DetectorFactory
# langdetect is randomised internally; a fixed seed makes the labels
# reproducible between runs.
DetectorFactory.seed = 0
# Set membership is O(1); testing against the numpy array scans it each time.
emoticon_only = set(not_lang)
for i, review in train_10000.text.items():
    # Emoticon-only reviews keep the label assigned to them earlier: they
    # are the ones that produced the 'No features in text' error.
    if i not in emoticon_only:
        train_10000.loc[i,'lang_type'] = detect(review)

In [None]:
re.findall(':[\W]{0,1}',':(nmsl')

In [None]:
train_10000.lang_type.value_counts()

Focus on English only at present.

In [None]:
# Keep only the rows whose detected language label is 'en' (the code
# langdetect uses for English).
train_10000_eng = train_10000[train_10000.lang_type == 'en']

Most common words and phrases.

In [None]:
# 20 most frequent single tokens (whitespace-split) across the English reviews.
text = ' '.join(train_10000_eng.text)
text_trigrams = list(ngrams(text.split(), 1))
Counter(text_trigrams).most_common(20)

In [None]:
# 20 most frequent token pairs across the English reviews.
text = ' '.join(train_10000_eng.text)
text_trigrams = list(ngrams(text.split(), 2))
Counter(text_trigrams).most_common(20)

In [None]:
# 20 most frequent token triples across the English reviews.
text = ' '.join(train_10000_eng.text)
text_trigrams = list(ngrams(text.split(), 3))
Counter(text_trigrams).most_common(20)

In [None]:
# 20 most frequent 4-token sequences across the English reviews.
text = ' '.join(train_10000_eng.text)
text_trigrams = list(ngrams(text.split(), 4))
Counter(text_trigrams).most_common(20)

Oddest words

In [None]:
# The 500 least frequent single tokens (tail of the frequency ranking).
text = ' '.join(train_10000_eng.text)
text_trigrams = list(ngrams(text.split(), 1))
Counter(text_trigrams).most_common()[-500:]

Here I found two sentences in Chinese:
- (('不要在这里吃！',), 1),
- (('我们刚在这里吃午饭，在我们的汤里发现了一个蟑螂，我把它展示给服务员，她说：哦，对不起。你不必付饭费"，这意味着他们的厨房里有很多蟑螂，他们知道，她一点都不惊讶。',),  

Most of these rare words contain punctuation.

In [None]:
# The 20 least frequent token pairs (tail of the frequency ranking).
text = ' '.join(train_10000_eng.text)
text_trigrams = list(ngrams(text.split(), 2))
Counter(text_trigrams).most_common()[-20:]

### Check bad words

#### First emoticons

In [None]:
def find_emoticons(text):
    """Tell whether ``text`` contains at least one common emoticon.

    An emoticon here is ':', ';' or '=' followed by an optional '-' and one
    of ')', '(', 'D' or 'P'.

    Parameters
    ----------
    text : str
        Text to scan.

    Returns
    -------
    bool
        True if an emoticon occurs anywhere in ``text``.
    """
    # A raw string keeps \) and \( as regex escapes; search() stops at the
    # first hit instead of collecting every match just to test for emptiness.
    return re.search(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text) is not None

In [None]:
# Position (not index label) of the first English review containing an emoticon.
eg1 = np.flatnonzero(train_10000_eng.text.apply(find_emoticons).values)[0]

In [None]:
train_10000_eng.iloc[eg1].text

Here we can find one  :)  together with two '\n' 

In [None]:
re.sub('\\n','',train_10000_eng.iloc[eg1].text)

#### Check typos

In [None]:
# Most packages can't handle some words like 'nooooo'.
def check_same(word):
    """Locate every run of two or more identical consecutive characters.

    Parameters
    ----------
    word : str
        Word to inspect.

    Returns
    -------
    dict
        Maps each repeated character to a list of ``(start, end)`` index
        pairs (both inclusive), one pair per run, in left-to-right order.
        Characters that never repeat consecutively are absent.
    """
    from itertools import groupby

    intervals = {}
    start = 0
    # groupby() yields maximal runs of equal characters, so the first
    # character is never compared with the last one (word[-1]).
    for letter, run in groupby(word):
        length = sum(1 for _ in run)
        if length > 1:
            intervals.setdefault(letter, []).append((start, start + length - 1))
        start += length
    return intervals

In [None]:
# English words do not contain the same letter three times in a row, so
# every longer run is shortened to exactly two.
def no_more_than_2(word,dupli):
    """Shorten every run of three or more identical characters to two.

    Parameters
    ----------
    word : str
        Word to clean.
    dupli : dict
        Repeated characters of ``word`` as returned by ``check_same``; only
        the keys are used.

    Returns
    -------
    str
        ``word`` in which no key of ``dupli`` occurs more than twice in a row.
    """
    for key in dupli:
        # re.escape: characters such as '.' or '?' must match literally.
        # '{3,}' handles runs of any length in one pass, whatever order the
        # runs appear in.  The callable replacement avoids escape processing
        # of the replacement text.
        word = re.sub(re.escape(key) + '{3,}',
                      lambda _run, key=key: key * 2,
                      word)
    return word

In [None]:
from pattern.en import suggest
from itertools import combinations
from functools import lru_cache


@lru_cache(maxsize=1)
def _brown_vocabulary():
    """Return the tokens of the Brown corpus as a frozenset, built once."""
    return frozenset(brown.words())


def right_spelling(word,dupli):
    """Guess the intended word by collapsing doubled letters.

    Every non-empty combination of the repeated letters is tried, smallest
    combinations first; within a combination each chosen letter has its
    doubles ('xx') collapsed to a single 'x'.  The first candidate found in
    the Brown corpus wins.

    Parameters
    ----------
    word : str
        Word in which runs have already been cut down to two letters.
    dupli : dict
        Repeated characters as returned by ``check_same``; only the keys
        are used.

    Returns
    -------
    str
        The first candidate present in the Brown corpus, otherwise the top
        suggestion of ``pattern.en.suggest`` for ``word``.
    """
    key_list = list(dupli)
    # Set lookup instead of scanning brown.words() for every candidate.
    vocabulary = _brown_vocabulary()
    # Sizes 1..n: the empty combination changes nothing, and the full
    # combination (all letters collapsed) must be tried as well.
    for size in range(1, len(key_list) + 1):
        for comb in combinations(key_list, size):
            new_word = word
            for letter in comb:
                # Apply the collapses cumulatively so the candidate reflects
                # the whole combination, not only its last letter.
                new_word = re.sub(re.escape(letter) + '{2}',
                                  lambda _pair, letter=letter: letter,
                                  new_word)
            if new_word in vocabulary:
                return new_word
    return suggest(word)[0][0]

In [None]:
def no_typo(word):
    """Return a corrected spelling for ``word``.

    Strategy, in order:
    1. A word without any repeated character is returned unchanged.
    2. If ``suggest`` is fully confident (score 1) about ``word``, its top
       suggestion is returned.
    3. Runs of three or more identical letters are cut to two and
       ``suggest`` is asked again.
    4. Otherwise ``right_spelling`` tries collapsing the doubled letters.

    Parameters
    ----------
    word : str
        Possibly misspelled word.

    Returns
    -------
    str
        The corrected word.
    """
    if len(word) == len(set(word)):
        return word
    # Query the spell checker once and reuse the answer.
    top = suggest(word)[0]
    if top[1] == 1:
        return top[0]
    duplicates = check_same(word)
    two = no_more_than_2(word,duplicates)
    top_two = suggest(two)[0]
    if top_two[1] == 1:
        return top_two[0]
    return right_spelling(two,duplicates)

In [None]:
no_typo('finaaallly')

#### Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet

In [None]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each piece, in order."""
    return list(map(porter.stem, text.split()))

In [None]:
def get_wordnet_pos(tag):
    """Map a part-of-speech tag onto the matching WordNet POS constant.

    The decision depends on the first character of ``tag`` only:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV.  Any other tag yields None.
    """
    attribute_by_initial = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attribute = attribute_by_initial.get(tag[:1])
    if attribute is None:
        return None
    return getattr(wordnet, attribute)
    
wnl = WordNetLemmatizer()
def lemmatizer(text):
    """Tokenize ``text``, POS-tag the tokens and return their WordNet lemmas.

    Tokens whose tag has no WordNet counterpart are lemmatized as nouns.
    """
    return [
        wnl.lemmatize(token, pos=get_wordnet_pos(pos) or wordnet.NOUN)
        for token, pos in pos_tag(word_tokenize(text))
    ]

#### Stop-words

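I think words that express negation or contrast (such as "not" and "but") are important, so they are removed from the stop-word list instead of being filtered out.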

In [None]:
import nltk
# One entry per NLTK resource this notebook loads: the stop-word lists, the
# punkt tokenizer model, the POS tagger, WordNet (WordNetLemmatizer and
# wordnet.synsets) and the Brown corpus (brown.words()).
for resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger',
                 'wordnet', 'brown'):
    nltk.download(resource)

In [None]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
# 'but' and 'not' are taken out of the stop list so they survive filtering.
stop.remove('but')
stop.remove('not')
# Prepositions are taken out of the stop list as well.
preposition = ['of','with','at','from','into','during',
               'including','until','till','against','among',
               'throughout','despite','towards','upon','concerning','to','in',
               'for','on','by','about','like','through','over',
               'before','between','after','since','without','under',
               'within','along','following','across','behind',
               'beyond','plus','except','but','up','out','around','down','off','above','near']
for prep in preposition:
    # Many of these are not in the stop list (and 'but' is already gone).
    if prep in stop:
        stop.remove(prep)

#### Convert n't to not

In [None]:
def no_abbreviation(text):
    """Expand negative contractions so that "not" becomes a word of its own.

    "n't" is rewritten to " not" ("don't" -> "do not").  The contractions
    whose stem changes are handled first: "can't" -> "can not",
    "won't" -> "will not", "shan't" -> "shall not".  Both the straight (')
    and the typographic (U+2019) apostrophe are recognised.

    Parameters
    ----------
    text : str
        Text that may contain contractions.

    Returns
    -------
    str
        ``text`` with the contractions expanded.
    """
    irregular = {'can': 'can', 'won': 'will', 'shan': 'shall'}

    def expand_irregular(match):
        stem = match.group(1)
        base = irregular[stem.lower()]
        # Keep a leading capital ("Can't" -> "Can not").
        if stem[0].isupper():
            base = base.capitalize()
        return base + ' not'

    # Without this step the generic rule below would produce "ca not" and
    # "wo not".
    text = re.sub(r"\b(can|won|shan)['\u2019]t\b", expand_irregular, text,
                  flags=re.IGNORECASE)
    return re.sub(r"n['\u2019]t", ' not', text)

#### Adversatives

In [None]:
# Words that change_but() rewrites to 'but'.
but = ['yet','however','nonetheless','whereas','nevertheless']
# Words that change_although() rewrites to 'although'.
although = ['although','though','notwithstanding','albeit']

In [None]:
def change_but(text):
    """Replace every whole-word occurrence of an entry of ``but`` with 'but'.

    Parameters
    ----------
    text : str
        Text to normalise; matching is case-sensitive.

    Returns
    -------
    str
        The normalised text.
    """
    for x in but:
        # \b...\b restricts the match to whole words, so fragments of longer
        # words (e.g. the 'yet' in 'yeti') are left alone.
        text = re.sub(r'\b' + re.escape(x) + r'\b','but',text)
    return text
def change_although(text):
    """Replace every whole-word occurrence of an entry of ``although`` with 'although'.

    Parameters
    ----------
    text : str
        Text to normalise; matching is case-sensitive.

    Returns
    -------
    str
        The normalised text.
    """
    for x in although:
        # \b...\b restricts the match to whole words.  Without it 'though'
        # also matches inside 'although' (giving 'alalthough') and inside
        # 'thought'.
        text = re.sub(r'\b' + re.escape(x) + r'\b','although',text)
    return text
def change_adversatives(text):
    """Normalise adversatives: apply change_but() first, then change_although()."""
    return change_although(change_but(text))

Now I want to capture the key information near but and although.

# NOTE(review): unfinished draft -- the `if` on the last line has no colon
# and no body, so this definition does not parse as written.
def although_phrase(text):
    words = text.split()
    for (index,word) in enumerate(words):
        # NOTE(review): 'altough.' looks like a typo for 'although' -- confirm.
        if word == 'altough.':
            # Walks backwards over the (up to) ten words ending at `index`.
            for x in range(index,index-10,-1):
                # The regex pulls the one-letter POS field out of the first
                # synset's string form for words[x].
                if re.sub('(.*)\.([a-z])\..*','\\2',str(wordnet.synsets(words[x])[0])) in ['v','adj']

In [None]:
def preprocessing(text):
    """Normalise one review and extract its emoticons.

    Steps: collect emoticons, replace line breaks with spaces, expand
    "n't", lower-case and replace runs of non-word characters with a space,
    unify adversatives, lemmatize, drop stop words.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    dict
        ``{'text': str, 'emoticons': list}``.  'text' holds the kept lemmas,
        each followed by one space (so a non-empty result ends with a
        space); 'emoticons' holds the emoticons found in the raw text.
    """
    # Collect the emoticons before punctuation is stripped.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)
    # Replace line breaks with spaces.
    text = re.sub(r'\n',' ',text)
    # Expand "n't" to " not".
    text = no_abbreviation(text)
    # Lower-case, then turn every run of non-word characters into one space.
    text = re.sub(r'[\W]+',' ', text.lower())
    # Unify the adversatives.
    text = change_adversatives(text)
    # Lemmatize.
    tokens = lemmatizer(text)
    # Spell correction with no_typo() is currently not applied here.
    stop_words = set(stop)
    kept = [token for token in tokens if token not in stop_words]
    # Join in one pass; the space after every token (including the last one)
    # is part of the output format.
    return {'text':''.join(token + ' ' for token in kept),'emoticons':emoticons}

In [None]:
from tqdm import tqdm, tqdm_pandas
tqdm.pandas()
dictionary = train_10000_eng.text.progress_apply(preprocessing)

In [None]:
y = train_10000_eng.loc[dictionary.index]["stars"]

In [None]:
# Emoticon lists, one per English review, in the order of train_10000_eng.
emoticons = [entry['emoticons'] for entry in dictionary.loc[train_10000_eng.index]]

In [None]:
# Cleaned texts, one per English review, in the order of train_10000_eng.
texts = [entry['text'] for entry in dictionary.loc[train_10000_eng.index]]

In [None]:
texts

In [None]:
from autocorrect import spell

# Spell-correct every space-separated token of every review; the result is
# one token list per review.  Built directly, without a placeholder element
# that has to be sliced off afterwards.
new_texts = [[spell(j) for j in review.split(' ')] for review in tqdm(texts)]

In [None]:
# Turn each token list back into one space-separated string.
new_texts = [' '.join(tokens) for tokens in new_texts]

#### Bigrams for phrase

In [None]:
from gensim.models.phrases import Phrases, Phraser

In [None]:
sentence_stream = [sent.split(' ') for sent in new_texts]

In [None]:
# min_count: pairs seen fewer than this many times are ignored, so a larger
# value detects fewer phrases; a higher threshold also means fewer phrases.
bigram = Phraser(Phrases(sentence_stream, min_count=5, threshold=5))

In [None]:
test_num = 100

In [None]:
new_texts[test_num]

In [None]:
print(sentence_stream[test_num])

In [None]:
print(bigram[sentence_stream[test_num]])

In [None]:
len(sentence_stream)

In [None]:
sentence_with_phrase = bigram[sentence_stream]

In [None]:
# Re-join every review after merging its detected phrases into single tokens.
new_texts = [' '.join(bigram[sentence]) for sentence in sentence_stream]

In [None]:
new_texts[5]

#### Most Frequent Words

In [None]:
# Inspect the object itself; wrapping it in a list always reports `list`.
type(sentence_with_phrase)

In [None]:
from collections import defaultdict
# Occurrence count of every token (phrases included) over all reviews.
word_freq = defaultdict(int)
for token in (tok for sentence in sentence_with_phrase for tok in sentence):
    word_freq[token] += 1
# Vocabulary size.
len(word_freq)

In [None]:
sorted(word_freq, key=word_freq.get, reverse=True)[:100]

### word2vec

In [None]:
import multiprocessing
cores = multiprocessing.cpu_count()
cores


In [None]:
from gensim.models import Word2Vec
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 calls it
# `vector_size`); the notebook is written against the 3.x API.
w2v_model = Word2Vec(min_count=1,
                     window=2,
                     size=200,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     # cpu_count() may be 1; always keep one worker thread.
                     workers=max(1, cores-1),
                     seed = 123,
                     sg=0) # sg=0 (the default) selects CBOW; sg=1 selects skip-gram

In [None]:
import time

# time.clock() was removed in Python 3.8; perf_counter() is the
# high-resolution timer for measuring elapsed time.
t = time.perf_counter()

w2v_model.build_vocab(sentence_with_phrase, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time.perf_counter() - t) / 60, 2)))

In [None]:
# time.clock() was removed in Python 3.8; perf_counter() is the
# high-resolution timer for measuring elapsed time.
t = time.perf_counter()

w2v_model.train(sentence_with_phrase, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time.perf_counter() - t) / 60, 2)))

In [None]:
w2v_model.wv.most_similar(positive=["awesome"])

In [None]:
w2v_model.wv.most_similar(positive=["huge"])

#### Sentiment Analysis (without tfidf)

In [None]:
design_matrix = np.zeros([len(new_texts),200])

In [None]:
# Represent every review by the mean of its word vectors.
for i, review in enumerate(new_texts):
    temp_sent = review.split(' ')
    # Vectors are looked up through `.wv`: indexing the model object itself
    # is deprecated in gensim 3.x and no longer possible in gensim 4.
    design_matrix[i] = np.sum([w2v_model.wv[token] for token in temp_sent],
                              axis=0, dtype=np.float64) / len(temp_sent)

In [None]:
from sklearn.ensemble import RandomForestClassifier
x = design_matrix[:8000,]
clf = RandomForestClassifier(n_estimators=100, max_depth=2,
                             random_state=0)
clf.fit(x, y[:8000])

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
y_true = y[8000:]
y_pred = clf.predict(design_matrix[8000:,])
# Only the last expression of a cell is echoed, so the accuracy has to be
# printed explicitly or it is computed and thrown away.
print('Accuracy: {}'.format(accuracy_score(y_true, y_pred)))
# Root mean squared error of the predicted star ratings.
np.sqrt(mean_squared_error(y_true, y_pred))

#### tf-idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tf = TfidfVectorizer(analyzer='word', min_df = 1, lowercase = False)

In [None]:
response =  tf.fit_transform(new_texts)
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version offers.
if hasattr(tf, 'get_feature_names_out'):
    feature_names = list(tf.get_feature_names_out())
else:
    feature_names = tf.get_feature_names()
# Dense document-term table: one row per review, one column per term.
res_df = pd.DataFrame(response.toarray(),columns = feature_names)

In [None]:
response.toarray()

In [None]:
# Slice the sparse matrix first and densify each part once, rather than
# converting the whole matrix to a dense array twice.
tfidf_train = response[:8000].toarray()
tfidf_test = response[8000:].toarray()
y_train = y[:8000]
y_test = y[8000:]

In [None]:
from sklearn.linear_model import LogisticRegression
# The solver name must be exactly 'newton-cg'; the stray typographic quote
# inside the original string made it an unknown solver.
lr=LogisticRegression(multi_class='multinomial',solver='newton-cg')
lr.fit(tfidf_train,y_train)
y_pred=lr.predict(tfidf_test)
# Root mean squared error of the predicted star ratings.
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
from sklearn import linear_model
# Lasso (L1-penalised linear regression) on the tf-idf features.
# Note: this rebinds the global name `clf` used by the earlier cells.
clf = linear_model.Lasso(alpha=0.1)
clf.fit(tfidf_train, y_train)
y_pred=clf.predict(tfidf_test)
# Root mean squared error of the predictions on the test rows.
np.sqrt(mean_squared_error(y_test, y_pred))

#### tfidf with w2v

In [None]:
def gettoptfidf_dict(df,k): #top k
    """For every row of ``df`` map its k highest-weighted terms to their weights.

    Term names come from the module-level ``feature_names``.  The top k
    entries are taken unconditionally, so a row with fewer than k non-zero
    weights also contributes zero-weight terms.

    Returns a list with one dict per row, terms ordered by decreasing weight.
    """
    vocabulary = np.array(feature_names)
    top_terms = []
    for position in range(len(df)):
        row = df.iloc[position]
        weights = sorted(row, reverse = True)[:k]
        terms = vocabulary[np.argsort(row)[-k:]][::-1]
        top_terms.append(dict(zip(terms, weights)))
    return top_terms

In [None]:
tfidf_final_dict = gettoptfidf_dict(res_df,10)

In [None]:
def gettoptfidf_word_array(df,k):
    """For every row of ``df`` list its k highest-weighted terms.

    Term names come from the module-level ``feature_names``.  The top k
    entries are taken unconditionally, so a row with fewer than k non-zero
    weights also yields zero-weight terms.

    Returns a list with one list of terms per row, by decreasing weight.
    """
    vocabulary = np.array(feature_names)
    return [
        vocabulary[np.argsort(df.iloc[position])[-k:]][::-1].tolist()
        for position in range(len(df))
    ]

In [None]:
tfidf_final_word_array = gettoptfidf_word_array(res_df,10)

In [None]:
design_matrix2 = np.zeros([len(new_texts),200])
# Represent every review by the mean vector of its top tf-idf terms.
for i in range(design_matrix2.shape[0]):
    temp_sent = tfidf_final_word_array[i]
    # Vectors are looked up through `.wv`: indexing the model object itself
    # is deprecated in gensim 3.x and no longer possible in gensim 4.
    design_matrix2[i] = np.sum([w2v_model.wv[token] for token in temp_sent],
                               axis=0, dtype=np.float64) / len(temp_sent)

In [None]:
design_matrix2.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit on the first 9000 English reviews.
# Note: this rebinds the global names y, x and clf used by earlier cells.
y = train_10000_eng["stars"].values[:9000]
x = design_matrix2[:9000,]
clf = RandomForestClassifier(n_estimators=100, max_depth=2,
                             random_state=0)
clf.fit(x, y)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
# The model was fitted on rows [:9000], so evaluate on everything after
# them.  A fixed "last 1000 rows" window overlaps the training rows whenever
# fewer than 10000 English reviews remain.
y_true = train_10000_eng["stars"].values[9000:]
y_pred = clf.predict(design_matrix2[9000:,])
# Only the last expression of a cell is echoed, so the accuracy has to be
# printed explicitly or it is computed and thrown away.
print('Accuracy: {}'.format(accuracy_score(y_true, y_pred)))
# Root mean squared error of the predicted star ratings.
np.sqrt(mean_squared_error(y_true, y_pred))