## Read file

In [1]:
import json
import pandas as pd
def read_json_nrows(nrows, filename):
    """Read at most ``nrows`` records from the start of a JSON Lines file.

    Parameters
    ----------
    nrows : int
        Maximum number of records to load.
    filename : str
        Path of a text file holding one JSON object per line.

    Returns
    -------
    pandas.DataFrame
        One row per parsed record, indexed 0, 1, 2, ... in file order.
        The frame is empty when ``nrows`` is not positive or the file
        contains no records.
    """
    records = []
    with open(filename) as f:
        for raw_line in f:
            if len(records) >= nrows:
                break
            raw_line = raw_line.strip()
            # Blank lines (e.g. a trailing newline at end of file) hold no record.
            if not raw_line:
                continue
            records.append(json.loads(raw_line))
    # Build the frame once: appending row by row copies the whole frame on
    # every iteration, and DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records)
    

In [2]:
# Both inputs are read as line-delimited JSON records (lines=True).
train, test = (
    pd.read_json(path, orient='records', lines=True)
    for path in ('review_train.json', 'review_test.json')
)

In [3]:
import re
import numpy as np 
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet

In [4]:
train = train[:3000000]

#### Language

In [5]:
def not_language(text):
    """Tell whether ``text`` contains no word characters once emoticons are removed.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    bool
        True when nothing but punctuation, whitespace and emoticons is
        present (including the empty string), False otherwise.
    """
    # Delete common emoticons first, so the letters in ":D" or ":P" are not
    # mistaken for real text.
    without_emoticons = re.sub(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', '', text)
    # \w matches letters, digits and underscore; a single hit means the text
    # has content.
    return re.search(r'\w', without_emoticons) is None

In [6]:
not_lang_train = train[train.text.apply(not_language)].index.values

KeyboardInterrupt: 

In [None]:
train.loc[not_lang_train,'lang_type'] = 'english'

In [None]:
from langdetect import detect
# Rows flagged by not_language() are collected in `not_lang_train`; a set makes
# the per-row membership test O(1) instead of scanning the index array.
rows_without_language = set(not_lang_train)
for i in train.index:
    if i in rows_without_language:
        continue
    # NOTE(review): detect() may raise for texts it cannot extract features
    # from (e.g. digits only) - confirm and handle if that occurs.
    train.loc[i, 'lang_type'] = detect(train.text[i])

In [5]:
train_eng = train

#### Lemmatization

In [6]:
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token, in order."""
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [7]:
def get_wordnet_pos(tag):
    """Translate a part-of-speech tag into the matching WordNet constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag yields ``None``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    return None
    
wnl = WordNetLemmatizer()


def lemmatizer(text):
    """Tokenize ``text``, POS-tag the tokens and return their WordNet lemmas.

    A token whose tag has no WordNet counterpart (``get_wordnet_pos``
    returns ``None``) is lemmatized as a noun.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    return [
        wnl.lemmatize(word, pos=get_wordnet_pos(pos_label) or wordnet.NOUN)
        for word, pos_label in tagged_tokens
    ]

#### Stop-words

In [8]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

# 'but' and 'not' are always kept out of the stop list (presumably because
# contrast and negation matter for the star rating - confirm).
stop.remove('but')
stop.remove('not')

# Prepositions are kept as well; only those actually present in the stop list
# need removing.
preposition = [
    'of', 'with', 'at', 'from', 'into', 'during',
    'including', 'until', 'till', 'against', 'among',
    'throughout', 'despite', 'towards', 'upon', 'concerning', 'to', 'in',
    'for', 'on', 'by', 'about', 'like', 'through', 'over',
    'before', 'between', 'after', 'since', 'without', 'under',
    'within', 'along', 'following', 'across', 'behind',
    'beyond', 'plus', 'except', 'but', 'up', 'out', 'around', 'down', 'off', 'above', 'near',
]
for prep in preposition:
    if prep in stop:
        stop.remove(prep)

#### Convert n't to not

In [9]:
# Contractions whose stem changes when "n't" is expanded; the generic
# "n't" -> " not" rule would turn them into "ca not", "wo not", "sha not".
_IRREGULAR_CONTRACTIONS = {
    "can't": "can not",
    "won't": "will not",
    "shan't": "shall not",
}


def no_abbreviation(text):
    """Expand negative contractions so the negation appears as the word "not".

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        ``text`` with every "n't" rewritten as " not" ("don't" -> "do not").
        The irregular forms "can't", "won't" and "shan't" become "can not",
        "will not" and "shall not"; a leading capital is preserved.
    """
    def _expand_irregular(match):
        word = match.group(0)
        expanded = _IRREGULAR_CONTRACTIONS[word.lower()]
        return expanded.capitalize() if word[0].isupper() else expanded

    # Handle the irregular forms first, before the generic rule can mangle them.
    text = re.sub(r"\b(?:can|won|shan)'t\b", _expand_irregular, text,
                  flags=re.IGNORECASE)
    return re.sub(r"n't", ' not', text)

#### Adversatives

In [10]:
but = ['yet','however','nonetheless','whereas','nevertheless']
although = ['although','though','notwithstanding','albeit']

In [11]:
def _replace_whole_words(text, words, replacement):
    """Replace each whole-word occurrence of any entry of ``words`` with ``replacement``."""
    if not words:
        return text
    # \b anchors keep the match from firing inside longer words: without them
    # 'though' matches inside 'although' (giving 'alalthough') and inside
    # 'thoughtful'.
    pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words) + r')\b'
    return re.sub(pattern, replacement, text)


def change_but(text):
    """Replace every word listed in the module-level ``but`` list with 'but'."""
    return _replace_whole_words(text, but, 'but')


def change_although(text):
    """Replace every word listed in the module-level ``although`` list with 'although'."""
    return _replace_whole_words(text, although, 'although')


def change_adversatives(text):
    """Normalise adversatives: apply ``change_but`` and then ``change_although``."""
    return change_although(change_but(text))

In [12]:
def preprocessing(text):
    """Normalise one review text for vectorisation.

    The text is lower-cased and every run of non-word characters
    (anything other than letters, digits and underscore) is collapsed into
    a single space. Newlines are non-word characters, so they are covered
    by the same substitution and need no separate step.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    dict
        ``{'text': <normalised text>}``.
    """
    # Disabled step: collect emoticons.
    # emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Disabled step: expand "n't" to " not".
    # text = no_abbreviation(text)
    # Keep word characters only (letters, digits, underscore).
    text = re.sub(r'\W+', ' ', text.lower())
    # Disabled step: unify adversative conjunctions.
    # text = change_adversatives(text)
    # Disabled step: lemmatise and drop stop words.
    # tokens = lemmatizer(text)
    # text = ''
    # for index, token in enumerate(tokens):
        # if token in stop:
        #     tokens[index] = ''
        # else:
        #     text = text + tokens[index] + ' '
    # return {'text':text,'emoticons':emoticons}
    return {'text': text}

In [13]:
from tqdm import tqdm, tqdm_pandas
tqdm.pandas()
dictionary_train = train_eng.text.progress_apply(preprocessing)
dictionary_test = test.text.progress_apply(preprocessing)

100%|██████████| 3000000/3000000 [02:50<00:00, 17626.18it/s]
100%|██████████| 1321274/1321274 [01:17<00:00, 17041.21it/s]


In [14]:
y = train_eng.loc[dictionary_train.index]["stars"]

In [15]:
# dictionary_train / dictionary_test come from .progress_apply on the text
# columns, so they share the index and order of train_eng / test. Iterating the
# values directly yields the same lists without one label lookup per row.
texts_train = [record['text'] for record in dictionary_train]
texts_test = [record['text'] for record in dictionary_test]

In [16]:
# 释放空间
import gc
del train_eng
del test
del train
del dictionary_train
del dictionary_test
gc.collect()

2270

In [17]:
num_train = len(texts_train)
num_test = len(texts_test)

In [18]:
texts_train.extend(texts_test)

In [19]:
del texts_test
gc.collect()

749

In [20]:
texts = texts_train

In [21]:
del texts_train
gc.collect()

421

In [27]:
from autocorrect import spell

# Spell-correct every space-separated token of every text. Building the list
# directly avoids the dummy first element and the full-list copy needed to
# drop it again.
# NOTE: the progress bar below estimated roughly 36 hours for this cell.
new_texts = [
    [spell(word) for word in text.split(' ')]
    for text in tqdm(texts)
]

  0%|          | 747/2321274 [00:37<36:12:36, 17.80it/s]

KeyboardInterrupt: 

  0%|          | 747/2321274 [00:50<36:12:36, 17.80it/s]

In [None]:
# Join each token list back into one space-separated string. `result` stays
# bound because a later cell deletes it by name.
result = [' '.join(words) for words in new_texts]
new_texts = result

In [22]:
new_texts = texts

In [23]:
del texts
gc.collect()

374

#### Bigrams for phrase

In [24]:
from gensim.models.phrases import Phrases, Phraser

In [25]:
sentence_stream = [sent.split(' ') for sent in tqdm(new_texts)]

100%|██████████| 4321274/4321274 [20:50<00:00, 3456.11it/s] 


In [26]:
# min_count: word pairs seen fewer times than this are ignored, so a lower value detects more phrases.
# threshold: minimum score for a pair to be accepted as a phrase, so a higher value means fewer phrases.
bigram = Phraser(Phrases(sentence_stream, min_count=5, threshold=5))

In [27]:
sentence_with_phrase = bigram[sentence_stream]

In [28]:
# Apply the bigram model to each tokenised text and join the tokens back into
# one string. Building the list directly avoids the dummy first element and
# the extra copy of a multi-million-element list made by `result[1:]`.
# `result` stays bound because the clean-up cell deletes it by name.
result = [' '.join(bigram[tokens]) for tokens in tqdm(sentence_stream)]
new_texts = result

100%|██████████| 4321274/4321274 [24:26<00:00, 2945.89it/s]


In [29]:
del sentence_stream
del sentence_with_phrase
del result
gc.collect()

13942

#### tf-idf + LR

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
tf = TfidfVectorizer(analyzer='word', min_df = 1, lowercase = True)

In [34]:
response =  tf.fit_transform(new_texts)

In [35]:
tfidf_train = response[:num_train]
tfidf_test = response[num_train:]

In [36]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(multi_class='multinomial',solver='newton-cg')
lr.fit(tfidf_train,y)
y_pred=lr.predict(tfidf_test)




In [46]:
response

<4321274x712079 sparse matrix of type '<class 'numpy.float64'>'
	with 295346391 stored elements in Compressed Sparse Row format>

#### save sparse matrix

In [48]:
from scipy.sparse import csr_matrix

def save_sparse_csr(filename, array):
    """Store the CSR components of ``array`` in a NumPy ``.npz`` archive.

    The archive holds four entries: ``data``, ``indices``, ``indptr`` and
    ``shape``. ``np.savez`` appends the ``.npz`` extension to ``filename``
    automatically.
    """
    components = {
        'data': array.data,
        'indices': array.indices,
        'indptr': array.indptr,
        'shape': array.shape,
    }
    np.savez(filename, **components)


# Persist the TF-IDF matrix to 'response.npz'. The former bare
# load_sparse_csr('response') call was dropped: that helper is only defined in a
# later cell (NameError on a fresh run) and its return value was discarded.
save_sparse_csr('response', response)

#### submit

In [38]:
id = np.array(range(1,len(y_pred)+1))

In [39]:
header = np.array([['Id','Expected']])

In [40]:
y_pred = y_pred.reshape([-1,1])

In [41]:
id = id.reshape([-1,1])

In [42]:
ans = np.hstack((id, y_pred))

In [43]:
ans = np.vstack((header, ans))

In [44]:
np.savetxt("TueG1_submit2.csv", ans, delimiter=",", fmt='%s')

In [None]:
def load_sparse_csr(filename):
    """Rebuild a CSR matrix from an archive written by ``save_sparse_csr``.

    Parameters
    ----------
    filename : str
        Archive path without the ``.npz`` extension; it is appended here.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix built from the archive's ``data``, ``indices``, ``indptr``
        and ``shape`` entries.
    """
    # np.load returns a lazily-read archive that keeps the file open; the
    # context manager closes it once the arrays have been pulled out.
    with np.load(filename + '.npz') as loader:
        shape = tuple(int(dim) for dim in loader['shape'])
        return csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=shape,
        )

response = load_sparse_csr('response')