In [11]:
import pandas as pd
import numpy as np
import gc
import os
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import sys
import math
import datetime 
from random import shuffle
# Keep a reference to the current sys.stdout stream object.
# NOTE(review): not used in the visible cells — presumably kept so stdout can be restored after a redirect; confirm.
stdout = sys.stdout


In [13]:
# --- Base directories -------------------------------------------------------
# MAIN_DIR is the prefix of every artefact written by this notebook; it is
# concatenated as-is, so a non-empty value must end with a path separator.
#MAIN_DIR = '/root/kaggle/restaurantAvis/cache/' #Alain
MAIN_DIR = '' #Guillaume
# Prefix of the raw review datasets.
DATASET_DIR = '../'

# --- Train / dev split dumps ------------------------------------------------
X_TRAIN_CSV = MAIN_DIR + 'X_train.csv'
X_DEV_CSV = MAIN_DIR + 'X_dev.csv'
Y_TRAIN_CSV = MAIN_DIR + 'y_train.csv'
Y_DEV_CSV = MAIN_DIR + 'y_dev.csv'

# --- Vocabulary files -------------------------------------------------------
VOCAB_ALL = MAIN_DIR + 'vocab_all.txt'
MOTS_NECESSAIRES = MAIN_DIR + 'mots_necessaires.txt'
PRETRAINED_VOCAB_CSV = MAIN_DIR + 'pretrained_vocab.csv'
TO_TRAIN_VOCAB_CSV = MAIN_DIR + 'to_train_vocab.csv'
ONE_HOT_COLS_CSV = MAIN_DIR + 'one_hot_cols.csv'

# --- Embeddings and model ---------------------------------------------------
# Filename prefix only: the embedding dimension and '.csv' are appended on use.
EMBS_CSV = MAIN_DIR + 'pretrained_embs_'
TRAINEDMODEL = MAIN_DIR + 'model_multi_lstm1.ckpt'

We load the data

In [5]:
# Read the two tab-separated review files.
# quoting=3 is csv.QUOTE_NONE: quote characters are kept as part of the text
# instead of being interpreted as field delimiters.
train = pd.read_csv(DATASET_DIR + 'newdata/1-restaurant-train.tsv', sep='\t', quoting=3)
dev = pd.read_csv(DATASET_DIR + 'Restaurant_Reviews.tsv', sep='\t', quoting=3)

In [7]:
# Non-null count of every column, followed by the first five training rows.
print(train.count())
train.head(n=5)

Review    82065
Liked     82065
dtype: int64


Unnamed: 0,Review,Liked
0,"""Thank you thank you thank you !! I want to t...",4
1,"""A Humane Society store at the Biltmore? Inte...",5
2,Don't buy Nike sneakers if you want to return ...,1
3,"""I have to say I love most things about Sprout...",3
4,"""The tire pressure light came on a day or so a...",5


In [15]:
# Non-null count of every column, followed by the first five dev rows.
print(dev.count())
dev.head(n=5)

Review    1000
Liked     1000
dtype: int64


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


Function to clean the text

In [8]:
def clean_text(data):
    """Normalise a raw review into lower-case alphabetic words.

    Args:
        data: the review text as a ``str``.

    Returns:
        The lower-cased text in which every character outside ``a-z`` has been
        turned into a separator, with the remaining words joined by single
        spaces (no leading or trailing whitespace).
    """
    # The two-character escape sequences (backslash + 'n', backslash + 'r')
    # are blanked first; otherwise the regex below would only strip the
    # backslash and leave a stray 'n' or 'r' glued to the next word.
    unescaped = data.replace('\\n', ' ').replace('\\r', ' ')
    alphabetic = re.sub("[^A-Za-z]", " ", unescaped.lower())
    # split() without arguments collapses any run of whitespace.
    return " ".join(alphabetic.split())

We get the whole vocabulary and the most used words

In [9]:
def save_list(filepath, a_list):
    """Write the items of ``a_list`` to ``filepath``, one item per line.

    Args:
        filepath: destination path; an existing file is overwritten.
        a_list: any iterable; each item is rendered with ``str.format``.

    The file is always written as UTF-8 so that non-ASCII vocabulary tokens
    are stored identically on every platform instead of depending on the
    locale's default encoding.
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        f.writelines("{}\n".format(item) for item in a_list)

In [14]:
# Full vocabulary: every unigram token found in the training reviews.
# stop_words is left at its default (None), which applies no stop-word
# filtering, exactly like the empty frozenset used before.
# NOTE(review): recent scikit-learn releases appear to accept only 'english',
# a list or None for stop_words, so the frozenset could be rejected — confirm.
vectorizer = CountVectorizer(ngram_range=(1, 1))
vectorizer.fit(train["Review"])
mots_utiles = set(vectorizer.vocabulary_)
del vectorizer

gc.collect()
# The original cell wrote to MOTS_UTILES, a name that is not defined anywhere
# in the notebook (NameError on a fresh kernel). VOCAB_ALL is the path the
# configuration cell reserves for the whole vocabulary.
# Sorting makes the dumped file identical from one run to the next.
save_list(VOCAB_ALL, sorted(mots_utiles))
print(len(mots_utiles))

92334


In [28]:
# Frequent vocabulary: tokens that occur in at least 50 training reviews
# (min_df=50 is a document-frequency threshold).
# stop_words is left at its default (None), which applies no stop-word
# filtering, exactly like the empty frozenset used before.
# NOTE(review): recent scikit-learn releases appear to accept only 'english',
# a list or None for stop_words, so the frozenset could be rejected — confirm.
vectorizer = CountVectorizer(min_df=50)
vectorizer.fit(train["Review"])
mots_necessaires = set(vectorizer.vocabulary_)
del vectorizer
gc.collect()
# Sorting makes the dumped file identical from one run to the next.
save_list(MOTS_NECESSAIRES, sorted(mots_necessaires))
print(len(mots_necessaires))

9221


We clean the "Review" text field of the train and dev sets

In [30]:
#### Preprocess data   
def extract_features(df):
    """Clean the ``Review`` column of ``df`` in place.

    Every value is converted with ``str`` and passed through ``clean_text``.

    Args:
        df: a DataFrame with a ``Review`` column; it is modified in place.

    Returns:
        None.
    """
    # Map over the single column instead of applying a lambda row by row:
    # row-wise DataFrame.apply builds a Series per row (slow on large frames)
    # and returns a DataFrame rather than a Series when the frame is empty,
    # which makes the column assignment fail.
    df['Review'] = df['Review'].map(lambda review: clean_text(str(review)))

# Clean both datasets in place, then preview the cleaned training reviews.
for frame in (train, dev):
    extract_features(frame)
train.head()

Unnamed: 0,Review,Liked
0,thank you thank you thank you i want to thank ...,4
1,a humane society store at the biltmore interes...,5
2,don t buy nike sneakers if you want to return ...,1
3,i have to say i love most things about sprouts...,3
4,the tire pressure light came on a day or so ag...,5


We estimate the input sequence length for the RNN from the distribution of review lengths (in words): maximum, mean and 90th percentile

In [33]:
# Number of whitespace-separated words in each cleaned review.
train['review_len'] = train['Review'].map(lambda text: len(text.split()))
dev['review_len'] = dev['Review'].map(lambda text: len(text.split()))

# Longest review, average length and 90th percentile of the training set.
print(train['review_len'].max())
print(train['review_len'].mean())
print(train['review_len'].quantile(0.90))

1023
191.61598732711875
333.0


We load the pretrained GloVe word embedding vectors

In [40]:
def load_pretrained_glove(dim, glove_dir="../tools//embedding/glove.6B"):
    """Load the GloVe 6B vocabulary and vectors of dimension ``dim``.

    Args:
        dim: embedding dimension; selects the file ``glove.6B.<dim>d.txt``.
        glove_dir: directory containing the GloVe text files. The default is
            the location that was previously hard-coded.

    Returns:
        A ``(vocab, vectors)`` tuple: ``vocab`` holds the first column of the
        file (the tokens) as strings, ``vectors`` holds columns ``1..dim`` as
        floats, in the same row order.
    """
    path = "{}/glove.6B.{}d.txt".format(glove_dir, dim)
    # comments=None: '#' is a regular token in the file, not a comment marker.
    # encoding='utf-8': decode non-ASCII tokens explicitly rather than relying
    # on NumPy's version-dependent default.
    vocab = np.loadtxt(path, delimiter=' ', dtype=str, comments=None,
                       usecols=0, encoding='utf-8')
    # usecols is given as a real sequence (the original passed a generator).
    vectors = np.loadtxt(path, delimiter=' ', comments=None,
                         usecols=list(range(1, dim + 1)), encoding='utf-8')
    return vocab, vectors


In [41]:
# The 50-d file provides the vocabulary; only the vectors of the 100-d and
# 300-d files are kept (their rows are assumed to follow the same token
# order — TODO confirm).
pretrained_vocab, pretrained_embs = load_pretrained_glove(50)
pretrained_embs_100 = load_pretrained_glove(100)[1]
pretrained_embs_300 = load_pretrained_glove(300)[1]

# Sanity check: first ten tokens and first three 50-d vectors.
print(pretrained_vocab[:10])
print(pretrained_embs[:3])

['the' ',' '.' 'of' 'to' 'and' 'in' 'a' '"' "'s"]
[[ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
  -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
  -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
  -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
  -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526e-01
   4.0071e+00 -1.8594e-01 -5.2287e-01 -3.1681e-01  5.9213e-04  7.4449e-03
   1.7778e-01 -1.5897e-01  1.2041e-02 -5.4223e-02 -2.9871e-01 -1.5749e-01
  -3.4758e-01 -4.5637e-02 -4.4251e-01  1.8785e-01  2.7849e-03 -1.8411e-01
  -1.1514e-01 -7.8581e-01]
 [ 1.3441e-02  2.3682e-01 -1.6899e-01  4.0951e-01  6.3812e-01  4.7709e-01
  -4.2852e-01 -5.5641e-01 -3.6400e-01 -2.3938e-01  1.3001e-01 -6.3734e-02
  -3.9575e-01 -4.8162e-01  2.3291e-01  9.0201e-02 -1.3324e-01  7.8639e-02
  -4.1634e-01 -1.5428e-01  1.0068e-01  4.8891e-01  3.1226e-01 -1.2520e-01
  -3.7512e-02 -1.5179e+00  1.2612e-

On enlève les mots inutiles pour traiter le jeu de données (test et train)

In [42]:
# Row indices of the pretrained tokens that never occur in the training vocabulary.
mots_inutiles = [
    position
    for position, token in enumerate(pretrained_vocab)
    if token not in mots_utiles
]
print(len(mots_inutiles))

343793


In [43]:
# Remove the unused rows from the vocabulary and from each embedding matrix,
# so that row i of every array still refers to the same token.
# NOTE(review): this overwrites its own inputs, so running the cell a second
# time deletes further rows using stale indices.
pretrained_vocab = np.delete(pretrained_vocab, mots_inutiles, axis=0)
pretrained_embs = np.delete(pretrained_embs, mots_inutiles, axis=0)
pretrained_embs_100 = np.delete(pretrained_embs_100, mots_inutiles, axis=0)
pretrained_embs_300 = np.delete(pretrained_embs_300, mots_inutiles, axis=0)
for filtered in (pretrained_vocab, pretrained_embs, pretrained_embs_100):
    print(filtered.shape)

(56207,)
(56207, 50)
(56207, 100)


On calcule le vocabulaire connu par l'application

In [44]:
# Frequent training words that have no pretrained GloVe vector.
# They are sorted because set iteration order for strings changes between
# interpreter runs (hash randomisation); sorting keeps the index of each of
# these words in `vocab` stable across runs.
only_in_train = sorted(mots_necessaires - set(pretrained_vocab))
# Padding token, appended last.
only_in_train.append("<BLANK>")
# Final vocabulary: pretrained tokens first, then the words listed above.
vocab = list(pretrained_vocab) + only_in_train
print(len(only_in_train))
print(len(vocab))

349
56556


In [45]:
# Fixed number of tokens per review.
REVIEW_LENGTH = 333


def preprocess_text(data, length):
    """Pad or truncate a text to ``length`` whitespace-separated tokens.

    Args:
        data: the text; it is converted with ``str`` before splitting.
        length: number of tokens to keep.

    Returns:
        The first ``length`` tokens joined by single spaces; shorter texts are
        right-padded with ``<BLANK>`` tokens.
    """
    tokens = str(data).split()
    # A non-positive repeat count yields an empty list, so texts that are
    # already long enough receive no padding.
    padding = ['<BLANK>'] * (length - len(tokens))
    return ' '.join((tokens + padding)[:length])

In [46]:
# Bring every training review to exactly REVIEW_LENGTH tokens.
train['Review'] = train['Review'].map(
    lambda review: preprocess_text(review, REVIEW_LENGTH)
)

In [47]:
# First three rows after padding/truncation.
train.head(n=3)

Unnamed: 0,Review,Liked,review_len
0,thank you thank you thank you i want to thank ...,4,96
1,a humane society store at the biltmore interes...,5,151
2,don t buy nike sneakers if you want to return ...,1,148


In [48]:
# Hold out 20% of the training file as a dev split.
# A fixed random_state makes the split reproducible across kernel restarts.
# NOTE(review): X is the whole frame, so X_train / X_dev still contain the
# 'Liked' label column — make sure downstream code only reads 'Review'.
X_train, X_dev, y_train, y_dev = train_test_split(
    train, train['Liked'], test_size=0.2, random_state=42
)

In [49]:
# Persist each part of the split to its CSV file.
for split, csv_path in (
    (X_train, X_TRAIN_CSV),
    (y_train, Y_TRAIN_CSV),
    (X_dev, X_DEV_CSV),
    (y_dev, Y_DEV_CSV),
):
    split.to_csv(csv_path)

In [51]:
# Vocabulary files, one token per line. They are written as UTF-8 explicitly
# so that non-ASCII tokens do not depend on the platform's default encoding.
np.savetxt(PRETRAINED_VOCAB_CSV, pretrained_vocab, delimiter=';', fmt='%s', encoding='utf-8')
np.savetxt(TO_TRAIN_VOCAB_CSV, only_in_train, delimiter=';', fmt='%s', encoding='utf-8')

# Filtered embedding matrices, one ';'-separated row per pretrained token;
# the dimension is part of the file name.
np.savetxt(EMBS_CSV + '50.csv', pretrained_embs, delimiter=';')
np.savetxt(EMBS_CSV + '100.csv', pretrained_embs_100, delimiter=';')
np.savetxt(EMBS_CSV + '300.csv', pretrained_embs_300, delimiter=';')