In [2]:
# Notebook setup: imports, seeding of each random source used below
# (hash seed, NumPy, `random`, TensorFlow) and an explicit Keras session.
from __future__ import division
import pyximport
pyximport.install()
import os
import random
import numpy as np
import tensorflow as tf
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up; setting it
# here presumably only affects child processes -- confirm the intended effect.
os.environ['PYTHONHASHSEED'] = '10000'
np.random.seed(10001)
random.seed(10002)
# Session configuration with explicit thread counts (5 intra-op, 1 inter-op).
# NOTE(review): tf.ConfigProto / tf.set_random_seed / tf.Session look like the
# TensorFlow 1.x API -- confirm the installed TensorFlow version supports them.
session_conf = tf.ConfigProto(intra_op_parallelism_threads=5, inter_op_parallelism_threads=1)
from keras import backend
tf.set_random_seed(10003)
# Hand Keras a session bound to the default graph and the configuration above.
backend.set_session(tf.Session(graph=tf.get_default_graph(), config=session_conf))
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Dense, concatenate, GRU, Embedding, Flatten, Activation, BatchNormalization, PReLU
from keras.initializers import he_uniform
from keras.layers import Conv1D
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.optimizers import Adam, SGD
from keras.models import Model 

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
###################################################################
# GLOBAL VARIABLES
# Directory prefix that load_data() prepends to 'train.tsv'.
path = '../input/'
split = -1#1400000 # use -1 for submission, otherwise the value of split is the number of instances in train
# Number of worker processes / chunks used by parallelize_dataframe().
cores = 4
# Maximum number of whitespace-separated words kept by clean_str(); also the
# divisor applied to the description word count in prepare_data().
max_text_length=60
# NOTE(review): not referenced in the visible cells; presumably minimum document
# frequencies for unigrams and bigrams -- confirm against the code that uses them.
min_df_one=5
min_df_bi=5

# Accented / typographic characters (as they appear after lower-casing) and
# the ASCII character each one is replaced with.
_CLEAN_STR_TRANSLATION = str.maketrans({
    u"é": u"e", u"ē": u"e", u"è": u"e", u"ê": u"e",
    u"à": u"a", u"â": u"a",
    u"ô": u"o", u"ō": u"o",
    u"ü": u"u",
    u"ï": u"i", u"í": u"i",
    u"ç": u"c",
    u"’": u"'",
})


def clean_str(text, max_words=None):
    """Normalize a free-text field to lower-case ASCII words and digit groups.

    Steps, in order: keep the first ``max_words`` whitespace-separated words,
    lower-case, replace a fixed set of accented characters by their ASCII
    counterpart, expand ``w/`` to ``with``, turn every character outside
    ``[a-z0-9]`` into a space, put spaces around each run of digits and
    collapse repeated whitespace.

    Parameters
    ----------
    text : str or any
        Raw text. Anything that is not a ``str`` (for instance the float NaN
        that marks a missing value) is treated as missing.
    max_words : int, optional
        Number of leading words to keep. Defaults to the module-level
        ``max_text_length``.

    Returns
    -------
    str or float
        The cleaned text, or ``np.nan`` when ``text`` is not a string.
    """
    # Explicit type check instead of a bare ``except``: missing values still
    # come back as NaN, but genuine errors are no longer silently hidden.
    # ``np.nan`` is used because the ``np.NaN`` alias was removed in NumPy 2.0.
    if not isinstance(text, str):
        return np.nan
    if max_words is None:
        max_words = max_text_length

    text = ' '.join(text.split()[:max_words]).lower()
    text = text.translate(_CLEAN_STR_TRANSLATION)
    text = text.replace(u"w/", u" with ")

    text = re.sub(u"[^a-z0-9]", u" ", text)
    # The capturing group keeps the digit runs in the split result, so joining
    # with a space separates numbers from adjacent letters ("6s" -> "6 s").
    text = u" ".join(re.split(r'(\d+)', text))
    return re.sub(r"\s+", u" ", text).strip()

In [None]:
def load_data( ):
    """Read the training TSV and return it ready for feature preparation.

    Reads ``path + 'train.tsv'``, fills missing ``item_condition_id`` with 2
    and missing ``shipping`` with 0, optionally keeps only the first ``split``
    rows (when the module-level ``split`` is positive), drops rows whose price
    is not strictly positive, replaces ``price`` by ``log1p(price)`` stored as
    float32 and removes the ``train_id`` column.

    Returns
    -------
    pandas.DataFrame
        The filtered training frame with a fresh 0..n-1 index.
    """
    print ('LOAD 1.4M ROWS FOR TRAIN')
    df_train = pd.read_csv(path + 'train.tsv', sep='\t', encoding='utf-8')
    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: the chained in-place form does not update the frame
    # once pandas Copy-on-Write is active.
    df_train['item_condition_id'] = df_train['item_condition_id'].fillna(2)
    df_train['shipping'] = df_train['shipping'].fillna(0)
    if split > 0:
        # Positional slice: exactly `split` rows. The label-based .loc[:split]
        # is end-inclusive and kept one row too many.
        df_train = df_train.iloc[:split].reset_index(drop=True)
    df_train = df_train.loc[df_train.price > 0].reset_index(drop=True)
    df_train['price'] = np.log1p(df_train['price']).astype(np.float32)
    return df_train.drop('train_id', axis=1)
    
def create_count_features(df_data):
    """Add the ``nb_words_item_description`` column to ``df_data`` in place.

    The column holds, for every row, the number of whitespace-separated words
    in ``item_description``, stored as ``uint16``. Nothing is returned.
    """
    # str.split() without arguments never yields empty strings, so the length
    # of its result is the word count.
    word_counts = df_data['item_description'].apply(
        lambda description: len(description.split())
    )
    df_data['nb_words_item_description'] = word_counts.astype(np.uint16)
    
def parallelize_dataframe(df, func, n_workers=None):
    """Apply ``func`` to consecutive chunks of ``df`` in a process pool.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Data to process; it is cut row-wise into ``n_workers`` nearly equal,
        order-preserving chunks.
    func : callable
        Called once per chunk in a worker process; must be picklable and
        return a pandas object that ``pd.concat`` can combine.
    n_workers : int, optional
        Number of chunks and of worker processes. Defaults to the
        module-level ``cores``.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        The per-chunk results concatenated in the original row order.
    """
    if n_workers is None:
        n_workers = cores
    # Split row positions rather than the pandas object itself:
    # np.array_split on a Series/DataFrame goes through the deprecated
    # ``swapaxes`` method. Chunk sizes are the same as before.
    positions = np.array_split(np.arange(len(df)), n_workers)
    chunks = [df.iloc[idx] for idx in positions]
    pool = Pool(n_workers)
    try:
        results = pool.map(func, chunks)
    finally:
        # Always release the worker processes, even when ``func`` raises.
        pool.close()
        pool.join()
    return pd.concat(results)
def clean_str_df(df):
    """Return ``df`` with ``clean_str`` applied to each of its elements."""
    cleaned = df.apply(clean_str)
    return cleaned


In [None]:
def prepare_data(df_data, train=True):
    """Clean the text columns and build the encoded / combined features.

    The frame is modified in place and also returned. Steps:

    * fill missing ``name`` / ``item_description`` with '' and strip the
      'No description yet' placeholder;
    * split ``category_name`` on '/' into ``category1..3`` (missing levels
      become '') and replace '/' by spaces in ``category_name``;
    * add ``nb_words_item_description`` divided by ``max_text_length``;
    * clean ``brand_name``, ``name`` and ``item_description`` with
      ``clean_str`` in parallel;
    * fill missing brands with the first known brand found in the name,
      trying 4-, 3-, 2- then 1-word sequences, else '';
    * label-encode the brand and category columns (fit when ``train`` is
      true, otherwise reuse the encoders stored in ``labels_dict``);
    * build ``name_old``, ``brand_cat``, the prefixed ``name`` and
      ``name_desc``, and divide ``item_condition_id`` by 5.

    Relies on module-level ``brand_names``, ``labels_dict``,
    ``max_text_length``, ``ngrams`` and ``LabelEncoder``.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame with at least ``name``, ``item_description``, ``category_name``,
        ``brand_name`` and ``item_condition_id`` columns.
    train : bool, default True
        Fit new label encoders (True) or apply the stored ones (False).

    Returns
    -------
    pandas.DataFrame
        The same ``df_data`` object, with the columns above added/replaced.
    """
    print ('Prepare data....')

    def fill_brand_name(x):
        """Return the first known brand found in ``x``, longest n-gram first."""
        # Non-string names (e.g. NaN) cannot contain a brand. An explicit
        # check replaces the bare ``except`` so that real errors, such as a
        # missing ``brand_names``, are no longer silently turned into NaN.
        if not isinstance(x, str):
            return np.nan
        tokens = x.split(' ')
        for n in (4, 3, 2, 1):
            for gram in ngrams(tokens, n):
                candidate = ' '.join(gram)
                if candidate in brand_names:
                    return candidate
        return np.nan

    def category_level(category, level):
        """Return the stripped ``level``-th '/'-separated part, '' if absent."""
        parts = category.split('/')
        return parts[level].strip() if level < len(parts) else ''

    # Plain assignment instead of chained fillna(inplace=True), which does not
    # update the frame once pandas Copy-on-Write is active.
    df_data['name'] = df_data['name'].fillna('')
    df_data['item_description'] = (
        df_data['item_description'].fillna('')
        .str.replace('No description yet', '', regex=False)
    )

    # Create 3 category levels, then replace '/' by spaces in category_name.
    df_data['category_name'] = df_data['category_name'].fillna('//')
    for level in range(3):
        df_data['category%d' % (level + 1)] = df_data['category_name'].apply(
            lambda x, level=level: category_level(x, level)
        )
    df_data['category_name'] = df_data['category_name'].apply(
        lambda x: ' '.join(x.split('/')).strip()
    )

    # The word count is taken on the raw description, before it is cleaned.
    create_count_features(df_data)
    df_data['nb_words_item_description'] = (
        df_data['nb_words_item_description'] / max_text_length
    )

    for text_col in ('brand_name', 'name', 'item_description'):
        df_data[text_col] = parallelize_dataframe(df_data[text_col], clean_str_df)

    missing_brand = df_data['brand_name'].isnull()
    df_data.loc[missing_brand, 'brand_name'] = (
        df_data.loc[missing_brand, 'name'].apply(fill_brand_name)
    )
    df_data['brand_name'] = df_data['brand_name'].fillna('')

    for feat in ['brand_name', 'category_name', 'category1', 'category2', 'category3']:
        if train:
            seen_values = df_data[feat].unique()
            encoder = LabelEncoder()
            df_data[feat] = encoder.fit_transform(df_data[feat]).astype(np.uint16)
            labels_dict[feat] = (encoder, seen_values)
        else:
            encoder, seen_values = labels_dict[feat]
            # Values never seen at fit time are mapped to ''.
            # NOTE(review): this assumes '' was itself among the fitted
            # labels; otherwise transform() will raise -- confirm.
            df_data.loc[~df_data[feat].isin(seen_values), feat] = ''
            df_data[feat] = encoder.transform(df_data[feat]).astype(np.uint16)
    df_data['name_old'] = df_data['name'].copy()

    # Built from the encoded integer ids, e.g. "cat1_3 cat2_17 cat3_240 brand_55".
    df_data['brand_cat'] = (
        'cat1_' + df_data['category1'].astype(str) + ' ' +
        'cat2_' + df_data['category2'].astype(str) + ' ' +
        'cat3_' + df_data['category3'].astype(str) + ' ' +
        'brand_' + df_data['brand_name'].astype(str)
    )

    df_data['name'] = df_data['brand_cat'] + ' ' + df_data['name']

    # Name followed by the first five words of the description.
    df_data['name_desc'] = (
        df_data['name'] + ' ' +
        df_data['item_description'].apply(lambda x: ' '.join(x.split()[:5]))
    )

    df_data['item_condition_id'] = df_data['item_condition_id'] / 5.
    return df_data

In [None]:
def word_count(text, dc):
    """Increment ``dc[token]`` once for every distinct token of ``text``.

    Tokens are obtained by splitting on single spaces, so consecutive spaces
    produce an empty-string token. ``dc`` is updated in place and must accept
    ``dc[token] += 1`` for unseen tokens (e.g. ``defaultdict(int)``).
    """
    distinct_tokens = set(text.split(' '))
    for token in distinct_tokens:
        dc[token] += 1

def remove_low_freq(text, dc):
    """Return ``text`` reduced to the words that are present in ``dc``.

    Words are split on any whitespace, kept in their original order
    (duplicates included) and re-joined with single spaces.
    """
    kept_words = []
    for word in text.split():
        if word in dc:
            kept_words.append(word)
    return ' '.join(kept_words)
    
def create_bigrams(text):
    """Build a space-separated string of word-pair tokens for ``text``.

    Words are split on whitespace, stop words (``stop_words``) are dropped,
    the rest is lemmatized with ``wordnet_lemmatizer`` and reduced to a sorted
    set of distinct lemmas. For every unordered pair ``(a, b)`` of lemmas one
    token is produced:

    * ``b + a`` if that concatenation is a key of ``word_count_dict_one``;
    * otherwise ``a + b`` if that one is a key;
    * otherwise ``a + '___' + b``.

    Parameters
    ----------
    text : str or any
        Input text. Anything that is not a ``str`` (e.g. NaN) gives ``' '``.

    Returns
    -------
    str
        The pair tokens joined by single spaces ('' when fewer than two
        distinct lemmas remain), or ``' '`` for non-string input.
    """
    # Explicit check instead of a bare ``except``: missing text still maps to
    # ' ', but a missing lemmatizer or dictionary now surfaces as an error
    # instead of silently producing empty features for every row.
    if not isinstance(text, str):
        return ' '
    lemmas = sorted({
        wordnet_lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words
    })
    bigrams = []
    for first, second in combinations(lemmas, 2):
        forward = first + second
        backward = second + first
        # The reversed concatenation takes precedence when both are known.
        if backward in word_count_dict_one:
            bigrams.append(backward)
        elif forward in word_count_dict_one:
            bigrams.append(forward)
        else:
            bigrams.append(first + '___' + second)
    return ' '.join(bigrams)
        
def create_bigrams_df(df):
    """Return ``df`` with ``create_bigrams`` applied to each of its elements."""
    bigram_texts = df.apply(create_bigrams)
    return bigram_texts