In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import seaborn
import nltk
import string
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
uri_re = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'

def stripTagsAndUris(x):
    if x:
        # BeautifulSoup on content
        soup = BeautifulSoup(x, "html.parser")
        # Stripping all <code> tags with their content if any
        if soup.code:
            soup.code.decompose()
        # Get all the text out of the html
        text =  soup.get_text()
        # Returning text stripping out all uris
        return re.sub(uri_re, "", text)
    else:
        return ""

def removePunctuation(x):
    # Lowercasing all words
    x = x.lower()
    # Removing non ASCII chars
    x = re.sub(r'[^\x00-\x7f]',r' ',x)
    # Removing (replacing with empty spaces actually) all the punctuations
    return re.sub("["+string.punctuation+"]", " ", x)
lemmatizer = nltk.stem.WordNetLemmatizer()
def removeStopword(sentence):
    return ' '.join(set([lemmatizer.lemmatize(w.lower()) for w in nltk.wordpunct_tokenize(sentence) 
 if ((w not in stopwords.words('english')) and (w not in string.punctuation))]))
def getSentenceLength(sen):
    return len(sen.split())
def normalize_df(filename):
    columns = ["country","sku_id","title","category_lvl_1","category_lvl_2","category_lvl_3","short_description",
           "price","product_type"]
    dat = pd.read_csv(filename,names=columns)
    dat['short_desc_strip'] = dat['short_description'].fillna("").map(stripTagsAndUris).map(removeStopword)
    dat['title'] = dat['title'].fillna("").map(stripTagsAndUris).map(removeStopword)
    dat['len_title'] = dat['title'].map(getSentenceLength)
    dat['len_desc'] = dat['short_desc_strip'].map(getSentenceLength)
    return dat.drop(['sku_id','short_description'],axis=1)

In [3]:
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [4]:
columns = ["country","sku_id","title","category_lvl_1","category_lvl_2","category_lvl_3","short_description",
           "price","product_type"]
# dat = pd.read_csv("data/training/data_train.csv",names=columns)
df_train = normalize_df("data/training/data_train.csv")
df_val = normalize_df("data/validation/data_valid.csv")
clarity_lbl = pd.read_csv("data/training/clarity_train.labels",names=["label"])
concise_lbl = pd.read_csv("data/training/conciseness_train.labels",names=["label"])

  '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)


In [5]:
df_total = df_train.append(df_val)

In [6]:
columns_to_categorical = ['country','category_lvl_1','category_lvl_2','category_lvl_3','product_type']
df_total_encoder = MultiColumnLabelEncoder(columns=columns_to_categorical).fit_transform(df_total)

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


In [7]:
df_total_encoder.to_csv('data/total_df_with_categorical_and_normalize.csv',index=0,encoding='utf-8')

In [8]:
df_total_encoder[len(df_train):].to_csv("data/validation/data_valid_normalize.csv",index=0,encoding='utf-8')

In [9]:
df_train_norm = df_total_encoder[:len(df_train)]

In [10]:
df_train_norm['clarity'] = clarity_lbl
df_train_norm['concise'] = concise_lbl

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [11]:
df_train_norm.to_csv("data/training/data_train_normalize.csv",index=0,encoding='utf-8')

In [12]:
X_train,X_val,y_train,y_test = train_test_split(df_train_norm,clarity_lbl,stratify=clarity_lbl,random_state=4111)

In [13]:
X_train.to_csv("train_cat.csv",index=None,encoding='utf-8')
X_val.to_csv("val_cat.csv",index=None,encoding='utf-8')