In [9]:
import pandas as pd
import math
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor


import nltk
import xlrd
import string
import nltk.corpus
from nltk.corpus import wordnet

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import re
nltk.download('stopwords')
import nltk.corpus as corpus
nltk.download('vader_lexicon')
import re

stopwords = corpus.stopwords.words("english")

import ast 
from statistics import mean
import itertools
from itertools import chain
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/apple/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
def get_sentiment(des):
    
    des = list(des)
    
    def get_adj_and_adv(text):
        """
        This functionis to firstly tokenize the words and then select
        the words that is tagged as adverbe and adjective
        """
        text_lower = text.lower()
        text_token = word_tokenize(text_lower)
        result_tags = nltk.pos_tag(text_token)
    
        words = [(word) for word, tag in result_tags if tag in ('JJ','RB')]
        
        return (words)
    
    def get_noun(text):
        """
        This functino is to tokenize the words and select the words
        that is tagged as noun
        """
        text_lower = text.lower()
        text_token = word_tokenize(text_lower)
        result_tags = nltk.pos_tag(text_token)
    
        words = [(word) for word, tag in result_tags if tag in ('NN')]
        return (words)
    
    def nltk_sentiment(sentence):
        """
        This function is to process the sentiment on each tokenized sentences
        and then generate a sentiment value for each sentence
        """
    
        nltk_sentiment = SentimentIntensityAnalyzer()
        score = nltk_sentiment.polarity_scores(sentence)
        return score
    
    sen_tok = [sent_tokenize(des[i]) for i in range(len(des))]

    sen_tok_total = [''.join(sen_tok[i]) for i in range(len(sen_tok))]

    x = [nltk_sentiment(sen_tok_total[i]) for i in range(len(sen_tok_total))]

    x1 = [(list(x[i].items())[-1][1]) for i in range(len(x)) ]
    
    return x1

In [143]:
def missing_value(df):
    """
    Handeling any missing value in the dataframe.
    """
    df["item_description"][df['item_description'] == "No description yet"] = "None"
    return df.fillna("None")

def split_label(cat):
    """
    This function splits the category into three sub categories.
    """
    cat_split = cat.str.split("/",n = 2,expand = True)
    cat_split = cat_split.rename(index = str,columns = {0:'cat1',1:'cat2',2:'cat3'})
    cat_split = cat_split.fillna("None")
    
    return cat_split


def variable_process(df):

    sub_cats = split_label(df["category_name"])
    
    columns = (df[["shipping"]].values, df[["brand_name"]].values, df[["item_condition_id"]].values, 
               sub_cats.values)
    columns_names = ("shipping", "brand_name", "item_condition", "cat1", "cat2", "cat3")


    return pd.DataFrame(np.concatenate(columns, axis = 1), columns = columns_names)
    
## !!! encode() only for train 
def encode(sub_cat_train):
    """
    This function one hot encode category variables for the training set and the testing set.
    """
    
    from sklearn.preprocessing import OneHotEncoder
    
    global onehotencoder
    
    onehotencoder = OneHotEncoder(handle_unknown='ignore')
    
    one_hot_train = onehotencoder.fit_transform(sub_cat_train.values).toarray()

    return one_hot_train

def linear_encoder(sub_cat):
    le = preprocessing.LabelEncoder()
    label_cat = le.fit_transform(sub_cat)
    return label_cat
    

def get_length_of_des(des):
    #des = list(df['item_description'])
    text_token = [word_tokenize(x) for x in des]
    length = [len(t) for t in text_token]
    
    return length

def price(df):
    price = df['price'].values
    return np.log(price+1)
    

In [86]:
# Return: the input for modeling
# !!! data_preparation_train() Only for TRAIN DATA

def data_preparation_train(df):

    df = missing_value(df)
    df_label = split_label(df["category_name"])
    df1 = variable_process(df)
    df2 = encode(df1)

    df["sent"] = get_sentiment(df["item_description"])
    df2 = np.append(df2, df["sent"].values.reshape(-1,1), axis=1)
    
    df["length"] = get_length_of_des(df["item_description"])
    df2 = np.append(df2, df["length"].values.reshape(-1,1), axis=1)

    return df2


# data_preparation_test only for test 

def data_preparation_test(df):

    df = missing_value(df)
    df_label = split_label(df["category_name"])
    df1 = variable_process(df)
    df2 = onehotencoder.transform(df1).toarray()

    df["sent"] = get_sentiment(df["item_description"])
    df2 = np.append(df2, df["sent"].values.reshape(-1,1), axis=1)
    
    df["length"] = get_length_of_des(df["item_description"])
    df2 = np.append(df2, df["length"].values.reshape(-1,1), axis=1)

    return df2


In [77]:
def data_preparation_linear_train(df):
    df = missing_value(df)
    df_label = split_label(df["category_name"])
    df1 = variable_process(df)
    df2 = linear_encoder(df1)

    df["sent"] = get_sentiment(df["item_description"])
    df2 = np.append(df2, df["sent"].values.reshape(-1,1), axis=1)
    
    df["length"] = get_length_of_des(df["item_description"])
    df2 = np.append(df2, df["length"].values.reshape(-1,1), axis=1)

    return df2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [5]:
df = pd.read_csv("data/train.tsv", delimiter = "\t", encoding = "utf-8", index_col = False)
try_df = df.iloc[:8000, :]

In [87]:
X_train, X_test, y_train, y_test = train_test_split(try_df, try_df["price"], test_size=0.3,random_state = 0)

In [88]:
X_train = data_preparation_train(X_train)
X_test = data_preparation_test(X_test)

y_train = np.log(y_train + 1)
y_test = np.log(y_test + 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.

In [116]:
def models(X_train, X_test, y_train, y_test):
    
    # RANDOM FOREST
    rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
    rf.fit(X_train, y_train)

    # Lasso
    clf = linear_model.Lasso(alpha=0.1)
    clf.fit(X_train, y_train)

    # KNN
    neigh = KNeighborsRegressor(n_neighbors=2)
    neigh.fit(X_train, y_train) 
    
    # predictions
    rf_pred = rf.predict(X_test)
    cl_pred = clf.predict(X_test)
    knn_pred = neigh.predict(X_test)

    dic = {
        "rf_pred" : rf_pred,
        "cl_pred" : cl_pred,
        "knn_pred": knn_pred,
        "y_test" : y_test
    }
    
    df = pd.DataFrame(dic)
    
    df["mean"] = df[["rf_pred", "cl_pred", "knn_pred"]].mean(axis = 1)
    
    return df

In [117]:
predictions = models(X_train, X_test, y_train, y_test)

In [131]:
def mse(preds, actual = y_test):
    return np.sqrt(mean_squared_error(actual, preds))

In [140]:
predictions[["rf_pred", "cl_pred", "knn_pred", "mean"]].apply(lambda x: mse(x))

rf_pred     0.628738
cl_pred     0.740228
knn_pred    0.771328
mean        0.652519
dtype: float64