In [1]:
# Python libraries
import numpy as np
import scipy as sp
import pandas as pd
import pandas_profiling

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import os
import sys
import time
import requests
import datetime
import math

import missingno as msno

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer

from scipy.sparse import csr_matrix, hstack

In [2]:
# Read the tab-separated train and test files from the local data/ folder.
df_train = pd.read_csv("data/train.tsv", sep="\t")
df_test = pd.read_csv("data/test.tsv", sep="\t")

In [5]:
# Stack the train rows on top of the test rows so the vectorizers fitted on
# df_input share one vocabulary. nrow_train records how many leading rows
# belong to the training set, so the stacked matrix can be split by position.
nrow_train = df_train.shape[0]

# `axis` is passed by keyword: since pandas 2.0 every pd.concat argument
# after `objs` is keyword-only, so a positional `0` raises TypeError.
df_input = pd.concat([df_train, df_test], axis=0)

In [6]:
# Replace missing values in the text fields with the placeholder 'other',
# in both the combined frame and the test frame.
#
# The filled column is assigned back rather than calling
# `frame[col].fillna(..., inplace=True)`: that chained form operates on a
# temporary Series, which triggers a warning on recent pandas 2.x and leaves
# the frame unchanged once Copy-on-Write is enabled.
for frame in (df_input, df_test):
    for text_col in ('category_name', 'brand_name', 'item_description'):
        frame[text_col] = frame[text_col].fillna(value='other')

In [7]:
# Convert the same three fields to pandas 'category' dtype, first in the
# combined frame and then in the test frame.
for frame in (df_input, df_test):
    for cat_col in ('category_name', 'brand_name', 'item_condition_id'):
        frame[cat_col] = frame[cat_col].astype('category')

In [8]:
def func_count_vectorizer(df_in, var_col):
    """Return the token-count matrix for one column of ``df_in``.

    A fresh ``CountVectorizer`` with ``min_df=10`` is fitted on
    ``df_in[var_col]`` and the result of ``fit_transform`` is returned.
    The fitted vectorizer itself is not kept.
    """
    text_values = df_in[var_col]
    return CountVectorizer(min_df=10).fit_transform(text_values)

In [9]:
# Token-count features for each text field of the combined train+test frame,
# built in the order: name, category_name, brand_name, item_description.
(
    mat_input_name_cv,
    mat_input_category_name_cv,
    mat_input_brand_name_cv,
    mat_input_item_desc_cv,
) = [
    func_count_vectorizer(df_input, text_col)
    for text_col in ("name", "category_name", "brand_name", "item_description")
]

In [10]:
def func_tfidf_vectorizer(df_in, var_col):
    """Return the TF-IDF matrix for one column of ``df_in``.

    A fresh ``TfidfVectorizer`` (at most 55000 features, 1- to 3-grams,
    English stop words removed) is fitted on ``df_in[var_col]`` and the
    result of ``fit_transform`` is returned. The fitted vectorizer itself
    is not kept.
    """
    tfidf_options = dict(max_features=55000, ngram_range=(1, 3), stop_words="english")
    text_values = df_in[var_col]
    return TfidfVectorizer(**tfidf_options).fit_transform(text_values)

In [11]:
# TF-IDF features for each text field of the combined train+test frame.
# These must come from func_tfidf_vectorizer: calling func_count_vectorizer
# here (as before) only reproduced the *_cv count matrices, so the stacked
# feature matrix contained every count block twice and no TF-IDF features.
mat_input_name_tfidf = func_tfidf_vectorizer(df_input, "name")
mat_input_category_name_tfidf = func_tfidf_vectorizer(df_input, "category_name")
mat_input_brand_name_tfidf = func_tfidf_vectorizer(df_input, "brand_name")
mat_input_item_desc_tfidf = func_tfidf_vectorizer(df_input, "item_description")

In [12]:
# Concatenate all feature blocks column-wise (count blocks first, then the
# *_tfidf blocks) and convert the result to CSR so it can be row-sliced.
feature_blocks = [
    mat_input_name_cv,
    mat_input_category_name_cv,
    mat_input_brand_name_cv,
    mat_input_item_desc_cv,
    mat_input_name_tfidf,
    mat_input_category_name_tfidf,
    mat_input_brand_name_tfidf,
    mat_input_item_desc_tfidf,
]
mat_input_stack = sp.sparse.hstack(feature_blocks).tocsr()

In [13]:
# The stacked matrix keeps the train-then-test row order of df_input, so the
# first nrow_train rows are the training rows and the remainder the test rows.
X = mat_input_stack
X_train, X_test = X[:nrow_train], X[nrow_train:]

# The regression target is log(1 + price) of the training prices.
y_train = df_train["price"].pipe(np.log1p)

In [14]:
# Ridge regression using the LSQR solver and no intercept term; every other
# hyper-parameter is left at its default.
model_2 = Ridge(fit_intercept=False, solver="lsqr")

In [15]:
# Alternative estimator, currently disabled:
# model_2 = RandomForestRegressor(max_depth=15, random_state=0, n_jobs=-1)
# Fit on the training rows; the fitted estimator is the cell's displayed value.
model_2.fit(X=X_train, y=y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=False, max_iter=None,
   normalize=False, random_state=None, solver='lsqr', tol=0.001)

In [16]:
# Predict for the test rows; the values are on the same scale as y_train.
y_predict = model_2.predict(X=X_test)

In [18]:
# The model was fitted on log1p(price), so y_predict is on that log scale and
# must be mapped back with expm1 before being written out as a price.
# Clipping the log prediction at 0 first guarantees a non-negative price;
# this replaces the previous abs() applied to the raw log-scale values.
price_predicted = np.expm1(np.clip(y_predict, 0, None))

# Build the submission positionally (one prediction per test row, in order);
# a length mismatch between ids and predictions raises instead of silently
# producing NaNs through index alignment.
df_model_2_submission = pd.DataFrame({
    'test_id': df_test['test_id'].to_numpy(),
    'price': price_predicted,
})

# Create the output folder if needed so to_csv does not fail on a fresh checkout.
os.makedirs("submissions", exist_ok=True)
df_model_2_submission.to_csv("submissions/model_2_submission_2.csv", index=False)