In [1]:
import lzma
import random
import numpy as np
import implicit
from collections import defaultdict
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import math

In [27]:
test_mode = True

In [28]:
if test_mode:
    filename = "train.tsv"
else:
    filename = "../input/unzipped/train.tsv"

In [3]:
allData = []

In [4]:
with open(filename, encoding='utf8') as infile:
    for row in infile:
        allData.append(row)

In [5]:
# remove column labels
column_labels = allData[0].split('\t')

In [6]:
allData = allData[1:]

In [8]:
# get data
data = []
for row in allData:
    arr = row.split('\t')
    features = []
    features.append(float(arr[0]))  # train_id
    features.append(arr[1])         # name
    features.append(float(arr[2]))  # item_condition_id
    features.append(arr[3])         # category_name (removing)
    features.append(arr[4])         # brand_name
    features.append(float(arr[5]))  # price
    features.append(float(arr[6]))  # shipping
    features.append(arr[7])         # item_description
    data.append(features)

In [9]:
# map each category to an index
"""
Format of categories:
{0: '', 1: 'Women', 2: 'Men', 3: 'Beauty', 4: 'Kids'....

"""
(m0, m1, m2, m3, m4) = (0,0,0,0,0)
categories = [set(),  set(),  set(),  set(),  set()]
for row in data:
    tmp = row[3].split('/')
    for i in range(len(tmp)):
        categories[i].add(tmp[i])
categories = list(categories)
for category in range(len(categories)):
    s = {}
    i = 0
    for col in categories[category]:
        s[col] = i
        i += 1
    categories[category] = s

In [15]:
# create new dataset
total_features = []
total_labels = []
for row in data:
    tmp = row[3].split('/')
    new_feature = []
    # append item condition, shipping
    new_feature += [row[2], row[6]]
    # append categorical ids
    for category in range(5):
        if category >= len(tmp):
            new_feature.append(-1)
        else:
            new_feature.append(categories[category][tmp[category]])
    # append new features
    total_features.append(new_feature)
    # append labels
    total_labels.append(row[5])

In [10]:
train_features = total_features[len(total_features)//2:]
train_labels = total_labels[len(total_labels)//2:]

valid_features = total_features[len(total_features)//2:]
valid_labels = total_labels[len(total_labels)//2:]

In [12]:
clf = GradientBoostingRegressor(n_estimators=100)
clf = clf.fit(valid_features, valid_labels)

In [14]:
predictions = clf.predict(valid_features)
rmsle = 0
bad = 0
for i in range(len(predictions)):
    if predictions[i]+1 < 0:
        predictions[i] = 0.0
        bad += 1
    rmsle += (math.log(predictions[i]+1.0) - math.log(valid_labels[i]+1.0)) ** 2
rmsle /= len(predictions)
print(math.sqrt(rmsle))
print(bad)

0.7209151674227384
3


In [16]:
clf = GradientBoostingRegressor(n_estimators=100)
clf = clf.fit(total_features, total_labels)

In [None]:
if test_mode:
    filename = "test.tsv"
else:
    filename = "../input/unzipped/test.tsv"

In [18]:
allTestData = []
with open(filename, encoding='utf8') as infile:
    for row in infile:
        allTestData.append(row)
# remove column labels
column_labels = allTestData[0].split('\t')
allTestData = allTestData[1:]

In [21]:
# get data
testData = []
for row in allTestData:
    arr = row.split('\t')
    features = []
    features.append(float(arr[0]))  # train_id
    features.append(arr[1])         # name
    features.append(float(arr[2]))  # item_condition_id
    features.append(arr[3])         # category_name (removing)
    features.append(arr[4])         # brand_name
#     features.append(float(arr[5]))  # price
    features.append(float(arr[5]))  # shipping
    features.append(arr[6])         # item_description
    testData.append(features)

In [22]:
(m0, m1, m2, m3, m4) = (0,0,0,0,0)
categories = [set(),  set(),  set(),  set(),  set()]
for row in testData:
    tmp = row[3].split('/')
    for i in range(len(tmp)):
        categories[i].add(tmp[i])
categories = list(categories)
for category in range(len(categories)):
    s = {}
    i = 0
    for col in categories[category]:
        s[col] = i
        i += 1
    categories[category] = s

In [23]:
# get test features and labels
test_features = []
for row in testData:
    tmp = row[3].split('/')
    new_feature = []
    # append item condition, shipping
    new_feature += [row[2], row[5]]
    # append categorical ids
    for category in range(5):
        if category >= len(tmp):
            new_feature.append(-1)
        else:
            new_feature.append(categories[category][tmp[category]])
    # append new features
    test_features.append(new_feature)

In [24]:
predictions = clf.predict(test_features)

In [25]:
for i in range(10):
    testid = int(testData[i][0])
    pred = predictions[i]
    print(testid, pred)

0 27.6127500819
1 19.427225325
2 29.1532813925
3 26.414523083
4 19.2845774181
5 13.310701571
6 28.8131201961
7 27.045861298
8 26.4258121404
9 16.0596094374


In [26]:
with open("sample_submission.csv", 'w') as predictions_file:
    predictions_file.write("test_id,price")
    for i in range(len(testData)):
        testid = int(testData[i][0])
        pred = predictions[i]
        output = str(testid)+","+str(pred)+"\n"
        predictions_file.write(output)