In [1]:
import os, argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
from collections import Counter
import text_utils

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

import fastai # pip install fastai
from fastai import *
from fastai.text import * 

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# One-off NLTK resource downloads; uncomment on a fresh environment.
# nltk.download('stopwords')
# nltk.download('punkt')
# This line rebinds the name `stopwords` from the NLTK corpus reader to a plain
# list of English stop words. Going through `nltk.corpus` keeps the cell
# re-runnable even when the import that created `stopwords` is not re-executed.
stopwords = nltk.corpus.stopwords.words('english')

# Single seed shared by the data split and the global RNGs below.
seed = 100
# Seed the NumPy and PyTorch global RNGs so a "Restart & Run All" is as
# repeatable as possible. `torch` is in scope via the fastai star imports
# (the notebook already calls `torch.save` without importing torch itself).
np.random.seed(seed)
torch.manual_seed(seed)

data_path = './data/'    # directory holding the train_*/test_* CSV files
models_dir = './models'


# Mobile

## LM

In [74]:
# Read the raw training CSV for the mobile category.
train_csv_path = os.path.join(data_path, 'train_mobile_image.csv')
df = pd.read_csv(train_csv_path)

# Text cleaning: overwrite the titles with the output of the project helper
# (the exact cleaning rules live in text_utils).
cleaned_titles = text_utils.clean_text(df['title'], stopwords)
df['title'] = cleaned_titles

# data_split returns six objects; only the first two are used in this notebook.
train, val, _, _, _, _ = text_utils.data_split(df, seed)

# Keep just the label and the text, label column first.
model_cols = ['Category', 'title']
df_trn = train[model_cols]
df_val = val[model_cols]

In [75]:
df_trn.head()

Unnamed: 0,Category,title
76283,31,iphone gb grey second kondisi fullset
62667,35,google pixel versi gb terkunci jaringan nego
35558,35,promo cuci gudang beli gratiss ac polytron pac...
20280,31,iphone gb gold
145882,32,beli gratis wa promo penghabisan stock brg sam...


In [76]:
# Language model data
data_lm = TextLMDataBunch.from_df(
    train_df = df_trn, valid_df = df_val, 
    path = "", bs=32
    ) # reduced bs from 64->32

In [77]:
learn = language_model_learner(data_lm, arch=AWD_LSTM, drop_mult=0.7) # AWD_LSTM requires cuda

  with open(fpath, 'r') as yaml_file: return yaml.load(yaml_file)


In [78]:
learn.fit_one_cycle(1, 1e-1)

epoch,train_loss,valid_loss,accuracy,time
0,3.837586,3.458266,0.34167,08:47


In [79]:
learn.unfreeze()
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.149425,2.994756,0.390078,10:52


In [80]:
learn.save_encoder('ft_enc_{}'.format('mobile_image'))

## CL

In [129]:
test_df = pd.read_csv(os.path.join(data_path, 'test_mobile_image.csv'))
df_test = test_df['title']

In [82]:
# Classifier model data.
# Reuse the language-model vocabulary so token ids match the fine-tuned encoder.
# The text and label columns are named explicitly instead of relying on the
# positional defaults (label_cols=0, text_cols=1), which would silently pick
# the wrong columns if the frame's column order ever changed.
data_clas = TextClasDataBunch.from_df(
    path = "", train_df = df_trn, valid_df = df_val,
    text_cols = 'title', label_cols = 'Category',
    vocab=data_lm.train_ds.vocab, bs=32
    ) # reduced bs from 64->32

In [83]:
data_clas.add_test(items=df_test)

In [84]:
iter_dl = iter(data_clas.test_dl)
_ = next(iter_dl)
x,y = next(iter_dl)

In [85]:
x.shape

torch.Size([32, 19])

In [86]:
y # all zeros because the test set is unlabelled (placeholder targets, not predictions)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')

In [87]:
learn = text_classifier_learner(data_clas, arch=AWD_LSTM, drop_mult=0.7)

  with open(fpath, 'r') as yaml_file: return yaml.load(yaml_file)


In [88]:
learn.load_encoder('ft_enc_{}'.format('mobile_image'))

In [89]:
learn.fit_one_cycle(1, 5e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.879396,0.771298,0.780702,08:43


In [90]:
learn.unfreeze()
# Fine-tune all layer groups with discriminative learning rates.
learn.fit_one_cycle(10, slice(2e-3/100, 2e-3)) # slice(start, end): first layer group trains at 2e-3/100, last at 2e-3, groups in between are spaced geometrically

epoch,train_loss,valid_loss,accuracy,time
0,0.833954,0.757712,0.783759,27:37
1,0.870575,0.710987,0.788062,27:26
2,0.831711,0.675296,0.797168,27:34
3,0.73557,0.658302,0.800755,27:22
4,0.738424,0.643797,0.805526,27:23
5,0.75123,0.630209,0.806805,27:44
6,0.711001,0.626456,0.810173,27:47
7,0.684061,0.614768,0.811576,28:07
8,0.717806,0.614875,0.809019,29:28
9,0.66714,0.612397,0.812106,35:51


1.545378	1.005613	0.752448

In [91]:
# save_encoder() persists only the encoder weights; kept for backward compatibility.
learn.save_encoder('ft_classifier_{}'.format('mobile_image'))
# Also checkpoint the complete classifier (encoder + classification head) so the
# trained model can be restored with learn.load(...) after a kernel restart.
learn.save('ft_classifier_{}_full'.format('mobile_image'))

In [92]:
learn.show_results()

text,target,prediction
xxbos asus zenfone max zc kl gb gb black new segel bnib g u info order cara pemesanan chat via wa,35,43
xxbos minat chat aja lewat wa kami harga promo bli bonus unit new oppo f ram gb rom gb free ongkir,41,41
xxbos laptop xxunk p w k uhd x xxunk corei ram gb untuk info lebih lanjut chat via wa kami,35,34
xxbos gome k iris recognition gb ram gb rom helio p mtk ghz octa core inch fhd g lte,34,35
xxbos spesial promo di bulan januari sale cuci gudang new sony xperia z big second lte g ram gb,33,33


In [93]:
# get predictions for test
preds, labels = learn.get_preds(DatasetType.Test, ordered=True)

In [94]:
preds_mobile = preds

In [103]:
preds_mobile.shape

torch.Size([40417, 27])

In [104]:
# Persist the raw class probabilities for the mobile test set.
# The original filename ended in a stray dot ('.pt.'); use a clean '.pt' extension.
torch.save(preds_mobile, 'preds_mobile_image.pt')

In [121]:
# Index of the most probable class for every test row. Use the tensor's own
# argmax rather than routing a torch tensor through NumPy's dispatch.
predictions = preds_mobile.argmax(dim=1)

In [171]:
classes = data_clas.classes
print(classes)

[31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57]


In [177]:
output_csv_path = 'submission_mobile_image.csv'

# Materialise ids and predicted labels as plain lists. Taking the ids
# positionally avoids label-based Series lookups (test_df['itemid'][i]), which
# break on a non-default index, and the row count now comes from `predictions`
# itself instead of the unrelated global `preds`.
item_ids = test_df['itemid'].tolist()
pred_classes = [classes[class_idx] for class_idx in predictions.tolist()]
if len(item_ids) != len(pred_classes):
    raise ValueError('Got {} predictions for {} test rows'.format(len(pred_classes), len(item_ids)))

# Open with 'w' rather than 'a' so re-running the cell overwrites the file
# instead of appending duplicate rows. No header row is written, as before.
with open(output_csv_path, 'w') as f:
    f.writelines(
        '{},{}\n'.format(item_id, pred_class)
        for item_id, pred_class in zip(item_ids, pred_classes)
    )

# Beauty

## LM

In [2]:
# Read the raw training CSV for the beauty category.
train_csv_path = os.path.join(data_path, 'train_beauty_image.csv')
df = pd.read_csv(train_csv_path)

# Text cleaning: overwrite the titles with the output of the project helper
# (the exact cleaning rules live in text_utils).
cleaned_titles = text_utils.clean_text(df['title'], stopwords)
df['title'] = cleaned_titles

# data_split returns six objects; only the first two are used in this notebook.
train, val, _, _, _, _ = text_utils.data_split(df, seed)

# Keep just the label and the text, label column first.
model_cols = ['Category', 'title']
df_trn = train[model_cols]
df_val = val[model_cols]

In [3]:
df_trn.head()

Unnamed: 0,Category,title
167343,3,new wardah luminous face powder refill
158212,0,promo milani everyday eyes powder eyeshadow co...
77775,4,laneige white plus renew capsule sleeping ball
250035,3,bestt buy laneige bb cushion whitening compact...
81603,9,milani prime perfection hydrating pore minimiz...


In [4]:
# Language model data
data_lm = TextLMDataBunch.from_df(
    train_df = df_trn, valid_df = df_val, 
    path = "", bs=32
    ) # reduced bs from 64->32

In [5]:
learn = language_model_learner(data_lm, arch=AWD_LSTM, drop_mult=0.7) # AWD_LSTM requires cuda

  with open(fpath, 'r') as yaml_file: return yaml.load(yaml_file)


In [6]:
learn.fit_one_cycle(1, 1e-1)

epoch,train_loss,valid_loss,accuracy,time
0,4.261026,3.752662,0.355206,20:03


In [7]:
learn.unfreeze()
learn.fit_one_cycle(2, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.251658,3.046092,0.433086,31:29
1,2.907561,2.778002,0.468673,20:06


In [8]:
learn.save_encoder('ft_enc_{}'.format('beauty_image'))

## CL

In [5]:
test_df = pd.read_csv(os.path.join(data_path, 'test_beauty_image.csv'))
df_test = test_df['title']

In [6]:
# Classifier model data.
# Reuse the language-model vocabulary so token ids match the fine-tuned encoder.
# The text and label columns are named explicitly instead of relying on the
# positional defaults (label_cols=0, text_cols=1), which would silently pick
# the wrong columns if the frame's column order ever changed.
data_clas = TextClasDataBunch.from_df(
    path = "", train_df = df_trn, valid_df = df_val,
    text_cols = 'title', label_cols = 'Category',
    vocab=data_lm.train_ds.vocab, bs=32
    ) # reduced bs from 64->32

In [7]:
data_clas.add_test(items=df_test)

In [8]:
iter_dl = iter(data_clas.test_dl)
_ = next(iter_dl)
x,y = next(iter_dl)

In [9]:
x.shape

torch.Size([32, 18])

In [10]:
y # all zeros because the test set is unlabelled (placeholder targets, not predictions)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')

In [11]:
learn = text_classifier_learner(data_clas, arch=AWD_LSTM, drop_mult=0.7)

  with open(fpath, 'r') as yaml_file: return yaml.load(yaml_file)


In [12]:
learn.load_encoder('ft_enc_{}'.format('beauty_image'))

In [13]:
learn.fit_one_cycle(7, 5e-2)

epoch,train_loss,valid_loss,accuracy,time
0,1.152532,1.037893,0.688906,16:05
1,1.223688,1.125973,0.675716,14:42
2,1.26668,1.059511,0.685242,14:52
3,1.251037,1.032484,0.693791,14:49
4,1.173931,0.96429,0.71136,14:48
5,1.0874,0.912155,0.723956,14:56
6,1.093058,0.894343,0.730673,14:52


Validation accuracy is roughly 71% at this stage (approximate).

In [14]:
learn.save_encoder('pre_ft_classifier_{}'.format('beauty_image'))

In [20]:
#learn.fit_one_cycle(2, 5e-2)

epoch,train_loss,valid_loss,accuracy,time
0,1.308821,8.949591,0.680322,19:39
1,0.997253,3.26627,0.740478,32:59


In [15]:
learn.freeze_to(-2)
learn.fit_one_cycle(3, slice(5e-3/100, 5e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.937231,0.8406,0.749219,18:14
1,0.88944,0.907072,0.756582,18:18
2,0.825426,0.776028,0.762182,18:43


1.545378	1.005613	0.752448

In [16]:
# save_encoder() persists only the encoder weights; kept for backward compatibility.
learn.save_encoder('ft_classifier_{}'.format('beauty_image'))
# Also checkpoint the complete classifier (encoder + classification head) so the
# trained model can be restored with learn.load(...) after a kernel restart.
learn.save('ft_classifier_{}_full'.format('beauty_image'))

In [17]:
learn.show_results()

text,target,prediction
xxbos new product sk ii u fsk u fskii trial set fte rna power cream g anti aging free ongkir,4,4
xxbos buat apa kaya klw wajah dan kulit mu kusam skii sk crystal clear skin set xxunk xxunk kini ada,4,4
xxbos limited edition wet n wild mega glo dual ended contour stick light medium e big sale bulan ini,11,11
xxbos msi ion silver sbg pengganti p k praktis dn xxunk aman digunakan bayi org dewasa isi ml xxunk,4,4
xxbos big promo beli gratis sk ii sk skii loose powder uk g bedak tabur spf mohon cek info,4,3


In [18]:
# get predictions for test
preds, labels = learn.get_preds(DatasetType.Test, ordered=True)

In [19]:
preds_beauty = preds

In [20]:
preds_beauty.shape

torch.Size([76545, 17])

In [21]:
# Persist the raw class probabilities for the beauty test set.
# The original filename ended in a stray dot ('.pt.'); use a clean '.pt' extension.
torch.save(preds_beauty, 'preds_beauty_image.pt')

In [22]:
# Index of the most probable class for every test row. Use the tensor's own
# argmax rather than routing a torch tensor through NumPy's dispatch.
predictions = preds_beauty.argmax(dim=1)

In [23]:
classes = data_clas.classes
print(classes)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]


In [24]:
output_csv_path = 'submission_beauty_image.csv'

# Materialise ids and predicted labels as plain lists. Taking the ids
# positionally avoids label-based Series lookups (test_df['itemid'][i]), which
# break on a non-default index, and the row count now comes from `predictions`
# itself instead of the unrelated global `preds`.
item_ids = test_df['itemid'].tolist()
pred_classes = [classes[class_idx] for class_idx in predictions.tolist()]
if len(item_ids) != len(pred_classes):
    raise ValueError('Got {} predictions for {} test rows'.format(len(pred_classes), len(item_ids)))

# Open with 'w' rather than 'a' so re-running the cell overwrites the file
# instead of appending duplicate rows. No header row is written, as before.
with open(output_csv_path, 'w') as f:
    f.writelines(
        '{},{}\n'.format(item_id, pred_class)
        for item_id, pred_class in zip(item_ids, pred_classes)
    )

# Fashion

## LM

In [2]:
# Read the raw training CSV for the fashion category.
train_csv_path = os.path.join(data_path, 'train_fashion_image.csv')
df = pd.read_csv(train_csv_path)

# Text cleaning: overwrite the titles with the output of the project helper
# (the exact cleaning rules live in text_utils).
cleaned_titles = text_utils.clean_text(df['title'], stopwords)
df['title'] = cleaned_titles

# data_split returns six objects; only the first two are used in this notebook.
train, val, _, _, _, _ = text_utils.data_split(df, seed)

# Keep just the label and the text, label column first.
model_cols = ['Category', 'title']
df_trn = train[model_cols]
df_val = val[model_cols]

In [3]:
df_trn.head()

Unnamed: 0,Category,title
104295,18,dress midi slim sexy lengan panjang warna polos
71817,18,ioi long dress maxi model shoulder untuk pesta...
85031,18,dress mini lengan pendek casual bahan lace
100351,19,yiiya new neck simple knee length dinner bride...
126983,30,kemeja wanita lengan panjang model longgar mot...


In [4]:
# Language model data
data_lm = TextLMDataBunch.from_df(
    train_df = df_trn, valid_df = df_val, 
    path = "", bs=32
    ) # reduced bs from 64->32

In [5]:
learn = language_model_learner(data_lm, arch=AWD_LSTM, drop_mult=0.7) # AWD_LSTM requires cuda

  with open(fpath, 'r') as yaml_file: return yaml.load(yaml_file)


In [6]:
learn.fit_one_cycle(1, 1e-1)

epoch,train_loss,valid_loss,accuracy,time
0,3.906811,3.547845,0.328511,19:04


In [7]:
learn.save_encoder('pre_ft_enc_{}'.format('fashion_image'))

In [6]:
learn.load_encoder('pre_ft_enc_{}'.format('fashion_image'))

In [7]:
learn.freeze_to(-2)
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.332413,3.144544,0.382135,20:13


In [8]:
learn.save_encoder('ft_enc_{}'.format('fashion_image'))

## CL

In [9]:
test_df = pd.read_csv(os.path.join(data_path, 'test_fashion_image.csv'))
df_test = test_df['title']

In [10]:
# Classifier model data.
# Reuse the language-model vocabulary so token ids match the fine-tuned encoder.
# The text and label columns are named explicitly instead of relying on the
# positional defaults (label_cols=0, text_cols=1), which would silently pick
# the wrong columns if the frame's column order ever changed.
data_clas = TextClasDataBunch.from_df(
    path = "", train_df = df_trn, valid_df = df_val,
    text_cols = 'title', label_cols = 'Category',
    vocab=data_lm.train_ds.vocab, bs=32
    ) # reduced bs from 64->32

In [11]:
data_clas.add_test(items=df_test)

In [12]:
iter_dl = iter(data_clas.test_dl)
_ = next(iter_dl)
x,y = next(iter_dl)

In [13]:
x.shape

torch.Size([32, 18])

In [14]:
y # all zeros because the test set is unlabelled (placeholder targets, not predictions)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')

In [15]:
learn = text_classifier_learner(data_clas, arch=AWD_LSTM, drop_mult=0.7)

  with open(fpath, 'r') as yaml_file: return yaml.load(yaml_file)


In [16]:
learn.load_encoder('ft_enc_{}'.format('fashion_image'))

In [17]:
learn.fit_one_cycle(1, 5e-2)

epoch,train_loss,valid_loss,accuracy,time
0,1.803964,1.836404,0.517262,14:37


In [18]:
learn.save_encoder('pre_ft_classifier_{}'.format('fashion_image'))

In [19]:
learn.fit_one_cycle(1, 5e-2)

epoch,train_loss,valid_loss,accuracy,time
0,1.812414,1.489269,0.515441,13:59


In [20]:
learn.save_encoder('pre2_ft_classifier_{}'.format('fashion_image'))

In [21]:
learn.freeze_to(-2)
learn.fit_one_cycle(3, 5e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.589448,1.440727,0.554425,17:44
1,1.499342,1.401277,0.554357,20:28
2,1.478532,1.333949,0.561321,24:00


In [22]:
learn.save_encoder('ft_classifier_{}_unfreeze_3'.format('fashion_image'))

1.545378	1.005613	0.752448

In [None]:
learn.load_encoder('ft_classifier_{}_unfreeze_3'.format('fashion_image'))

In [None]:
learn.fit_one_cycle(3, 5e-3)

In [None]:
# save_encoder() persists only the encoder weights; kept for backward compatibility.
learn.save_encoder('ft_classifier_{}'.format('fashion_image'))
# Also checkpoint the complete classifier (encoder + classification head) so the
# trained model can be restored with learn.load(...) after a kernel restart.
learn.save('ft_classifier_{}_full'.format('fashion_image'))

In [None]:
learn.show_results()

In [None]:
# get predictions for test
preds, labels = learn.get_preds(DatasetType.Test, ordered=True)

In [None]:
preds_fashion = preds

In [None]:
preds_fashion.shape

In [None]:
# Persist the raw class probabilities for the fashion test set.
# The original filename ended in a stray dot ('.pt.'); use a clean '.pt' extension.
torch.save(preds_fashion, 'preds_fashion_image.pt')

In [None]:
# Index of the most probable class for every test row. Use the tensor's own
# argmax rather than routing a torch tensor through NumPy's dispatch.
predictions = preds_fashion.argmax(dim=1)

In [None]:
classes = data_clas.classes
print(classes)

In [None]:
output_csv_path = 'submission_fashion_image.csv'

# Materialise ids and predicted labels as plain lists. Taking the ids
# positionally avoids label-based Series lookups (test_df['itemid'][i]), which
# break on a non-default index, and the row count now comes from `predictions`
# itself instead of the unrelated global `preds`.
item_ids = test_df['itemid'].tolist()
pred_classes = [classes[class_idx] for class_idx in predictions.tolist()]
if len(item_ids) != len(pred_classes):
    raise ValueError('Got {} predictions for {} test rows'.format(len(pred_classes), len(item_ids)))

# Open with 'w' rather than 'a' so re-running the cell overwrites the file
# instead of appending duplicate rows. No header row is written, as before.
with open(output_csv_path, 'w') as f:
    f.writelines(
        '{},{}\n'.format(item_id, pred_class)
        for item_id, pred_class in zip(item_ids, pred_classes)
    )