# 이미지 Feature vector를 활용해보기!

- 여러 논문에서 소개된것처럼 pre-trained CNN으로 image features를 추출해봅니다.
- image features를 비교하여 실제로 비슷한지 판단하고, rating정보와 함께 분석해봅니다.


In [3]:
path = '../data/amazon_reviews'

## Load dataset

1. AMAZON_FASHION_5.json
2. All_Beauty_5.json
3. Luxury_Beauty_5.json

In [4]:
import os, json
import pandas as pd

In [5]:
def load_json(filename):
    data = []
    with open(os.path.join(path, filename), 'r', encoding='utf-8') as f:
        for l in f:
            data.append(json.loads(l.strip()))

    df = pd.DataFrame.from_dict(data)

    # get rows which contains image 
    df = df[~df['image'].isnull()]

    return df

In [7]:
fashion_df = load_json('AMAZON_FASHION_5.json')
print(fashion_df.shape)

fashion_df.head()

(106, 12)


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
164,5.0,True,"04 18, 2018",A2YZERYQTLB8NG,B001IKJOLW,"{'Size:': ' 9.5 B(M) US', 'Color:': ' Black/Wh...",Lenci,Best tennis shoes I've had all my life. Very c...,Very Comfortable,1524009600,,[https://images-na.ssl-images-amazon.com/image...
172,5.0,True,"04 7, 2018",A1CKPC88NHMYGR,B001IKJOLW,"{'Size:': ' 11 B(M) US', 'Color:': ' Wolf Grey...",Cynthia Foyer,,Five Stars,1523059200,,[https://images-na.ssl-images-amazon.com/image...
179,5.0,True,"03 22, 2018",A3KKVVAINMZF9D,B001IKJOLW,"{'Size:': ' 9 B(M) US', 'Color:': ' Blue Tint/...",Nadege Marcellus,"Straight out of the box, these shoes are great...",these shoes are great! Very lightweight and fi...,1521676800,,[https://images-na.ssl-images-amazon.com/image...
192,5.0,True,"02 27, 2018",A3TLWN2BRF1QH5,B001IKJOLW,"{'Size:': ' 8.5 B(M) US', 'Color:': ' Blue Tin...",Brittany C.,These are so cute and comfortable and very lig...,Great fit!,1519689600,,[https://images-na.ssl-images-amazon.com/image...
197,5.0,True,"02 20, 2018",A3RNGBSBRJ3YAQ,B001IKJOLW,"{'Size:': ' 7.5 B(M) US', 'Color:': ' Wolf Gre...",Andrea L Hogan,They were a gift to my daughter she loved them,Their Cute and Pink,1519084800,,[https://images-na.ssl-images-amazon.com/image...


In [9]:
beauty_df = load_json('All_Beauty_5.json')
print(beauty_df.shape)
beauty_df.head()

(98, 12)


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
19,5.0,True,"04 23, 2018",AX0ZEGHH0H525,B00006L9LC,{'Size:': ' Small'},Aida A,Suffered from itchiness under my hair for coup...,Scalp-healing,1524441600,5,[https://images-na.ssl-images-amazon.com/image...
20,5.0,True,"04 22, 2018",A1L0QECT7J93ZP,B00006L9LC,{'Size:': ' Small'},Elena,Got this product for me and my daughter. I ca...,For any type of hair,1524355200,4,[https://images-na.ssl-images-amazon.com/image...
21,5.0,True,"04 21, 2018",A1VN560NNZQIR0,B00006L9LC,{'Size:': ' Small'},Shablinska,Cleansing properties are above any praise! Sup...,The best treat for my hair!,1524268800,4,[https://images-na.ssl-images-amazon.com/image...
34,1.0,True,"03 27, 2018",A2V608ILSK1M5R,B00006L9LC,{'Size:': ' Small'},CDART815,My product was not sealed and either used or s...,Beware,1522108800,2,[https://images-na.ssl-images-amazon.com/image...
47,5.0,True,"02 23, 2018",A22V1MD93T2FW9,B00006L9LC,{'Size:': ' Small'},Heather Sharp,I bought this for my husband. Hed been having ...,Really great shampoo for sensitive skin that h...,1519344000,20,[https://images-na.ssl-images-amazon.com/image...


In [18]:
luxury_df = load_json('Luxury_Beauty_5.json')
luxury_df.head()
luxury_df = luxury_df[:50]
print(luxury_df.shape)

(50, 12)


## Download images

In [11]:
from tqdm import tqdm
import requests

In [13]:
def download_images(path, df, category):
    folder_path = os.path.join(path, category)

    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    for index in tqdm(df.index):
        url_list = df['image'].loc[index]
        for url_index, url in enumerate(url_list):
            if not os.path.exists(os.path.join(folder_path, f'{str(index)}_{str(url_index)}.jpg')):
                img_data = requests.get(url).content
                with open(os.path.join(folder_path, f'{str(index)}_{str(url_index)}.jpg'), 'wb') as handler:
                    handler.write(img_data)

    print(f'{category}: {len(df.index)} images downloaded or already exist...')

In [14]:
download_images(path, beauty_df, 'beauty')

100%|██████████| 98/98 [00:28<00:00,  3.44it/s]

beauty: 98 images downloaded or already exist...





In [15]:
download_images(path, fashion_df, 'fashion')

100%|██████████| 106/106 [00:34<00:00,  3.11it/s]

fashion: 106 images downloaded or already exist...





In [19]:
download_images(path, luxury_df, 'luxury')

100%|██████████| 50/50 [00:29<00:00,  1.68it/s]

luxury: 50 images downloaded or already exist...





## Use pre-trained CNN

In [20]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.autograd import Variable
from PIL import Image

In [21]:
# Load the pretrained model
model = models.resnet18(pretrained=True)
# Use the model object to select the desired layer
layer = model._modules.get('avgpool')

Downloading: "https://download.pytorch.org/models/resnet18-5c106cde.pth" to /Users/ohyeji/.cache/torch/hub/checkpoints/resnet18-5c106cde.pth


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=46827520.0), HTML(value='')))




In [22]:
# Set model to evaluation mode
model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [23]:
scaler = transforms.Scale((224, 224))
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
to_tensor = transforms.ToTensor()



In [24]:
def get_vector(image_name):
    # 1. Load the image with Pillow library
    img = Image.open(image_name)
    # 2. Create a PyTorch Variable with the transformed image
    t_img = Variable(normalize(to_tensor(scaler(img))).unsqueeze(0))
    # 3. Create a vector of zeros that will hold our feature vector
    #    The 'avgpool' layer has an output size of 512
    my_embedding = torch.zeros(512)
    # 4. Define a function that will copy the output of a layer
    def copy_data(m, i, o):
        my_embedding.copy_(o.data.reshape(o.data.size(1)))
    # 5. Attach that function to our selected layer
    h = layer.register_forward_hook(copy_data)
    # 6. Run the model on our transformed image
    model(t_img)
    # 7. Detach our copy function from the layer
    h.remove()
    # 8. Return the feature vector
    return my_embedding.cpu().detach().numpy()

In [25]:
# Test
category = 'beauty'
for image_file in os.listdir(os.path.join(path, category))[:2]:
    print(f"{image_file} feature vectors")
    print(get_vector(os.path.join(path, category+'/'+image_file)))

4854_0.jpg feature vectors
[1.68059599e+00 3.31894696e-01 1.30647445e+00 4.90886033e-01
 3.24745595e-01 5.49084842e-01 8.98036957e-01 1.62820801e-01
 1.29270983e+00 5.38413003e-02 7.69401863e-02 6.03342474e-01
 1.69143379e-02 2.32753992e+00 1.07282296e-01 9.16272998e-01
 5.90273701e-02 6.49269342e-01 2.28403640e+00 1.82134703e-01
 1.20188892e+00 2.77085155e-01 1.74444342e+00 0.00000000e+00
 1.77385926e+00 1.14356838e-02 8.97069633e-01 3.69778067e-01
 1.34051716e+00 2.00102225e-01 6.60273075e-01 5.76899946e-01
 8.59333724e-02 3.64483297e-01 1.38429746e-01 1.10896461e-01
 1.36040688e-01 2.96539712e+00 1.35082960e+00 5.92131726e-02
 1.71393335e-01 7.30559886e-01 6.38134718e-01 1.81596470e+00
 5.44576585e-01 1.32290530e+00 6.33052051e-01 1.00981688e+00
 1.12135537e-01 7.69737005e-01 1.63792312e+00 1.27294302e+00
 1.46582913e+00 6.13689303e-01 4.69012201e-01 2.09266856e-01
 2.06072912e-01 8.86440873e-01 3.81845266e-01 1.62030685e+00
 8.13375533e-01 9.62493658e-01 6.03688508e-02 1.61330774e-

## Preprocess dataset

- Remove unnecessary columns
- Remove all other columns except `overall`, `reviewerID`, `asin`, `image` 
- Create new column with image filename

In [27]:
def add_image_filenames(category, df):
    # Remove unnecessary columns
    df = df[['overall','reviewerID', 'asin', 'image']]

    filenames = []
    for row_index in df.index:
        each_files = []
        for idx in range(len(df.loc[row_index]['image'])):
            each_files.append(os.path.join(path, category+'/'+f'{row_index}_{idx}.jpg'))
        filenames.append(each_files)

    # Add new column
    df.drop('image', axis=1, inplace=True)
    df['image_filename'] = list(filenames)

    return df


In [28]:
luxury_df = add_image_filenames('luxury', luxury_df)
beauty_df = add_image_filenames('beauty', beauty_df)
fashion_df = add_image_filenames('fashion', fashion_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [29]:
luxury_df.head()

Unnamed: 0,overall,reviewerID,asin,image_filename
68,5.0,A2BHOZILR7SY9,B000142FVW,[../data/amazon_reviews/luxury/68_0.jpg]
75,5.0,ACMSQCH1H7JZD,B000142FVW,[../data/amazon_reviews/luxury/75_0.jpg]
86,5.0,A2L77YQRAEA1YZ,B000142FVW,[../data/amazon_reviews/luxury/86_0.jpg]
88,5.0,A28W77RPDZK7AZ,B00014351Q,"[../data/amazon_reviews/luxury/88_0.jpg, ../da..."
104,5.0,A2IV70BWQBUF32,B00014351Q,[../data/amazon_reviews/luxury/104_0.jpg]


In [30]:
data_list = []
dataframe_list = [('luxury',luxury_df), ('beauty', beauty_df), ('fashion', fashion_df)]
for dataframe in dataframe_list:
    category = dataframe[0]
    df = dataframe[1]
    for index, row in df.iterrows():
        for filename in row['image_filename']:
            data_tuple = (category, row['overall'], row['reviewerID'], row['asin'], filename)
            data_list.append(data_tuple)

In [32]:
combined_df = pd.DataFrame(data=data_list,columns=['category', 'overall', 'reviewerID', 'asin', 'filename'])
print(combined_df.shape)
combined_df.head()

(391, 5)


Unnamed: 0,category,overall,reviewerID,asin,filename
0,luxury,5.0,A2BHOZILR7SY9,B000142FVW,../data/amazon_reviews/luxury/68_0.jpg
1,luxury,5.0,ACMSQCH1H7JZD,B000142FVW,../data/amazon_reviews/luxury/75_0.jpg
2,luxury,5.0,A2L77YQRAEA1YZ,B000142FVW,../data/amazon_reviews/luxury/86_0.jpg
3,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,../data/amazon_reviews/luxury/88_0.jpg
4,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,../data/amazon_reviews/luxury/88_1.jpg


### Get image feature vectors

In [33]:
combined_df['image_vec'] = combined_df['filename'].apply(lambda x: get_vector(x))
combined_df.to_csv(os.path.join(path, 'image_dataset.csv'), sep='\t')


In [34]:
combined_df.head()

Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec
0,luxury,5.0,A2BHOZILR7SY9,B000142FVW,../data/amazon_reviews/luxury/68_0.jpg,"[0.57558465, 0.7388489, 0.10173568, 0.21581173..."
1,luxury,5.0,ACMSQCH1H7JZD,B000142FVW,../data/amazon_reviews/luxury/75_0.jpg,"[1.5841694, 1.0663635, 0.13369095, 0.3091502, ..."
2,luxury,5.0,A2L77YQRAEA1YZ,B000142FVW,../data/amazon_reviews/luxury/86_0.jpg,"[0.9460139, 1.269538, 0.1632403, 0.69132954, 0..."
3,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,../data/amazon_reviews/luxury/88_0.jpg,"[0.86120635, 0.44911617, 0.25928155, 1.4042616..."
4,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,../data/amazon_reviews/luxury/88_1.jpg,"[1.2655925, 0.3834538, 1.1492652, 1.5618992, 0..."


## K-means clustering

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import numpy as np

In [36]:
def check_vector(vector):
    return np.array([0.0 if str(x) == '' else float(x) for x in vector])[:512]



In [37]:
df = pd.read_csv(os.path.join(path, 'image_dataset.csv'), sep='\t', index_col=0,
                 converters={"image_vec": lambda x: x.strip("[]").replace('\n','').split(" ")})


In [38]:
df.head()

Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec
0,luxury,5.0,A2BHOZILR7SY9,B000142FVW,../data/amazon_reviews/luxury/68_0.jpg,"[5.75584650e-01, 7.38848925e-01, 1.01735681e-0..."
1,luxury,5.0,ACMSQCH1H7JZD,B000142FVW,../data/amazon_reviews/luxury/75_0.jpg,"[1.58416939e+00, 1.06636345e+00, 1.33690953e-0..."
2,luxury,5.0,A2L77YQRAEA1YZ,B000142FVW,../data/amazon_reviews/luxury/86_0.jpg,"[9.46013927e-01, 1.26953804e+00, 1.63240299e-0..."
3,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,../data/amazon_reviews/luxury/88_0.jpg,"[0.86120635, 0.44911617, 0.25928155, 1.4042616..."
4,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,../data/amazon_reviews/luxury/88_1.jpg,"[1.2655925, , 0.3834538, , 1.1492652, , 1.5618..."


In [39]:
df['image_vec'] = df['image_vec'].apply(lambda x: check_vector(x))

In [41]:
df.head()

Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec
0,luxury,5.0,A2BHOZILR7SY9,B000142FVW,../data/amazon_reviews/luxury/68_0.jpg,"[0.57558465, 0.738848925, 0.101735681, 0.21581..."
1,luxury,5.0,ACMSQCH1H7JZD,B000142FVW,../data/amazon_reviews/luxury/75_0.jpg,"[1.58416939, 1.06636345, 0.133690953, 0.309150..."
2,luxury,5.0,A2L77YQRAEA1YZ,B000142FVW,../data/amazon_reviews/luxury/86_0.jpg,"[0.946013927, 1.26953804, 0.163240299, 0.69132..."
3,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,../data/amazon_reviews/luxury/88_0.jpg,"[0.86120635, 0.44911617, 0.25928155, 1.4042616..."
4,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,../data/amazon_reviews/luxury/88_1.jpg,"[1.2655925, 0.0, 0.3834538, 0.0, 1.1492652, 0...."


In [42]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=1234)
print(train_df.shape)
print(test_df.shape)

(312, 6)
(79, 6)


In [43]:
X_train = np.array([list(x) for x in train_df['image_vec'].values])

In [44]:
X_train[:3]

array([[1.14149129, 0.97845006, 1.26295602, ..., 0.23935166, 0.27402246,
        5.02150917],
       [0.70760679, 0.05854635, 0.64922899, ..., 0.72796828, 0.32886657,
        0.80799866],
       [0.30770081, 1.03539789, 2.69267702, ..., 0.87885767, 0.14589608,
        0.00641873]])

In [45]:
kmeans = KMeans(n_clusters=3, random_state=0).fit(X_train)

In [46]:
kmeans.labels_

array([1, 0, 2, 2, 0, 0, 1, 2, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 2, 2, 0, 1,
       0, 1, 2, 0, 0, 1, 0, 0, 2, 0, 2, 2, 2, 1, 0, 2, 1, 0, 0, 0, 2, 1,
       0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2,
       0, 0, 0, 2, 1, 0, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 0,
       0, 0, 1, 2, 1, 0, 2, 2, 1, 0, 2, 2, 0, 0, 0, 1, 0, 0, 1, 1, 2, 0,
       0, 0, 0, 2, 0, 0, 2, 2, 1, 1, 1, 1, 2, 2, 0, 2, 0, 0, 0, 0, 0, 0,
       2, 2, 1, 0, 0, 0, 2, 1, 1, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 2, 0,
       0, 0, 0, 2, 0, 0, 1, 2, 1, 2, 0, 1, 0, 0, 0, 1, 2, 0, 0, 2, 0, 0,
       2, 1, 0, 1, 2, 1, 2, 0, 1, 0, 2, 2, 2, 0, 0, 0, 0, 1, 2, 0, 2, 0,
       2, 0, 0, 1, 0, 0, 2, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 2, 0, 1, 2, 0, 2, 1, 1, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 2, 2, 2,
       0, 2, 0, 1, 0, 0, 0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 2, 2, 0, 2, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 2, 1, 0, 2, 2, 0, 2,

In [47]:
test_df.head()

Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec
40,luxury,5.0,A25QBCHO0KFT0P,B00014GT8W,../data/amazon_reviews/luxury/577_0.jpg,"[0.0468260944, 0.879412174, 0.468813539, 0.381..."
44,luxury,5.0,AX5ZDQUSZG8MI,B00014GT8W,../data/amazon_reviews/luxury/602_1.jpg,"[0.0649163723, 1.51313007, 0.60124737, 0.21510..."
304,fashion,5.0,AEUUYBIO73RLA,B009MA34NY,../data/amazon_reviews/fashion/1445_0.jpg,"[0.643320262, 0.577247977, 0.365769535, 1.0960..."
59,luxury,3.0,A1A25TP5D0L22V,B0002RI2PG,../data/amazon_reviews/luxury/1247_1.jpg,"[0.592981756, 0.134454787, 0.929966509, 1.1438..."
267,fashion,1.0,A3SM5XENOINDNH,B0058YEJ5K,../data/amazon_reviews/fashion/821_0.jpg,"[0.0297243893, 0.150493458, 0.899268091, 0.654..."


### Evaluation

In [48]:
# kmeans.predict([test_df['image_vec'].iloc[0]])
test_df['prediction'] = test_df['image_vec'].apply(lambda x: kmeans.predict([x])[0])
test_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec,prediction
40,luxury,5.0,A25QBCHO0KFT0P,B00014GT8W,../data/amazon_reviews/luxury/577_0.jpg,"[0.0468260944, 0.879412174, 0.468813539, 0.381...",0
44,luxury,5.0,AX5ZDQUSZG8MI,B00014GT8W,../data/amazon_reviews/luxury/602_1.jpg,"[0.0649163723, 1.51313007, 0.60124737, 0.21510...",0
304,fashion,5.0,AEUUYBIO73RLA,B009MA34NY,../data/amazon_reviews/fashion/1445_0.jpg,"[0.643320262, 0.577247977, 0.365769535, 1.0960...",0
59,luxury,3.0,A1A25TP5D0L22V,B0002RI2PG,../data/amazon_reviews/luxury/1247_1.jpg,"[0.592981756, 0.134454787, 0.929966509, 1.1438...",0
267,fashion,1.0,A3SM5XENOINDNH,B0058YEJ5K,../data/amazon_reviews/fashion/821_0.jpg,"[0.0297243893, 0.150493458, 0.899268091, 0.654...",1


In [49]:
test_df.groupby('category')['prediction'].count()

category
beauty     24
fashion    30
luxury     25
Name: prediction, dtype: int64

In [50]:
test_df.groupby('category').count()

Unnamed: 0_level_0,overall,reviewerID,asin,filename,image_vec,prediction
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
beauty,24,24,24,24,24,24
fashion,30,30,30,30,30,30
luxury,25,25,25,25,25,25


In [51]:
test_df.groupby('prediction').count()

Unnamed: 0_level_0,category,overall,reviewerID,asin,filename,image_vec
prediction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,51,51,51,51,51,51
1,12,12,12,12,12,12
2,16,16,16,16,16,16


In [52]:
print(test_df[(test_df.prediction == 0) & (test_df.category == 'luxury')].shape)
print(test_df[(test_df.prediction == 1) & (test_df.category == 'luxury')].shape)
print(test_df[(test_df.prediction == 2) & (test_df.category == 'luxury')].shape)

(22, 7)
(0, 7)
(3, 7)


## K-Nearest Neighbors


In [53]:
from sklearn.neighbors import KNeighborsClassifier

In [54]:
neigh = KNeighborsClassifier(n_neighbors=3)

In [57]:
y_train = train_df['overall'].values
print(y_train)

[5. 5. 5. 4. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 3. 5. 5. 5. 1. 5.
 5. 5. 1. 5. 1. 5. 5. 4. 5. 5. 5. 5. 5. 5. 5. 1. 5. 5. 5. 5. 5. 5. 5. 3.
 5. 5. 5. 5. 5. 5. 5. 5. 3. 5. 1. 5. 5. 5. 5. 1. 4. 5. 4. 5. 1. 5. 3. 5.
 5. 5. 3. 5. 5. 1. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 1. 5. 5.
 5. 4. 5. 5. 5. 5. 5. 5. 4. 4. 5. 5. 5. 3. 5. 5. 1. 5. 5. 4. 5. 5. 1. 5.
 5. 4. 5. 3. 5. 5. 5. 5. 5. 5. 3. 5. 4. 5. 5. 3. 5. 5. 5. 5. 1. 5. 5. 5.
 5. 5. 4. 5. 5. 5. 5. 5. 5. 5. 3. 5. 1. 3. 5. 3. 1. 5. 1. 1. 5. 5. 5. 5.
 5. 4. 5. 5. 1. 1. 5. 5. 1. 5. 5. 5. 5. 5. 5. 5. 1. 5. 4. 5. 5. 4. 5. 1.
 3. 5. 5. 3. 5. 1. 4. 5. 4. 5. 5. 1. 3. 1. 5. 5. 5. 5. 5. 3. 5. 4. 1. 5.
 5. 5. 5. 5. 5. 5. 5. 1. 5. 1. 5. 5. 4. 5. 4. 5. 4. 4. 5. 5. 3. 4. 4. 5.
 4. 5. 5. 5. 1. 5. 5. 4. 3. 5. 1. 5. 1. 5. 1. 5. 3. 4. 4. 5. 5. 5. 5. 5.
 5. 5. 5. 5. 4. 5. 1. 5. 5. 1. 5. 5. 5. 1. 5. 5. 3. 5. 5. 3. 5. 5. 3. 5.
 5. 5. 1. 1. 5. 1. 5. 5. 1. 5. 3. 5. 5. 5. 5. 4. 5. 4. 5. 5. 5. 5. 5. 5.]


In [58]:
neigh.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [59]:
test_df['prediction'] = test_df['image_vec'].apply(lambda x: neigh.predict([x])[0])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [60]:
test_df.head()

Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec,prediction
40,luxury,5.0,A25QBCHO0KFT0P,B00014GT8W,../data/amazon_reviews/luxury/577_0.jpg,"[0.0468260944, 0.879412174, 0.468813539, 0.381...",5.0
44,luxury,5.0,AX5ZDQUSZG8MI,B00014GT8W,../data/amazon_reviews/luxury/602_1.jpg,"[0.0649163723, 1.51313007, 0.60124737, 0.21510...",5.0
304,fashion,5.0,AEUUYBIO73RLA,B009MA34NY,../data/amazon_reviews/fashion/1445_0.jpg,"[0.643320262, 0.577247977, 0.365769535, 1.0960...",5.0
59,luxury,3.0,A1A25TP5D0L22V,B0002RI2PG,../data/amazon_reviews/luxury/1247_1.jpg,"[0.592981756, 0.134454787, 0.929966509, 1.1438...",5.0
267,fashion,1.0,A3SM5XENOINDNH,B0058YEJ5K,../data/amazon_reviews/fashion/821_0.jpg,"[0.0297243893, 0.150493458, 0.899268091, 0.654...",1.0


In [61]:
test_df[test_df.overall == test_df.prediction].count()

category      67
overall       67
reviewerID    67
asin          67
filename      67
image_vec     67
prediction    67
dtype: int64

In [62]:
len(test_df)

79

In [64]:
67/79

0.8481012658227848