# Train and Predict Pawpularity using VGG16 + Regression methods

If you want run this code, please download dataset from [kaggle](https://www.kaggle.com/competitions/petfinder-pawpularity-score/data) and put all data into data folder.

### Import packages below 

In [1]:
# Import pacakages.
# Packages to process csv files.
import pandas as pd
import os
import numpy as np

# Packages to process images.
# tqdm is a progress bar.
import cv2
from tqdm import tqdm


# Packages for model training.
from keras.utils import load_img, img_to_array
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error

# LinearRegression
from sklearn.linear_model import LinearRegression

# SVR
from sklearn.svm import SVR

# XGBoost
import xgboost

# Lightgbm
from lightgbm import LGBMRegressor


# Packages for model persistence
import pickle



### Explore data in train.csv 

In [2]:
# Read the csv file.
train_csv = pd.read_csv('./data/train.csv')

# Explore the csv file
headers = list(train_csv.columns)
print(headers)

# As we extract features from image using VGG. We do not use features showed here.
# We only care about `Id` and `Pawpularity`. Explore the range of `Pawpularity`
df_pawpularity = train_csv['Pawpularity']
df_max = df_pawpularity.max()
df_min = df_pawpularity.min()
df_mean = df_pawpularity.mean()
df_median = df_pawpularity.median()

print(df_max)
print(df_min)
print(df_mean)
print(df_median)

['Id', 'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory', 'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur', 'Pawpularity']
100
1
38.03904358353511
33.0


### Read all images.

In [3]:
# Read image files.
image_path = './data/train'
image_files = os.listdir(image_path)
print(f'Total number of images is: {len(image_files)}')

Total number of images is: 9912


### Extract image features using VGG16 pre-trained by imagenet.

- Remove classifier layers and using weights trained using imagenet.
- It's very important to normalize images. This will make the matrix sparse. If don't do this, the parameters will become to huge (2.2TB) that cannot be easy handle.

In [4]:


model = VGG16(weights='imagenet', input_shape = (128, 128, 3), include_top=False)
for layer in model.layers:
    layer.trainable = False

model.summary()

# Process images to conform the input size of VGG16.
train_images = []
for image_file in tqdm(image_files):
    full_path = os.path.join(image_path, image_file)
    img = load_img(full_path, target_size=(128, 128, 3))
    image_array = img_to_array(img) / 255
    
    train_images.append(image_array)


train_images = np.array(train_images)    

features = model.predict(train_images)

expand_features = features.reshape(features.shape[0], -1)




Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 128, 128, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 128, 128, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 128, 128, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 64, 64, 64)        0         
                                                                 
 block2_conv1 (Conv2D)       (None, 64, 64, 128)       73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 64, 64, 128)       147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 32, 32, 128)       0     

100%|██████████| 9912/9912 [00:31<00:00, 317.21it/s]




### Normalize the pawpulrity.

In previous cell, we find that the range of pawpularity is `1-100`. Normalize it to `0-1`. 

In [5]:
train_label = train_csv['Pawpularity'] / 100

### Use k-fold cross validation

This method is to make sure that the evaluation of model is rely on how we pick the data.

K-fold cross validation method procedure:

- Randomly, split the entire dataset into k number folds.
- For each fold in the dataset, build the model on `k-1` folds of the dataset. Then, test the model to check the effectiveness for kth fold.
- Repeat this until each of the k-folds has served as the test-set
- The average of k recorded accuracy is called the cross-validation accuracy and will serve as a performance metric for the model.




In [6]:
FOLD = 4
kf = KFold(n_splits = FOLD)

### Use Linear Regression to train the model

In [7]:
lr = LinearRegression()

all_rmse = []

for train_index,test_index in tqdm(kf.split((train_images))):
    # Train linear regression model.
    lr.fit(expand_features[train_index],train_label[train_index])
    # Predict pawpularity for evaluation.
    pred_y = lr.predict(expand_features[test_index]);
    y_true = train_label[test_index]

    # Use mean squared error as evaluation metric
    rmse = mean_squared_error(pred_y, y_true, squared=False)
    all_rmse.append(rmse)

    # Print rmse for one iteration.
    print('RMSE {:.2f}'.format(rmse))

# Print averge cross validation accuracy.
print(f'averge cross validation accuracy for {FOLD} folds is: {sum(all_rmse) / len(all_rmse)}')

1it [01:02, 62.83s/it]

RMSE 0.95


2it [02:05, 63.01s/it]

RMSE 1.04


3it [03:09, 63.18s/it]

RMSE 0.97


4it [04:13, 63.33s/it]

RMSE 0.98
averge cross validation accuracy for 4 folds is: 0.9842033578723071





### Linear Regression Model Persistence

Using `pickle` package to dump weights to local file.

In [8]:

with open('lr.pkl','wb') as f:
    pickle.dump(lr,f)

### Use SVR to train model

In [9]:
svr = SVR(C=0.01,kernel='poly')

all_rmse = []

for train_index,test_index in tqdm(kf.split((train_images))):
    
    svr.fit(expand_features[train_index],train_label[train_index])
    pred_y = svr.predict(expand_features[test_index]);
    y_true = train_label[test_index]

    rmse = mean_squared_error(pred_y,y_true,squared = False)
    all_rmse.append(rmse)
    
    print('RMSE {:.2f}'.format(rmse))

# Print averge cross validation accuracy.
print(f'averge cross validation accuracy for {FOLD} folds is: {sum(all_rmse) / len(all_rmse)}')

1it [01:43, 103.38s/it]

RMSE 0.21


2it [03:33, 107.27s/it]

RMSE 0.20


3it [05:19, 106.72s/it]

RMSE 0.20


4it [07:05, 106.50s/it]

RMSE 0.20
averge cross validation accuracy for 4 folds is: 0.20257650802425914





### SVR Model Persistence

In [20]:
with open('svr.pkl','wb') as f:
    pickle.dump(svr,f)

### Use XGBoost to train model.

In [11]:
xgb = xgboost.XGBRegressor(learning_rate=0.01, max_depth=4, n_estimators = 100)

all_rmse = []

for train_index, text_index in tqdm(kf.split((train_images))):

    xgb.fit(expand_features[train_index], train_label[train_index])
    pred_y = xgb.predict(expand_features[test_index]);
    y_true = train_label[test_index]
    
    rmse = mean_squared_error(pred_y,y_true,squared = False)
    all_rmse.append(rmse)

    print('RMSE {:.2f}'.format(rmse))

# Print averge cross validation accuracy.
print(f'averge cross validation accuracy for {FOLD} folds is: {sum(all_rmse) / len(all_rmse)}')

1it [00:34, 34.42s/it]

RMSE 0.20


2it [01:08, 34.51s/it]

RMSE 0.20


3it [01:43, 34.43s/it]

RMSE 0.20


4it [02:17, 34.39s/it]

RMSE 0.20
averge cross validation accuracy for 4 folds is: 0.19752659695873082





### XGBoost Model Persistence

In [12]:
with open('xgb.pkl','wb') as f:
    pickle.dump(xgb,f)

### Use lightgbm to train model

In [14]:


lgbm = LGBMRegressor(learning_rate= 0.01, max_depth=4, n_estimators=100)

all_rmse = []

for train_index, test_index in tqdm(kf.split(train_images)):

    lgbm.fit(expand_features[train_index], train_label[train_index])
    pred_y = lgbm.predict(expand_features[test_index]);
    y_true = train_label[test_index]
    
    rmse = mean_squared_error(pred_y, y_true, squared = False)
    all_rmse.append(rmse)

    print('RMSE {:.2f}'.format(rmse))

# Print averge cross validation accuracy.
print(f'averge cross validation accuracy for {FOLD} folds is: {sum(all_rmse) / len(all_rmse)}')

1it [00:16, 16.97s/it]

RMSE 0.21


2it [00:33, 16.73s/it]

RMSE 0.20


3it [00:49, 16.32s/it]

RMSE 0.20


4it [01:05, 16.37s/it]

RMSE 0.20
averge cross validation accuracy for 4 folds is: 0.20217341801761837





### Lightgbm Model Persistence

In [15]:
with open('lgbm.pkl','wb') as f:
    pickle.dump(lgbm,f)

### Predict pawpularity using each model for Pidan
- Firstly, read all images that should be predicted
- Secondly, using VGG16 above to extract feature.
- Thirdly, load linear regression weights and predict.
- Fourthly, load svr weights and predict.
- Fifthly, load xgboost weights and predict.
- Finally, load lighbgm weights and predict.

In [16]:
predict_dir = './data/pidan'
predict_images_list = os.listdir(predict_dir)

predict_images = []

for image in tqdm(predict_images_list):
    full_path = os.path.join(predict_dir, image)
    img = load_img(full_path, target_size=(128, 128, 3))
    image_array = img_to_array(img) / 255
    predict_images.append(image_array)

predict_images = np.array(predict_images)
    

100%|██████████| 3/3 [00:00<00:00, 76.89it/s]


In [35]:
test_features = model.predict(predict_images)
expand_features = test_features.reshape(test_features.shape[0],-1)



In [37]:
lr_weights = './lr.pkl'
with open(lr_weights, 'rb') as file:
    lr = pickle.load(file)

prediction = lr.predict(expand_features)
print(prediction)

[-0.49108666  0.7700281  -0.567711  ]


In [38]:
svr_weights = './svr.pkl'
with open(svr_weights, 'rb') as file:
    svr = pickle.load(file)

prediction = svr.predict(expand_features)
print(prediction)

[0.32936038 0.33635866 0.30284451]


In [39]:
xgb_weights = './xgb.pkl'
with open(xgb_weights, 'rb') as file:
    xgb = pickle.load(file)

prediction = xgb.predict(expand_features)
print(prediction)

[0.41824916 0.40569213 0.42035866]


In [40]:
lgbm_weights = './lgbm.pkl'
with open(lgbm_weights, 'rb') as file:
    lgbm = pickle.load(file)

prediction = lgbm.predict(expand_features)
print(prediction)

[0.369031   0.36742713 0.37338372]


### Generate `submission.csv` for test using SVR

In [41]:
test_csv = pd.read_csv('./data/test.csv')
test_images_dir = './data/test'
test_images_list = os.listdir(test_images_dir)

In [42]:
test_images = []
for image in tqdm(test_images_list):
    path = os.path.join(test_images_dir, image)
    img = load_img(full_path, target_size=(128, 128, 3))
    image_array = img_to_array(img) / 255
    test_images.append(image_array)
    
test_images = np.array(test_images)

100%|██████████| 8/8 [00:00<00:00, 269.77it/s]


In [43]:
test_features = model.predict(test_images)
expand_features = test_features.reshape(test_features.shape[0],-1)



In [1]:
svr_weights = './svr.pkl'

with open(svr_weights, 'rb') as file:
    svr = pickle.load(file)

FileNotFoundError: [Errno 2] No such file or directory: './svr.pkl'

In [45]:
prediction = svr.predict(expand_features)
submission = pd.DataFrame()
submission['Id'] = test_csv['Id']
# Resize pawpularity from `0-1` to `1-100`
submission['Pawpularity'] = prediction * 100
submission.to_csv('submission.csv', index=False)