## Experiment 12: Visual Analysis of Both Datasets (ingr and tovima) Based on Pretrained CNN Features

Import modules

In [1]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
import pickle

In [2]:
# Load the csv files containing the posts of both pages.
# Use only the "URL" and "Total Interactions" columns.
POST_COLUMNS = ["URL", "Total Interactions"]
# NOTE(review): only the first 522 tovima rows are used; presumably the remaining
# rows have no downloaded image / cached features -- confirm before changing.
TOVIMA_MAX_POSTS = 522

# `usecols` keeps the file's own column order, so re-select to pin the order.
ingr_posts_df = pd.read_csv("Datasets/ingr/ingr.csv", usecols=POST_COLUMNS)[POST_COLUMNS]
ingr_posts_df["image_dir"] = "Datasets/ingr/images/"  # Path to where your images are

tovima_posts_df = pd.read_csv(
    "Datasets/tovima/tovima.csv", usecols=POST_COLUMNS
).head(TOVIMA_MAX_POSTS)[POST_COLUMNS]
tovima_posts_df["image_dir"] = "Datasets/tovima/images/"  # Path to where your images are

# Combine the two dataframes into a single one
posts_df = pd.concat([ingr_posts_df, tovima_posts_df], ignore_index=True)

# Drop rows with missing values *before* deriving the image path:
# a missing URL is a float NaN and cannot be split like a string.
posts_df = posts_df.dropna().reset_index(drop=True)

# Create a column storing the image path of each post:
# <image_dir><last "/"-separated segment of the URL>.jpg
post_ids = posts_df["URL"].str.split("/").str[-1].str.strip()
posts_df["image_path"] = posts_df["image_dir"] + post_ids + ".jpg"

# Keep only rows whose "Total Interactions" lies within the [0.05, 0.95]
# quantile range (both bounds inclusive)
interactions = posts_df["Total Interactions"]
lower_bound = interactions.quantile(.05)
upper_bound = interactions.quantile(.95)
posts_df = posts_df.loc[interactions.between(lower_bound, upper_bound)].reset_index(drop=True)
posts_df

Unnamed: 0,URL,Total Interactions,image_dir,image_path
0,https://www.facebook.com/293834325961/posts/10...,2,Datasets/ingr/images/,Datasets/ingr/images/10159636104980962.jpg
1,https://www.facebook.com/293834325961/posts/10...,2,Datasets/ingr/images/,Datasets/ingr/images/10159636289825962.jpg
2,https://www.facebook.com/293834325961/posts/10...,28,Datasets/ingr/images/,Datasets/ingr/images/10159635983895962.jpg
3,https://www.facebook.com/293834325961/posts/10...,10,Datasets/ingr/images/,Datasets/ingr/images/10159635136250962.jpg
4,https://www.facebook.com/293834325961/posts/10...,7,Datasets/ingr/images/,Datasets/ingr/images/10159635038980962.jpg
...,...,...,...,...
1026,https://www.facebook.com/184884834877395/posts...,0,Datasets/tovima/images/,Datasets/tovima/images/4394677827231387.jpg
1027,https://www.facebook.com/184884834877395/posts...,7,Datasets/tovima/images/,Datasets/tovima/images/4393836503982186.jpg
1028,https://www.facebook.com/184884834877395/posts...,1,Datasets/tovima/images/,Datasets/tovima/images/4394137847285385.jpg
1029,https://www.facebook.com/184884834877395/posts...,1,Datasets/tovima/images/,Datasets/tovima/images/4392883707410799.jpg


In [3]:
# Load the cached (image paths, PCA-reduced features, fitted PCA object) triple.
# NOTE: unpickling can execute arbitrary code -- only load a file you produced yourself.
# The `with` block makes sure the file handle is closed after loading.
with open('features_posts.p', 'rb') as features_file:
    images, pca_features, pca = pickle.load(features_file)

# Preview the first four feature values of the first five images
for img, f in list(zip(images, pca_features))[0:5]:
    print("image: %s, features: %0.2f,%0.2f,%0.2f,%0.2f... "%(img, f[0], f[1], f[2], f[3]))

image: Datasets/ingr/images/10159636104980962.jpg, features: -3.68,9.63,-15.86,-0.96... 
image: Datasets/ingr/images/10159636289825962.jpg, features: -9.69,-15.59,16.11,7.38... 
image: Datasets/ingr/images/10159635983895962.jpg, features: -19.34,-3.35,4.28,1.27... 
image: Datasets/ingr/images/10159635136250962.jpg, features: -11.44,10.00,1.33,0.95... 
image: Datasets/ingr/images/10159635038980962.jpg, features: -13.94,-9.00,-19.19,0.74... 


In [4]:
# Display the dimensions of the loaded feature matrix
# (presumably rows = images, columns = PCA components -- confirm against the feature-extraction notebook)
pca_features.shape

(1031, 300)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import metrics

In [6]:
# Hold out 25% of the posts for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): features and targets are paired purely by row position --
# confirm that pca_features rows are in the same order as posts_df rows.
X_train, X_test, y_train, y_test = train_test_split(
    pca_features,
    posts_df['Total Interactions'],
    test_size=0.25,
    random_state=42,
)

In [7]:
# LinearRegression baseline: fit on the training split, predict the held-out split
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
pred_y = lin_reg.predict(X_test)

# Report each regression metric under its label
for label, score_fn in (('r2_score:', metrics.r2_score),
                        ('MAE:', metrics.mean_absolute_error),
                        ('MSE:', metrics.mean_squared_error)):
    print(label, score_fn(y_test, pred_y))

r2_score: -1.0485336075612404
MAE: 12.349688934725384
MSE: 347.88904080545086


In [8]:
# KNeighborsRegressor with default settings: fit on the training split,
# predict the held-out split
kn_reg = KNeighborsRegressor()
kn_reg.fit(X_train, y_train)
pred_y = kn_reg.predict(X_test)

# Report each regression metric under its label
for label, score_fn in (('r2_score:', metrics.r2_score),
                        ('MAE:', metrics.mean_absolute_error),
                        ('MSE:', metrics.mean_squared_error)):
    print(label, score_fn(y_test, pred_y))

r2_score: -0.18123405538224602
MAE: 8.711627906976746
MSE: 200.60124031007754


In [9]:
# Train and test the RandomForestRegressor model.
# The forest is randomized (bootstrap samples, feature sub-sampling), so a fixed
# random_state is required for the printed metrics to be reproducible on re-run.
rf_reg = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)
pred_y = rf_reg.predict(X_test)

print('r2_score:', metrics.r2_score(y_test, pred_y))
print('MAE:', metrics.mean_absolute_error(y_test, pred_y))
print('MSE:', metrics.mean_squared_error(y_test, pred_y))

r2_score: -0.17983951247219676
MAE: 8.943713231751023
MSE: 200.36441422455547


In [10]:
# For classification, we consider 2 categories:
#   -Class 0: posts with "Total Interactions" at or below the training-split median
#   -Class 1: posts with "Total Interactions" above the training-split median
#
# The labels are always rebuilt from the raw counts in posts_df, looked up by the
# index labels carried by the split. Re-running this cell therefore gives the same
# result instead of taking the median of labels that are already 0/1.
# NOTE: after this cell y_train / y_test hold class labels, so the regression
# cells above must not be re-run without re-running the split first.
interactions = posts_df['Total Interactions']

# The threshold is computed from the training rows only
median = interactions.loc[y_train.index].median()

y_train = (interactions.loc[y_train.index] > median).astype(int)
y_test = (interactions.loc[y_test.index] > median).astype(int)

In [11]:
# LogisticRegression (lbfgs solver, at most 250 iterations): fit on the training
# split, predict the held-out split
lr_clf = LogisticRegression(solver='lbfgs', max_iter=250)
lr_clf.fit(X_train, y_train)
pred_y = lr_clf.predict(X_test)

# Overall accuracy followed by the per-class precision / recall / f1 table
accuracy = metrics.accuracy_score(y_test, pred_y)
report = metrics.classification_report(y_test, pred_y)
print('accuracy:', accuracy)
print('classification_report:\n', report)

accuracy: 0.5271317829457365
classification_report:
               precision    recall  f1-score   support

           0       0.59      0.54      0.56       145
           1       0.46      0.51      0.49       113

    accuracy                           0.53       258
   macro avg       0.53      0.53      0.52       258
weighted avg       0.53      0.53      0.53       258



In [12]:
# KNeighborsClassifier with default settings: fit on the training split,
# predict the held-out split
kn_clf = KNeighborsClassifier()
kn_clf.fit(X_train, y_train)
pred_y = kn_clf.predict(X_test)

# Overall accuracy followed by the per-class precision / recall / f1 table
accuracy = metrics.accuracy_score(y_test, pred_y)
report = metrics.classification_report(y_test, pred_y)
print('accuracy:', accuracy)
print('classification_report:\n', report)

accuracy: 0.5387596899224806
classification_report:
               precision    recall  f1-score   support

           0       0.59      0.57      0.58       145
           1       0.47      0.50      0.48       113

    accuracy                           0.54       258
   macro avg       0.53      0.53      0.53       258
weighted avg       0.54      0.54      0.54       258



In [13]:
# Train and test the RandomForestClassifier model.
# The forest is randomized (bootstrap samples, feature sub-sampling), so a fixed
# random_state is required for the printed metrics to be reproducible on re-run.
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
pred_y = rf_clf.predict(X_test)

print('accuracy:', metrics.accuracy_score(y_test, pred_y))
print('classification_report:\n', metrics.classification_report(y_test, pred_y))

accuracy: 0.5581395348837209
classification_report:
               precision    recall  f1-score   support

           0       0.58      0.81      0.67       145
           1       0.49      0.24      0.32       113

    accuracy                           0.56       258
   macro avg       0.53      0.52      0.50       258
weighted avg       0.54      0.56      0.52       258

