# Imports

In [218]:
import pandas as pd
import numpy as np
from PIL import Image
import os

import string 

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate

from sklearn.svm import LinearSVC

# Exploration

## Raw data import 

In [32]:
X = pd.read_csv("../raw_data/train_x.csv", delimiter=" ", index_col="Id")
y = pd.read_csv("../raw_data/train_y.csv", delimiter=" ", index_col="Id")

In [33]:
X

Unnamed: 0_level_0,Image_name,Title
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1588345297.jpg,With Schwarzkopf: Life Lessons of The Bear
1,1404803335.jpg,"Magnets: Pulling Together, Pushing Apart (Amaz..."
2,1446276082.jpg,Energy Security (SAGE Library of International...
3,1491522666.jpg,An Amish Gathering: Life in Lancaster County
4,0970096410.jpg,City of Rocks Idaho: A Climber's Guide (Region...
...,...,...
51295,0399176055.jpg,Extreme Prey
51296,0719816629.jpg,The Crime Writer's Guide to Police Practice an...
51297,0385353596.jpg,The Jezebel Remedy: A novel
51298,1475988559.jpg,Startup


In [34]:
y

Unnamed: 0_level_0,Genre_id,Genre_name
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,Biographies & Memoirs
1,4,Children's Books
2,10,Engineering & Transportation
3,9,Christian Books & Bibles
4,26,Sports & Outdoors
...,...,...
51295,17,"Mystery, Thriller & Suspense"
51296,17,"Mystery, Thriller & Suspense"
51297,17,"Mystery, Thriller & Suspense"
51298,17,"Mystery, Thriller & Suspense"


In [35]:
genre_dict = y.set_index('Genre_id')['Genre_name'].to_dict()

## Preliminary Data Preprocessing

In [170]:
average_colors = []
most_frequent_colors = []
missing_images = []


for image_name in X['Image_name']:
    image_path = os.path.join("../raw_data/Images/", image_name)
    image = Image.open(image_path)
    image_array = np.array(image)
    average_color = image_array.mean(axis=(0, 1))
    
    flattened_pixels = image_array.reshape(-1, 3)
    color_counts = np.bincount(flattened_pixels.argmax(axis=1))
    most_frequent_color_idx = color_counts.argmax()
    most_frequent_color = flattened_pixels[flattened_pixels.argmax(axis=1) == most_frequent_color_idx][0]

    average_colors.append(average_color)
    most_frequent_colors.append(most_frequent_color)
    
    
average_colors = np.array(average_colors)
most_frequent_colors = np.array(most_frequent_colors)

In [230]:
colors = np.hstack((average_colors, most_frequent_colors))
extracted_features = pd.DataFrame(colors, columns=["average_r", "average_g", "average_b", "most_freq_r", "most_freq_g", "most_freq_b"])

In [231]:
title_words = np.array(X["Title"].str.split())

count_words = np.array([len(x) for x in title_words])
extracted_features["word_counts"] = count_words

In [232]:
count_words

array([7, 7, 7, ..., 5, 1, 8])

In [233]:
count_capital_letters = lambda s: sum(1 for c in s if c.isupper())

In [234]:
capital_counts = X["Title"].apply(count_capital_letters)

capital_ratio = np.array(capital_counts/count_words)

extracted_features["capital_ratio"] = capital_ratio

In [235]:
count_letters = lambda s: sum(1 for c in s if c.isalpha())

In [236]:
letter_counts = X["Title"].apply(count_letters)

letter_ratio = np.array(letter_counts/count_words)

extracted_features["letter_ratio"] = letter_ratio 

In [237]:
count_punctuation = lambda s: sum(1 for c in s if c in string.punctuation)

In [238]:
punct_counts = X["Title"].apply(count_punctuation)
extracted_features["punct_counts"] = punct_counts

In [245]:
extracted_features

Unnamed: 0,average_r,average_g,average_b,most_freq_r,most_freq_g,most_freq_b,word_counts,capital_ratio,letter_ratio,punct_counts
0,144.033542,144.528360,126.177575,201.0,195.0,171.0,7,0.857143,5.000000,1
1,121.790398,155.687759,171.200175,138.0,211.0,246.0,7,1.000000,6.857143,4
2,121.549187,8.156449,7.397939,175.0,147.0,126.0,7,1.285714,6.857143,2
3,193.156250,182.033183,103.955656,0.0,3.0,0.0,7,0.857143,5.285714,1
4,112.079959,136.003707,169.840143,66.0,85.0,177.0,11,0.909091,5.090909,4
...,...,...,...,...,...,...,...,...,...,...
51295,144.386420,98.107721,80.842494,53.0,39.0,52.0,2,1.000000,5.500000,0
51296,91.739557,81.838090,49.942841,25.0,23.0,44.0,9,0.777778,5.333333,1
51297,68.825335,63.520069,72.904835,158.0,161.0,166.0,5,0.800000,4.400000,1
51298,118.814274,82.102240,69.356844,62.0,61.0,59.0,1,1.000000,7.000000,0


# Pipeline

## Preprocessing Pipeline

In [197]:
class ImageFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        average_colors = []
        most_frequent_colors = []
        
        for image_name in X['Image_name']:
            image_path = os.path.join("../raw_data/Images/", image_name)
            image = Image.open(image_path)
            image_array = np.array(image)
            average_color = image_array.mean(axis=(0, 1))

            flattened_pixels = image_array.reshape(-1, 3)
            color_counts = np.bincount(flattened_pixels.argmax(axis=1))
            most_common_color_idx = color_counts.argmax()
            most_common_color = flattened_pixels[flattened_pixels.argmax(axis=1) == most_common_color_idx][0]

            average_colors.append(average_color)
            most_frequent_colors.append(most_common_color)

        
        average_colors = np.array(average_colors)
        most_frequent_colors = np.array(most_frequent_colors)
        
        return np.hstack((average_colors, most_frequent_colors))/255

In [198]:
class TitleFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.count_words = lambda s: len(s.split())
        self.count_capital_letters = lambda s: sum(1 for c in s if c.isupper())
        self.count_letters = lambda s: sum(1 for c in s if c.isalpha())
        self.count_punctuation = lambda s: sum(1 for c in s if c in string.punctuation)
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        extracted_features = pd.DataFrame()
        extracted_features['count_words'] = X['Title'].apply(self.count_words)
        extracted_features['capital_ratio'] = X['Title'].apply(lambda s: self.count_capital_letters(s) / self.count_words(s))
        extracted_features['letter_ratio'] = X['Title'].apply(lambda s: self.count_letters(s) / self.count_words(s))
        extracted_features['punct_counts'] = X['Title'].apply(self.count_punctuation)
        
        return extracted_features.values

In [207]:
image_pipeline = Pipeline([
    ('image_features', ImageFeatureExtractor()),
])

title_pipeline = Pipeline([
    ('title_features', TitleFeatureExtractor()),
    ('normalize_text', MinMaxScaler())
])

preprocessing = ColumnTransformer([
    ("image", image_pipeline, ["Image_name"]),
    ("title", title_pipeline, ["Title"])
])

In [208]:
preprocessing

In [210]:
X_preproc = preprocessing.fit_transform(X)
y_preproc = y["Genre_id"]

In [219]:
X_preproc

array([[0.56483742, 0.56677788, 0.49481402, ..., 0.08571429, 0.23809524,
        0.02777778],
       [0.4776094 , 0.61054023, 0.67137324, ..., 0.1       , 0.32653061,
        0.11111111],
       [0.47666348, 0.03198608, 0.02901153, ..., 0.12857143, 0.32653061,
        0.05555556],
       ...,
       [0.26990327, 0.24909831, 0.28590131, ..., 0.08      , 0.20952381,
        0.02777778],
       [0.46593833, 0.32196957, 0.27198762, ..., 0.1       , 0.33333333,
        0.        ],
       [0.6876771 , 0.460935  , 0.35026784, ..., 0.1       , 0.22619048,
        0.02777778]])

# Model

In [212]:
model = LinearSVC()

In [246]:
scores = cross_validate(model, X_preproc, y_preproc, cv=5)

In [247]:
score = scores["test_score"].mean()
score

0.10315789473684212