# Imports

In [1]:
import pandas as pd
import numpy as np
from PIL import Image
import os

import string 

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate

from sklearn.svm import LinearSVC

import gensim
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import text_to_word_sequence

from joblib import dump

# Exploration

## Raw data import 

In [10]:
# Load the training features and labels; both CSVs are read as space-delimited
# and indexed by their "Id" column.
X = pd.read_csv("../raw_data/train_x.csv", delimiter=" ", index_col="Id")
y = pd.read_csv("../raw_data/train_y.csv", delimiter=" ", index_col="Id")
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects and can run
# code embedded in the file — only load this .npy from a trusted source.
y_umbr = np.load("../raw_data/tmp_data/y_train_umbr.npy", encoding='bytes', allow_pickle=True)

In [3]:
# Preview the raw feature table (cover image file name and title per Id).
X

Unnamed: 0_level_0,Image_name,Title
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1588345297.jpg,With Schwarzkopf: Life Lessons of The Bear
1,1404803335.jpg,"Magnets: Pulling Together, Pushing Apart (Amaz..."
2,1446276082.jpg,Energy Security (SAGE Library of International...
3,1491522666.jpg,An Amish Gathering: Life in Lancaster County
4,0970096410.jpg,City of Rocks Idaho: A Climber's Guide (Region...
...,...,...
51295,0399176055.jpg,Extreme Prey
51296,0719816629.jpg,The Crime Writer's Guide to Police Practice an...
51297,0385353596.jpg,The Jezebel Remedy: A novel
51298,1475988559.jpg,Startup


In [11]:
# Wrap the raw umbrella-label array in a DataFrame and give its first four
# positional columns readable names.
y_umbr_df = pd.DataFrame(y_umbr)
y_umbr_df = y_umbr_df.rename(
    columns={
        y_umbr_df.columns[position]: label
        for position, label in enumerate(
            ('Genre_id', 'Genre_name', 'Genre_umbr_name', 'Genre_umbr_id')
        )
    }
)

y_umbr_df

Unnamed: 0,Genre_id,Genre_name,Genre_umbr_name,Genre_umbr_id
0,1,Biographies & Memoirs,History & Religion,1
1,4,Children's Books,Relationships & Children,2
2,10,Engineering & Transportation,Academic - Science,3
3,9,Christian Books & Bibles,History & Religion,1
4,26,Sports & Outdoors,Lifestyle & Fitness,4
...,...,...,...,...
51295,17,"Mystery, Thriller & Suspense",Literature & Entertainment,5
51296,17,"Mystery, Thriller & Suspense",Literature & Entertainment,5
51297,17,"Mystery, Thriller & Suspense",Literature & Entertainment,5
51298,17,"Mystery, Thriller & Suspense",Literature & Entertainment,5


In [12]:
# Lookup table from umbrella genre id to umbrella genre name. Ids repeat across
# rows; a later row simply overwrites the entry written by an earlier one.
genre_dict = dict(zip(y_umbr_df['Genre_umbr_id'], y_umbr_df['Genre_umbr_name']))
genre_dict

  return Index(sequences[0], name=names)


{1: 'History & Religion',
 2: 'Relationships & Children',
 3: 'Academic - Science',
 4: 'Lifestyle & Fitness',
 6: 'Academic - Humanities',
 7: 'Arts & Hobbies',
 5: 'Literature & Entertainment',
 8: 'Academic - Pure'}

## Preliminary Data Preprocessing

In [13]:
# Sanity-check the colour extraction on the first five covers before running it
# over the whole dataset.
sample_most_frequent = []

for image_name in X['Image_name'][:5]:
    image_path = os.path.join("../raw_data/Images/", image_name)
    # The context manager closes the file handle. convert("RGB") guarantees
    # exactly three channels, so the reshape below is valid even for
    # grayscale, palette or RGBA covers.
    with Image.open(image_path) as image:
        image_array = np.array(image.convert("RGB"))

    # One row per pixel, one column per colour channel.
    flattened_pixels = image_array.reshape(-1, 3)
    average_color = np.mean(flattened_pixels, axis=0)
    # Count identical (r, g, b) triples to find the dominant colour.
    unique, counts = np.unique(flattened_pixels, return_counts=True, axis=0)
    most_frequent_index = np.argmax(counts)
    most_frequent_color = unique[most_frequent_index]
    print(most_frequent_color, average_color)

    sample_most_frequent.append(most_frequent_color)

# Build the flat float array once instead of re-allocating with np.append on
# every iteration; the printed (n, 3) view is unchanged.
array_test = np.array(sample_most_frequent, dtype=float).ravel()

print(array_test.reshape(-1, 3))

[92 97 74] [144.03354193 144.52836017 126.17757494]
[105 183 222] [121.7903978  155.68775909 171.20017538]
[147   3   3] [121.54918686   8.1564493    7.39793925]
[7 0 0] [193.15625    182.0331832  103.95565609]
[ 81 123 205] [112.07995855 136.00370695 169.8401427 ]
[[ 92.  97.  74.]
 [105. 183. 222.]
 [147.   3.   3.]
 [  7.   0.   0.]
 [ 81. 123. 205.]]


In [14]:
# Per-cover colour features: mean RGB and most frequent RGB triple of the
# cover after downsizing to 100x100.
average_colors = []
most_frequent_colors = []

for image_name in X['Image_name']:
    image_path = os.path.join("../raw_data/Images/", image_name)
    # Close every file handle (tens of thousands of covers are opened here) and
    # force three channels so non-RGB covers do not break the reshape below.
    with Image.open(image_path) as image:
        image_array = np.array(image.convert("RGB").resize((100, 100)))

    # One row per pixel, one column per colour channel.
    flattened_pixels = image_array.reshape(-1, 3)
    average_color = np.mean(flattened_pixels, axis=0)
    # Count identical (r, g, b) triples to find the dominant colour.
    unique, counts = np.unique(flattened_pixels, return_counts=True, axis=0)
    most_frequent_index = np.argmax(counts)
    most_frequent_color = unique[most_frequent_index]

    average_colors.append(average_color)
    most_frequent_colors.append(most_frequent_color)

average_colors = np.array(average_colors)
most_frequent_colors = np.array(most_frequent_colors)

In [15]:
# Put the mean-colour and dominant-colour channels side by side: one row per
# cover, six colour columns.
colors = np.concatenate((average_colors, most_frequent_colors), axis=1)
extracted_features = pd.DataFrame(
    colors,
    columns=[
        "average_r", "average_g", "average_b",
        "most_freq_r", "most_freq_g", "most_freq_b",
    ],
)

In [16]:
# Split every title on whitespace and record how many words it contains.
title_words = np.array(X["Title"].str.split())

count_words = np.array(list(map(len, title_words)))
extracted_features["word_counts"] = count_words

In [17]:
# Inspect the per-title word counts computed above.
count_words

array([7, 7, 7, ..., 5, 1, 8])

In [18]:
def count_capital_letters(s):
    """Return the number of uppercase characters in ``s``."""
    return sum(c.isupper() for c in s)

In [19]:
# Uppercase characters per word: per-title uppercase counts divided
# element-wise by the per-title word counts.
capital_counts = X["Title"].map(count_capital_letters)
capital_ratio = (capital_counts / count_words).to_numpy()
extracted_features["capital_ratio"] = capital_ratio

In [20]:
def count_letters(s):
    """Return the number of alphabetic characters in ``s``."""
    return sum(c.isalpha() for c in s)

In [21]:
# Alphabetic characters per word: per-title letter counts divided element-wise
# by the per-title word counts.
letter_counts = X["Title"].map(count_letters)
letter_ratio = (letter_counts / count_words).to_numpy()
extracted_features["letter_ratio"] = letter_ratio

In [22]:
def count_punctuation(s):
    """Return how many characters of ``s`` appear in ``string.punctuation``."""
    return sum(c in string.punctuation for c in s)

In [23]:
# Number of punctuation characters in each title. The result is a Series, so
# the assignment aligns it to extracted_features by index label.
punct_counts = X["Title"].map(count_punctuation)
extracted_features["punct_counts"] = punct_counts

In [24]:
# Inspect the hand-crafted colour and title features assembled so far.
extracted_features

Unnamed: 0,average_r,average_g,average_b,most_freq_r,most_freq_g,most_freq_b,word_counts,capital_ratio,letter_ratio,punct_counts
0,143.9203,144.4273,126.1171,92.0,97.0,74.0,7,0.857143,5.000000,1
1,121.6648,155.5735,171.1021,105.0,183.0,222.0,7,1.000000,6.857143,4
2,121.4898,8.0966,7.3227,147.0,3.0,3.0,7,1.285714,6.857143,2
3,193.1224,181.9881,103.9752,254.0,134.0,37.0,7,0.857143,5.285714,1
4,112.0715,136.0007,169.8334,81.0,123.0,205.0,11,0.909091,5.090909,4
...,...,...,...,...,...,...,...,...,...,...
51295,144.2605,98.0033,80.7424,255.0,255.0,255.0,2,1.000000,5.500000,0
51296,91.7243,81.9982,50.0007,25.0,23.0,44.0,9,0.777778,5.333333,1
51297,68.6990,63.3906,72.7776,19.0,22.0,27.0,5,0.800000,4.400000,1
51298,118.8070,82.1060,69.3474,56.0,52.0,51.0,1,1.000000,7.000000,0


# Pipeline

## Preprocessing Pipeline

In [25]:
# CREATING CLASS AND ADDING TO PIPELINE
class EmbeddingFeatureExtractor(BaseEstimator, TransformerMixin):
    """Represent each title by the mean of its Word2Vec word vectors.

    The Word2Vec model is trained once, in ``fit``, on the titles passed there
    and is then reused by ``transform``. Training inside ``transform`` (as this
    class previously did) would learn a brand-new, unrelated embedding space on
    every call, so features produced for unseen data would not be comparable
    with the features the downstream model was trained on.

    Parameters
    ----------
    vector_size : int, default=10
        Dimensionality of the word vectors (forwarded to ``Word2Vec``).
    min_count : int, default=10
        Minimum word frequency for a word to enter the vocabulary
        (forwarded to ``Word2Vec``).
    window : int, default=5
        Context window size (forwarded to ``Word2Vec``).

    Attributes
    ----------
    word2vec_ : gensim.models.Word2Vec
        The model trained during ``fit``.
    """

    def __init__(self, vector_size=10, min_count=10, window=5):
        self.vector_size = vector_size
        self.min_count = min_count
        self.window = window

    @staticmethod
    def _tokenize(X):
        """Split every entry of ``X['Title']`` into a list of word tokens."""
        # str() keeps non-string entries (e.g. missing values) from breaking
        # the tokenizer.
        return [text_to_word_sequence(str(title)) for title in X['Title']]

    def fit(self, X, y=None):
        """Train the Word2Vec model on the titles in ``X['Title']``.

        ``y`` is accepted for scikit-learn compatibility and ignored.
        Returns ``self``.
        """
        # NOTE: no seed is passed to Word2Vec, so embeddings may differ
        # between runs.
        self.word2vec_ = Word2Vec(
            sentences=self._tokenize(X),
            vector_size=self.vector_size,
            min_count=self.min_count,
            window=self.window,
        )
        return self

    def transform(self, X):
        """Return one averaged embedding per title.

        Words missing from the fitted vocabulary are skipped; a title with no
        known word is represented by a zero vector.

        Returns
        -------
        numpy.ndarray of shape (n_titles, vector_size)

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "word2vec_")
        word_vectors = self.word2vec_.wv
        embedding_dim = self.word2vec_.vector_size

        title_embeddings_avg = []
        for title in self._tokenize(X):
            known_vectors = [
                word_vectors[word]
                for word in title
                if word in word_vectors.key_to_index
            ]
            if known_vectors:
                title_embeddings_avg.append(np.mean(known_vectors, axis=0))
            else:
                title_embeddings_avg.append(np.zeros(embedding_dim))

        # reshape keeps the output two-dimensional even when X has no rows.
        return np.array(title_embeddings_avg).reshape(-1, embedding_dim)

In [26]:
class ImageFeatureExtractor(BaseEstimator, TransformerMixin):
    """Extract six colour features from each cover image.

    For every file name in ``X['Image_name']`` the image is opened, converted
    to RGB, resized, and summarised by its mean RGB colour and its most
    frequent RGB triple. All six values are scaled to [0, 1] by dividing by 255.

    Parameters
    ----------
    image_dir : str, default="../raw_data/Images/"
        Directory that the file names in ``X['Image_name']`` are relative to.
    size : tuple of int, default=(100, 100)
        Size each image is resized to before pixels are counted.
    """

    def __init__(self, image_dir="../raw_data/Images/", size=(100, 100)):
        self.image_dir = image_dir
        self.size = size

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn. Returns ``self``."""
        return self

    def _color_features(self, image_name):
        """Return ``[avg_r, avg_g, avg_b, freq_r, freq_g, freq_b]`` for one image."""
        image_path = os.path.join(self.image_dir, image_name)
        # The context manager closes the file handle. convert("RGB") guarantees
        # three channels so the reshape is valid for grayscale, palette and
        # RGBA images as well.
        with Image.open(image_path) as image:
            image_array = np.array(image.convert("RGB").resize(self.size))

        # One row per pixel, one column per colour channel.
        flattened_pixels = image_array.reshape(-1, 3)
        average_color = np.mean(flattened_pixels, axis=0)
        # Count identical (r, g, b) triples to find the dominant colour.
        unique, counts = np.unique(flattened_pixels, return_counts=True, axis=0)
        most_frequent_color = unique[np.argmax(counts)]
        return np.concatenate((average_color, most_frequent_color))

    def transform(self, X):
        """Return the colour features for every image named in ``X['Image_name']``.

        Returns
        -------
        numpy.ndarray of shape (n_images, 6)
            Mean RGB followed by most frequent RGB, each divided by 255.
        """
        features = [self._color_features(image_name) for image_name in X['Image_name']]
        # reshape keeps the output two-dimensional even when X has no rows.
        return np.array(features, dtype=float).reshape(-1, 6) / 255

In [27]:
class TitleFeatureExtractor(BaseEstimator, TransformerMixin):
    """Hand-crafted numeric features computed from the ``Title`` column.

    Output columns, in order: word count, uppercase characters per word,
    alphabetic characters per word, punctuation count, and ``:``/``-``
    characters per word.

    The counting helpers are static methods rather than lambdas stored on the
    instance: lambdas defined inside ``__init__`` cannot be pickled, which
    prevents saving the fitted estimator with ``joblib.dump``. They remain
    callable as ``self.count_words(...)`` etc.
    """

    @staticmethod
    def count_words(s):
        """Number of whitespace-separated words in ``s``."""
        return len(s.split())

    @staticmethod
    def count_capital_letters(s):
        """Number of uppercase characters in ``s``."""
        return sum(1 for c in s if c.isupper())

    @staticmethod
    def count_letters(s):
        """Number of alphabetic characters in ``s``."""
        return sum(1 for c in s if c.isalpha())

    @staticmethod
    def count_punctuation(s):
        """Number of characters of ``s`` found in ``string.punctuation``."""
        return sum(1 for c in s if c in string.punctuation)

    @staticmethod
    def count_special_chars(s):
        """Number of ``:`` and ``-`` characters in ``s``."""
        return sum(1 for c in s if c == ':' or c == '-')

    @staticmethod
    def _per_word(count, n_words):
        """Return ``count / n_words``, or 0.0 when the title has no words."""
        return count / n_words if n_words else 0.0

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn. Returns ``self``."""
        return self

    def transform(self, X):
        """Compute the title features for every row of ``X['Title']``.

        Missing titles are treated as empty strings, and titles without any
        word get 0.0 for the per-word ratios instead of raising
        ``ZeroDivisionError``.

        Returns
        -------
        numpy.ndarray of shape (n_titles, 5), dtype float
        """
        titles = X['Title'].fillna("").astype(str)

        rows = []
        for title in titles:
            n_words = self.count_words(title)
            rows.append((
                n_words,
                self._per_word(self.count_capital_letters(title), n_words),
                self._per_word(self.count_letters(title), n_words),
                self.count_punctuation(title),
                self._per_word(self.count_special_chars(title), n_words),
            ))

        # reshape keeps the output two-dimensional even when X has no rows.
        return np.array(rows, dtype=float).reshape(-1, 5)

In [28]:

# One sub-pipeline per feature family; the ColumnTransformer below routes the
# relevant input column to each of them and concatenates their outputs.
image_pipeline = Pipeline(steps=[('image_features', ImageFeatureExtractor())])

title_pipeline = Pipeline(
    steps=[
        ('title_features', TitleFeatureExtractor()),
        ('normalize_text', MinMaxScaler()),
    ]
)

embeddings = Pipeline(steps=[('title_embeddings', EmbeddingFeatureExtractor())])

preprocessing = ColumnTransformer(
    transformers=[
        ("image", image_pipeline, ["Image_name"]),
        ("title", title_pipeline, ["Title"]),
        ("embeddings", embeddings, ["Title"]),
    ]
)

In [29]:
# Display the assembled ColumnTransformer.
preprocessing

In [30]:
# Fit every preprocessing step on the full training table and build the
# combined feature matrix in one pass.
X_preproc = preprocessing.fit_transform(X)

In [55]:
# Two alternative targets: the fine-grained genre id and the coarser umbrella
# genre id (cast to int).
y_preproc = y["Genre_id"]
y_preproc_umbr = y_umbr_df['Genre_umbr_id'].astype(int)

In [56]:
# Quick check of the feature-matrix shape and the umbrella target before modelling.
print(X_preproc.shape)
print(y_preproc_umbr)

(51300, 21)
0        1
1        2
2        3
3        1
4        4
        ..
51295    5
51296    5
51297    5
51298    5
51299    5
Name: Genre_umbr_id, Length: 51300, dtype: int64


# Model

## Cross Validate

In [46]:
# Baseline linear SVM used for cross-validation, with an explicit iteration cap of 10000.
model = LinearSVC(max_iter=10000)

In [57]:
# 5-fold cross-validation of the baseline on the umbrella genre labels.
# NOTE(review): X_preproc comes from preprocessing.fit_transform on the whole
# dataset, so the MinMaxScaler and title embeddings were fitted on rows that
# end up in the validation folds — consider cross-validating a Pipeline that
# wraps preprocessing + model instead.
scores = cross_validate(model, X_preproc, y_preproc_umbr, cv=5)

In [58]:
# Umbrella labels (8 categories): average test score across the CV folds.
score = np.mean(scores["test_score"])
print(f'Mean Umbrella Score: {score*100:.2f}%')

Mean Umbrella Score: 40.51%


In [48]:
# 30 categories
# Cross-validate against the fine-grained genre labels here. The `scores`
# variable holds the umbrella-label results, so reusing it would report the
# 8-category accuracy under the 30-category heading on a fresh Run All.
scores_genre = cross_validate(model, X_preproc, y_preproc, cv=5)
score = scores_genre["test_score"].mean()
print(f'Mean Score: {score*100:.2f}%')

Mean Score: 22.94%


## Train and save model

In [14]:
# Use the same hyper-parameters as the cross-validated model so the reported
# scores describe the model that is actually trained and saved.
model = LinearSVC(max_iter=10000)

In [59]:
# Train the final classifier on the full feature matrix with the umbrella genre labels.
model.fit(X_preproc, y_preproc_umbr)

In [24]:
# Persisting the fitted model is currently disabled; uncomment to write it to disk.
# NOTE(review): only the classifier is saved here — the preprocessing
# transformers are not part of this artifact.
#dump(model, "../models/baseline_svc_v1.pkl")

['../models/baseline_svc_v1.pkl']