In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
# Text processing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
# Image processing
from skimage.io import imread, imshow
from skimage.color import rgba2rgb
from skimage.transform import resize
from skimage.filters import sobel

In [2]:
# Load the train / test CSV files into DataFrames indexed by the 'id' column.
PATH = '../data/'
TRAIN_FILE = 'XYtr.csv'
TEST_FILE = 'Xte.csv'
IMAGE_FOLDER = '../data/images/images'
df_train = pd.read_csv(os.path.join(PATH, TRAIN_FILE), index_col='id')
df_test = pd.read_csv(os.path.join(PATH, TEST_FILE), index_col='id')

# description, version, symbol, fee1 and fee2 have missing values (NaN).
# info() prints its report itself and returns None, so it is not wrapped in
# print() (that would emit a stray "None" line after the report).
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6914 entries, a44a5f4c5e13910205404271e750e7bc to 62defe67d57479ab0cd6d1ffb6525cbb
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   X.sales      6914 non-null   int64  
 1   cdate        6914 non-null   object 
 2   description  6512 non-null   object 
 3   version      6746 non-null   object 
 4   symbol       5555 non-null   object 
 5   ext          6914 non-null   object 
 6   fee1         6696 non-null   float64
 7   fee2         6705 non-null   float64
 8   total        6914 non-null   float64
dtypes: float64(3), int64(1), object(5)
memory usage: 540.2+ KB
None


In [3]:
# Data cleaning: replace every NaN in both frames with a column-appropriate value.

# Sentinel fill values for the text / categorical columns:
#   description -> the token 'None' means "no description"
#   version     -> the column already has a 'None' category, so NaN joins it
#   symbol      -> symbols are 5-digit strings; '00000' represents "no symbol"
CATEGORICAL_FILL = {'description': 'None', 'version': 'None', 'symbol': '00000'}
for column, fill_value in CATEGORICAL_FILL.items():
    df_train[column] = df_train[column].fillna(fill_value)
    df_test[column] = df_test[column].fillna(fill_value)

# fee1 / fee2: only a small number of values are missing, so fill with the mean.
# The mean is computed on the training rows only and reused for the test rows,
# so both frames receive the same fill value and no test-set statistic is used
# during preprocessing.
for column in ('fee1', 'fee2'):
    train_mean = df_train[column].mean()
    df_train[column] = df_train[column].fillna(train_mean)
    df_test[column] = df_test[column].fillna(train_mean)

# info() prints its report itself and returns None, so no print() wrapper.
df_train.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6914 entries, a44a5f4c5e13910205404271e750e7bc to 62defe67d57479ab0cd6d1ffb6525cbb
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   X.sales      6914 non-null   int64  
 1   cdate        6914 non-null   object 
 2   description  6914 non-null   object 
 3   version      6914 non-null   object 
 4   symbol       6914 non-null   object 
 5   ext          6914 non-null   object 
 6   fee1         6914 non-null   float64
 7   fee2         6914 non-null   float64
 8   total        6914 non-null   float64
dtypes: float64(3), int64(1), object(5)
memory usage: 540.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 6914 entries, 7e79f1a9cb10504dd2fc569d84f2a346 to 3b665129694904b2024dc7cd8230babe
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   X.sales      6914 non-null   int64  
 1   cdate        6914 

In [4]:
def encode_and_bind(original_dataframe, feature_to_encode):
    """One-hot encode a single column and return the frame with it replaced.

    Parameters
    ----------
    original_dataframe : pandas.DataFrame
        Frame containing the column to encode. It is not modified.
    feature_to_encode : str
        Name of the column to replace by indicator columns.

    Returns
    -------
    pandas.DataFrame
        The remaining columns in their original order, followed by the
        indicator columns named '<feature>_<value>'. The first level is
        dropped (drop_first=True).
    """
    # Passing columns=[...] encodes the column whatever its dtype. Calling
    # get_dummies on a one-column frame without it leaves a numeric column
    # untouched, and the subsequent concat + drop-by-name would then remove
    # the feature entirely instead of encoding it.
    return pd.get_dummies(original_dataframe, columns=[feature_to_encode], drop_first=True)

# Train and test rows are stacked into one frame so that one-hot encoding
# produces the same set of feature columns for both.
train_size = len(df_train)
test_size = len(df_test)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_train_test = pd.concat([df_train, df_test])

# One-hot encode version, symbol and ext (in that order).
for categorical_column in ('version', 'symbol', 'ext'):
    df_train_test = encode_and_bind(df_train_test, categorical_column)

# info() prints its report itself and returns None, so no print() wrapper.
df_train_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13828 entries, a44a5f4c5e13910205404271e750e7bc to 3b665129694904b2024dc7cd8230babe
Columns: 587 entries, X.sales to ext_.png
dtypes: float64(3), int64(1), object(2), uint8(581)
memory usage: 8.4+ MB
None


In [5]:
def extract_text_features(method, df_train_test):
    """Append text features built from the 'description' column.

    Parameters
    ----------
    method : str
        Feature extraction method. Only 'BOG' (bag-of-words token counts)
        is implemented.
    df_train_test : pandas.DataFrame
        Frame with a 'description' column of strings. The vocabulary is
        built from every row of this frame.

    Returns
    -------
    pandas.DataFrame
        df_train_test with one count column per vocabulary token appended.

    Raises
    ------
    ValueError
        If `method` is not a supported value. Previously the function fell
        through and returned None, which silently replaced the caller's frame.
    """
    if method != 'BOG':
        raise ValueError("Unsupported text feature method: {!r} (expected 'BOG')".format(method))

    # Create the corpus using the training and test data
    description = df_train_test['description']
    vectorizer = CountVectorizer()
    # fit_transform already returns the document-term counts, so the corpus
    # does not need to be passed through transform() a second time.
    counts = vectorizer.fit_transform(list(description))
    features = vectorizer.get_feature_names_out()

    # NOTE(review): a token equal to an existing column name would produce a
    # duplicate column label after the concat below -- confirm downstream
    # code tolerates this.
    word_features = pd.DataFrame(counts.toarray(), columns=features, index=df_train_test.index)
    print(df_train_test.shape)
    print(word_features.shape)
    return pd.concat([df_train_test, word_features], axis=1)


# Append the bag-of-words count columns, then report the new frame shape.
df_train_test = extract_text_features(method='BOG', df_train_test=df_train_test)
print(df_train_test.shape)


(13828, 587)
(13828, 14381)
(13828, 14968)


In [None]:
def image_text_features(method, df_train_test):
    '''
    Append Sobel edge-map pixel features from the image folder to the frame.

    Each file in IMAGE_FOLDER is matched to a row through the part of its
    file name before the first '.'. Matching images are read as grayscale,
    resized to 16x16, passed through the Sobel filter and flattened into the
    columns pixel0 .. pixel255. Rows without a usable image keep zeros.

    Parameters
    ----------
    method : str
        Only 'EDGE' extracts features; any other value leaves every pixel
        column at zero.
    df_train_test : pandas.DataFrame
        Frame whose index holds the ids that image file names are matched
        against.

    Returns
    -------
    pandas.DataFrame
        df_train_test left-joined with the pixel feature columns.
    '''
    IMAGE_HEIGHT = 16
    IMAGE_WIDTH = 16
    n_pixels = IMAGE_HEIGHT * IMAGE_WIDTH
    image_features = ['pixel{}'.format(num) for num in range(n_pixels)]
    # Start from float zeros: the Sobel responses are floats, and writing them
    # into an integer-initialised frame forces a dtype upcast on assignment.
    image_features_df = pd.DataFrame(0.0, index=df_train_test.index, columns=image_features)

    known_ids = set(df_train_test.index)
    for image_file in tqdm(os.listdir(IMAGE_FOLDER)):
        image_id = image_file.split('.')[0]
        # Assigning an unknown id through .loc would append a new row that the
        # left join below discards anyway, so skip it before reading the file.
        if method != 'EDGE' or image_id not in known_ids:
            continue
        image = imread(os.path.join(IMAGE_FOLDER, image_file), as_gray=True)
        height, width = image.shape
        # Images covering more than 96*96 pixels are skipped; their rows stay zero.
        if height * width > 96 * 96:
            continue
        image = resize(image, (IMAGE_HEIGHT, IMAGE_WIDTH))
        image_features_df.loc[image_id, :] = sobel(image).ravel()

    return df_train_test.join(image_features_df, how='left')

# Report the frame shape before and after appending the Sobel edge features.
print(df_train_test.shape)
df_train_test = image_text_features(method='EDGE', df_train_test=df_train_test)
print(df_train_test.shape)


(13828, 14968)


 64%|████████████████████████████████████████████████████████████████▌                                    | 5671/8873 [07:14<03:36, 14.81it/s]

In [None]:
# TODO: Write to new csv for model training