In [24]:
import pandas as pd
import numpy as np

In [25]:
# Load the training CSV into a DataFrame.
PATH = '../data/'
FILE = 'XYtr.csv'
df_train = pd.read_csv(PATH + FILE)

# The summary shows that description, version, symbol, fee1, and fee2 have
# missing values (NaN).
# DataFrame.info() prints the summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6914 entries, 0 to 6913
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           6914 non-null   object 
 1   X.sales      6914 non-null   int64  
 2   cdate        6914 non-null   object 
 3   description  6512 non-null   object 
 4   version      6746 non-null   object 
 5   symbol       5555 non-null   object 
 6   ext          6914 non-null   object 
 7   fee1         6696 non-null   float64
 8   fee2         6705 non-null   float64
 9   total        6914 non-null   float64
dtypes: float64(3), int64(1), object(6)
memory usage: 540.3+ KB
None


In [26]:
# Data cleaning: replace every missing value in df_train.

# version already has a literal 'None' category (see the printed values), so
# missing entries are folded into it.
print(df_train['version'].unique())

# One fill value per column that contains NaN.
fill_values = {
    # description: use the token 'None' to mean "no description".
    'description': 'None',
    # version: reuse the existing 'None' category.
    'version': 'None',
    # symbol: 5-digit symbols; '00000' represents "no symbol".
    'symbol': '00000',
    # fee1 / fee2: only a small number are missing, so fill with the column mean.
    'fee1': df_train['fee1'].mean(),
    'fee2': df_train['fee2'].mean(),
}
df_train = df_train.fillna(value=fill_values)

# DataFrame.info() prints the summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df_train.info()

['3' 'None' 'unsupported' '4' nan '1' '2']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6914 entries, 0 to 6913
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           6914 non-null   object 
 1   X.sales      6914 non-null   int64  
 2   cdate        6914 non-null   object 
 3   description  6914 non-null   object 
 4   version      6914 non-null   object 
 5   symbol       6914 non-null   object 
 6   ext          6914 non-null   object 
 7   fee1         6914 non-null   float64
 8   fee2         6914 non-null   float64
 9   total        6914 non-null   float64
dtypes: float64(3), int64(1), object(6)
memory usage: 540.3+ KB
None


In [27]:
def encode_and_bind(original_dataframe, feature_to_encode):
    """Replace one column of a DataFrame with its one-hot indicator columns.

    Parameters
    ----------
    original_dataframe : pandas.DataFrame
        Frame containing the column to encode. It is not modified.
    feature_to_encode : str
        Label of the column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every other column of ``original_dataframe``
        followed by indicator columns named ``<feature_to_encode>_<value>``.
        The first category is dropped (``drop_first=True``), so a column with
        k distinct values yields k - 1 indicator columns.

    Raises
    ------
    KeyError
        If ``feature_to_encode`` is not a column of ``original_dataframe``.
    """
    # `columns=` forces the encoding regardless of dtype. Without it,
    # get_dummies passes non-object columns (e.g. numeric) through unchanged
    # under the same label, and the drop below would then remove both copies,
    # silently losing the feature altogether.
    dummies = pd.get_dummies(
        original_dataframe[[feature_to_encode]],
        columns=[feature_to_encode],
        drop_first=True,
    )
    res = pd.concat([original_dataframe, dummies], axis=1)
    return res.drop(columns=[feature_to_encode])


# One-hot encode the categorical columns one after another, in this order:
# version, then symbol, then ext. Each step replaces the named column with
# its indicator columns.
df_train = (
    df_train
    .pipe(encode_and_bind, 'version')
    .pipe(encode_and_bind, 'symbol')
    .pipe(encode_and_bind, 'ext')
)

In [None]:
# TODO: text feature extraction


In [None]:
# TODO: image feature extraction

In [None]:
# TODO: Write to new csv for model training