In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from pprint import pprint

# Base Processing

### Read in training & test data

In [2]:
# Load the training and test CSV files into DataFrames
PATH = 'data/'
FILE_train = 'XYtr.csv'
FILE_test = 'Xte.csv'

raw_train = pd.read_csv(PATH + FILE_train)
raw_test = pd.read_csv(PATH + FILE_test)

# description, version, symbol, fee1 and fee2 have missing values (NaN).
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in print() appends a stray "None" line to the output.
raw_train.info()
print()
raw_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6914 entries, 0 to 6913
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           6914 non-null   object 
 1   X.sales      6914 non-null   int64  
 2   cdate        6914 non-null   object 
 3   description  6512 non-null   object 
 4   version      6746 non-null   object 
 5   symbol       5555 non-null   object 
 6   ext          6914 non-null   object 
 7   fee1         6696 non-null   float64
 8   fee2         6705 non-null   float64
 9   total        6914 non-null   float64
dtypes: float64(3), int64(1), object(6)
memory usage: 540.3+ KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6914 entries, 0 to 6913
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           6914 non-null   object 
 1   X.sales      6914 non-null   int64  
 2   cdate        6914 non-null   object 
 3 

### size of the dataframe

In [3]:
# Report the (rows, columns) shape of each raw frame
print('raw_train shape: ', raw_train.shape)
print('raw_test shape: ', raw_test.shape)

raw_train shape:  (6914, 10)
raw_test shape:  (6914, 9)


### description

In [4]:
# Number of rows without a description in each data set
print('raw_train description missing values: ', raw_train['description'].isna().sum())
print('raw_test description missing values: ', raw_test['description'].isna().sum())

raw_train description missing values:  402
raw_test description missing values:  377


### version

In [5]:
# Distinct values of the version column in each data set (NaN is listed too)
print('raw_train version: ', raw_train['version'].unique())
print('raw_test version: ', raw_test['version'].unique())

raw_train version:  ['3' 'None' 'unsupported' '4' nan '1' '2']
raw_test version:  ['3' '4' 'None' nan 'unsupported' '1' '2']


In [6]:
# Number of rows without a version in each data set
print('raw_train version missing values: ', raw_train['version'].isna().sum())
print('raw_test version missing values: ', raw_test['version'].isna().sum())

raw_train version missing values:  168
raw_test version missing values:  154


### symbol

In [7]:
# Number of distinct (non-null) symbol values in each data set
# https://stackoverflow.com/questions/45759966/counting-unique-values-in-a-column-in-pandas-dataframe-like-in-qlik/45760042
print('raw_train symbol: ', raw_train['symbol'].nunique())
print('raw_test symbol: ', raw_test['symbol'].nunique())

raw_train symbol:  417
raw_test symbol:  415


In [8]:
# Number of rows without a symbol in each data set
print('raw_train symbol missing values: ', raw_train['symbol'].isna().sum())
print('raw_test symbol missing values: ', raw_test['symbol'].isna().sum())

raw_train symbol missing values:  1359
raw_test symbol missing values:  1382


### ext

In [9]:
# Distinct file extensions in each data set
print('raw_train ext: ', raw_train['ext'].unique())
print('raw_test ext: ', raw_test['ext'].unique())

raw_train ext:  ['.png' '.jpg' '.gif']
raw_test ext:  ['.png' '.gif' '.jpg']


### fee1

In [10]:
# Number of rows without a fee1 value in each data set
print('raw_train fee1 missing values: ', raw_train['fee1'].isna().sum())
print('raw_test fee1 missing values: ', raw_test['fee1'].isna().sum())

raw_train fee1 missing values:  218
raw_test fee1 missing values:  284


### fee2

In [11]:
# Number of rows without a fee2 value in each data set
print('raw_train fee2 missing values: ', raw_train['fee2'].isna().sum())
print('raw_test fee2 missing values: ', raw_test['fee2'].isna().sum())

raw_train fee2 missing values:  209
raw_test fee2 missing values:  276


In [12]:
# Clean copies of the raw frames, so the cleaning cells never modify raw_train / raw_test
train_clean = raw_train.copy()
test_clean = raw_test.copy()

### Data Cleaning for training

In [13]:
# description: use the token 'None' to mean "no description"
train_clean['description'] = train_clean['description'].fillna('None')

# version: the column already has a 'None' category, so map NaN onto it
train_clean['version'] = train_clean['version'].fillna('None')

# symbol: 5 digit symbols. Use '00000' to represent "no symbol".
train_clean['symbol'] = train_clean['symbol'].fillna('00000')

# fee1 / fee2: only a small number of values are missing; fill them with the
# column median.
# The filled column is assigned back explicitly. Calling
# fillna(..., inplace=True) on train_clean['fee1'] is chained assignment: it is
# deprecated in pandas 2.x and, with Copy-on-Write enabled, only updates a
# temporary object and leaves train_clean unchanged.
# https://www.w3resource.com/python-exercises/pandas/missing-values/python-pandas-missing-values-exercise-14.php
train_clean['fee1'] = train_clean['fee1'].fillna(train_clean['fee1'].median())
train_clean['fee2'] = train_clean['fee2'].fillna(train_clean['fee2'].median())

# info() prints its own report and returns None, so it is not wrapped in print()
train_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6914 entries, 0 to 6913
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           6914 non-null   object 
 1   X.sales      6914 non-null   int64  
 2   cdate        6914 non-null   object 
 3   description  6914 non-null   object 
 4   version      6914 non-null   object 
 5   symbol       6914 non-null   object 
 6   ext          6914 non-null   object 
 7   fee1         6914 non-null   float64
 8   fee2         6914 non-null   float64
 9   total        6914 non-null   float64
dtypes: float64(3), int64(1), object(6)
memory usage: 540.3+ KB
None


### Data Cleaning for test

In [14]:
# description: use the token 'None' to mean "no description"
test_clean['description'] = test_clean['description'].fillna('None')

# version: the column already has a 'None' category, so map NaN onto it
test_clean['version'] = test_clean['version'].fillna('None')

# symbol: 5 digit symbols. Use '00000' to represent "no symbol".
test_clean['symbol'] = test_clean['symbol'].fillna('00000')

# fee1 / fee2: only a small number of values are missing; fill them with the
# column median.
# The filled column is assigned back explicitly. Calling
# fillna(..., inplace=True) on test_clean['fee1'] is chained assignment: it is
# deprecated in pandas 2.x and, with Copy-on-Write enabled, only updates a
# temporary object and leaves test_clean unchanged.
# NOTE(review): the median is taken from the test set itself; consider reusing
# the training medians so both sets share one imputation value.
test_clean['fee1'] = test_clean['fee1'].fillna(test_clean['fee1'].median())
test_clean['fee2'] = test_clean['fee2'].fillna(test_clean['fee2'].median())

# info() prints its own report and returns None, so it is not wrapped in print()
test_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6914 entries, 0 to 6913
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           6914 non-null   object 
 1   X.sales      6914 non-null   int64  
 2   cdate        6914 non-null   object 
 3   description  6914 non-null   object 
 4   version      6914 non-null   object 
 5   symbol       6914 non-null   object 
 6   ext          6914 non-null   object 
 7   fee1         6914 non-null   float64
 8   fee2         6914 non-null   float64
dtypes: float64(2), int64(1), object(6)
memory usage: 486.3+ KB
None


### Data preprocessing for training & test

In [15]:
# Feature-engineering copies of the cleaned frames; train_clean / test_clean stay as they are
train_processed = train_clean.copy()
test_processed = test_clean.copy()

In [16]:
# cdate: convert the dates to float days since the Unix epoch.
# The parsed values are cast to datetime64[ns] explicitly before being
# reinterpreted as numbers, so dividing by the nanoseconds-per-day constant
# stays correct even if pd.to_datetime returns a resolution other than
# nanoseconds (the original code silently assumed nanoseconds).
NS_PER_DAY = 8.64e+13

tr_date = train_processed['cdate']
train_processed['cdate'] = (
    pd.to_datetime(tr_date).values.astype('datetime64[ns]').astype(np.float64) / NS_PER_DAY
)

te_date = test_processed['cdate']
test_processed['cdate'] = (
    pd.to_datetime(te_date).values.astype('datetime64[ns]').astype(np.float64) / NS_PER_DAY
)

In [17]:
# X.sales: cast the integer counts to float64 in both frames
train_processed = train_processed.astype({'X.sales': np.float64})
test_processed = test_processed.astype({'X.sales': np.float64})

In [18]:
# Distinct symbol values present in each data set
train_symbols = set(train_processed['symbol'].unique())
test_symbols = set(test_processed['symbol'].unique())

# Every symbol seen in either data set
# https://stackoverflow.com/questions/52976664/python-differences-between-two-lists
# https://www.programiz.com/python-programming/methods/set/union
all_symbols = train_symbols.union(test_symbols)

# Symbols absent from the training set and from the test set respectively.
# sorted() is used instead of list(): the iteration order of a set of strings
# changes between interpreter runs (string hash randomisation), and these
# lists decide the order in which the padding columns are appended later, so
# an unsorted list makes the column layout differ from one kernel run to the next.
train_required_symbols = sorted(all_symbols - train_symbols)
test_required_symbols = sorted(all_symbols - test_symbols)

In [19]:
# One-hot encode the categorical columns (version, ext, symbol) of both frames;
# every category keeps its own indicator column (drop_first=False)
train_processed = pd.get_dummies(
    train_processed,
    columns=['version', 'ext', 'symbol'],
    prefix=['version', 'ext', 'symbol'],
    drop_first=False,
)
test_processed = pd.get_dummies(
    test_processed,
    columns=['version', 'ext', 'symbol'],
    prefix=['version', 'ext', 'symbol'],
    drop_first=False,
)

In [20]:
# Add an all-zero indicator column for every symbol that occurs only in the
# test set, so the training frame carries the full symbol vocabulary.
# A single reindex() replaces the per-column insert() loop: repeated insert()
# calls fragment the frame (pandas emits a PerformanceWarning for that), and
# insert() raises when the cell is re-run because the column already exists.
# Columns that are already present are skipped, which makes the cell re-runnable.
train_missing_cols = [
    'symbol_' + train_syms
    for train_syms in train_required_symbols
    if 'symbol_' + train_syms not in train_processed.columns
]
train_processed = train_processed.reindex(
    columns=[*train_processed.columns, *train_missing_cols],
    fill_value=0,
)
train_base = train_processed.copy()

  if (await self.run_code(code, result,  async_=asy)):


In [21]:
# Give the test frame exactly the training feature columns, in the same order.
# Appending the missing symbol columns at the end (as the insert() loop did)
# leaves train and test with the same column names but in different positions,
# which misaligns the features for any model fitted on the training frame.
# Reindexing against the training columns adds the absent indicator columns as
# zeros, fixes the order, and can be re-run safely. 'total' is the target and
# exists only in the training data, so it is excluded.
test_feature_cols = [col for col in train_processed.columns if col != 'total']
test_processed = test_processed.reindex(columns=test_feature_cols, fill_value=0)
test_base = test_processed.copy()

In [22]:
# Snapshot of the encoded training frame.
# NOTE(review): train_base is already assigned from train_processed.copy() in the cell that pads the symbol columns; this cell repeats it.
train_base = train_processed.copy()

In [23]:
# Snapshot of the encoded test frame.
# NOTE(review): test_base is already assigned from test_processed.copy() in the cell that pads the symbol columns; this cell repeats it.
test_base = test_processed.copy()

# Text Feature Extraction

In [24]:
# Sanity check: the description column should contain no nulls before vectorizing
print('train description missing values: ', train_base['description'].isna().sum())
print('test description missing values: ', test_base['description'].isna().sum())

train description missing values:  0
test description missing values:  0


### Create a corpus using training and test data

In [25]:
# Corpus: all training descriptions followed by all test descriptions
corpus = [*train_base['description'], *test_base['description']]

### Create a Document-Word matrix

In [26]:
# Create a Vectorizer Object
# max_df=0.1: ignore tokens that appear in more than 10% of the documents
# min_df=2: ignore tokens that appear in fewer than 2 documents (i.e. in a single document)
vectorizer = CountVectorizer(max_df=0.1, min_df=2)

In [27]:
# Learn the vocabulary from the corpus and encode the documents in a count matrix
# (one row per document, one column per kept token)
corpus_vectorized = vectorizer.fit_transform(corpus)

In [28]:
# Vocabulary kept by the vectorizer, and how many tokens it contains
feature_names = vectorizer.get_feature_names_out()
print('feature name: ', feature_names)
print('feature size: ', len(feature_names))

# Shape of the sparse count matrix: documents (rows) x unique tokens (columns)
print('matrix dimension: ', corpus_vectorized.shape)

feature name:  ['002n7' '00b0d' '00jhg' ... 'zztgg' 'zzvdf' 'zzw3j']
feature size:  10566
matrix dimension:  (13828, 10566)


In [29]:
# Document (row) x token (column) count table, kept sparse.
# corpus_vectorized.toarray() materialises every zero of the
# n_documents x n_tokens count matrix as a dense array just to show a preview;
# a sparse-backed DataFrame displays the same table without that memory cost.
corpus_df = pd.DataFrame.sparse.from_spmatrix(
    corpus_vectorized,
    columns=vectorizer.get_feature_names_out(),
)
corpus_df

Unnamed: 0,002n7,00b0d,00jhg,00ud9,00xck,01abs,01fnu,01jsj,01k0e,01nrz,...,zzhb3,zzht0,zzlz3,zznp1,zzns7,zzpvk,zzr1c,zztgg,zzvdf,zzw3j
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13823,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13824,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13825,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13826,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Build a LDA model for topic modeling

In [30]:
# LDA topic model with 10 topics, trained with the online learning method
lda_model = LatentDirichletAllocation(
    n_components=10,
    learning_method='online',
    learning_decay=0.9,
    batch_size=128,
    evaluate_every=-1,
    random_state=100,
    n_jobs=-1,
)

# Fit on the count matrix and get the per-document topic weights
lda_output = lda_model.fit_transform(corpus_vectorized)

In [31]:
# Shape of the document-topic matrix, followed by the fitted model's repr
print(lda_output.shape)
print(lda_model)

(13828, 10)
LatentDirichletAllocation(learning_decay=0.9, learning_method='online',
                          n_jobs=-1, random_state=100)


### Separate the dataframe back to train and test

In [32]:
# Split the stacked document-topic matrix back into its train and test rows.
# The corpus was built as training descriptions followed by test descriptions,
# so the first len(train_base) rows belong to the training set. Deriving the
# boundary from the frame (instead of the literals 6914 / 13828) keeps the
# split correct if the input files change size.
n_train = len(train_base)
topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]

# Reuse the base frames' indexes so a later column-wise concat lines the rows up
train_document_topic = pd.DataFrame(
    np.round(lda_output[:n_train], 8), columns=topicnames, index=train_base.index
)
test_document_topic = pd.DataFrame(
    np.round(lda_output[n_train:], 8), columns=topicnames, index=test_base.index
)

In [33]:
# Preview the topic weights of the training documents
train_document_topic

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9
0,0.005557,0.005556,0.005561,0.005556,0.005558,0.005556,0.005556,0.005558,0.949986,0.005556
1,0.016667,0.016667,0.016667,0.179592,0.016678,0.016667,0.016667,0.016669,0.687062,0.016667
2,0.700000,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333
3,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.962499,0.004167
4,0.050000,0.050000,0.050000,0.550000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000
...,...,...,...,...,...,...,...,...,...,...
6909,0.115124,0.001961,0.001961,0.001961,0.001961,0.094700,0.001961,0.776450,0.001961,0.001961
6910,0.033333,0.699982,0.033333,0.033333,0.033351,0.033333,0.033333,0.033333,0.033333,0.033333
6911,0.003125,0.003125,0.003125,0.003125,0.003125,0.003125,0.003125,0.003125,0.971874,0.003125
6912,0.010000,0.510000,0.010001,0.164334,0.010001,0.010000,0.010000,0.010001,0.255661,0.010002


In [34]:
# Preview the topic weights of the test documents
test_document_topic

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9
0,0.009091,0.009091,0.009091,0.009091,0.009091,0.009091,0.009091,0.009091,0.009091,0.918180
1,0.050000,0.050000,0.050000,0.550000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000
2,0.128232,0.007693,0.186306,0.007693,0.007693,0.007695,0.007697,0.631605,0.007694,0.007692
3,0.003226,0.003227,0.796178,0.003226,0.003226,0.003226,0.003226,0.064850,0.046840,0.072775
4,0.012501,0.012500,0.012500,0.012500,0.887497,0.012500,0.012500,0.012501,0.012500,0.012500
...,...,...,...,...,...,...,...,...,...,...
6909,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,0.989024,0.001220,0.001220,0.001220
6910,0.002565,0.642044,0.182279,0.157726,0.002564,0.002564,0.002564,0.002564,0.002564,0.002564
6911,0.025000,0.025000,0.025000,0.025000,0.775000,0.025000,0.025000,0.025000,0.025000,0.025000
6912,0.004762,0.004762,0.004762,0.004762,0.957142,0.004762,0.004762,0.004762,0.004762,0.004762


In [35]:
# TODO: image feature extraction

# Regression

In [44]:
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

### Preprocessing - concatenate text features to base dataframe

In [37]:
# Append the topic weights to the base training features, then drop the
# identifier and raw-text columns
train_concatenated = (
    pd.concat([train_base, train_document_topic], axis=1)
    .drop(columns=['id', 'description'])
)
train_concatenated.shape

(6914, 599)

In [38]:
# Append the topic weights to the base test features, then drop the
# identifier and raw-text columns
test_concatenated = (
    pd.concat([test_base, test_document_topic], axis=1)
    .drop(columns=['id', 'description'])
)
test_concatenated.shape

(6914, 598)

### Preprocessing - train-test-split to both train and test dataset

In [45]:
# Target: the 'total' column. Features: every other column.
y = train_concatenated['total'].copy()
X = train_concatenated.drop(columns=['total']).copy()

In [None]:
#y = train_base['total'].copy()
#X = train_base.drop(['id', 'description', 'total'], axis=1).copy()

In [40]:
# Hold out 25% of the labelled rows for evaluation; the seed fixes the split
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    test_size=0.25,
    random_state=42,
)

# Make a new section for your models

### Linear Regressor

In [41]:
# Fit an ordinary least squares baseline on the training split
OLS = LinearRegression().fit(x_train, y_train)

In [42]:
# Predict the held-out split with the OLS baseline
OLS_predictions = OLS.predict(x_test)

In [43]:
# Mean absolute error of the OLS baseline on the held-out split
mean_absolute_error(y_test, OLS_predictions)

17.901943932140455

### SGD Regressor

In [46]:
# always standardize input
# The scaler is fitted on the training split only and then applied to the held-out split.
# NOTE(review): x_train / x_test are overwritten with the scaled arrays, so any cell that
# reads them afterwards (including a re-run of the OLS cells) sees scaled data.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [47]:
# Linear model trained with SGD, using the epsilon-insensitive loss with
# epsilon = 0 and no regularisation (alpha = 0).
# random_state is fixed (same seed as the tuning estimator further down) so
# the reported error is reproducible; without it SGD shuffles the data
# differently on every run and the score changes between executions.
SGD = SGDRegressor(loss = 'epsilon_insensitive', alpha = 0, epsilon = 0, random_state = 1).fit(x_train, y_train)

In [48]:
# Predict the (standardized) held-out split with the SGD model
SGD_predictions = SGD.predict(x_test)

In [49]:
# Mean absolute error of the SGD model on the held-out split
mean_absolute_error(y_test, SGD_predictions)

11.534553563892858

### Tuning SGD Regressor

In [70]:
# Hyper-parameter grid for SGDRegressor.
# SGDRegressor rejects alpha = 0 when learning_rate = 'optimal' (alpha is used
# to compute that schedule), so a single cartesian grid contains candidates
# that can only fail. The grid is therefore split in two: alpha = 0 is searched
# with every schedule except 'optimal', and 'optimal' only with alpha > 0.
# GridSearchCV accepts a list of grids and searches their union.
common_grid = {
    'max_iter': [1000, 2000],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'loss': ['huber', 'epsilon_insensitive', 'squared_epsilon_insensitive', 'squared_error'],
    'eta0': [0.01, 1, 10],
}
param_grid = [
    {**common_grid,
     'alpha': [0.1, 0.01, 0],
     'learning_rate': ['constant', 'invscaling', 'adaptive']},
    {**common_grid,
     'alpha': [0.1, 0.01],
     'learning_rate': ['optimal']},
]

In [71]:
# Grid search over the SGDRegressor hyper-parameters with 3-fold cross-validation.
# scoring: candidates are ranked by (negated) mean absolute error, the metric
#          this notebook reports; the default for regressors would be R^2.
# n_jobs:  -1 runs the fits on all available cores; the grid holds several
#          hundred candidates, which is slow when fitted one at a time.
sgd_ = SGDRegressor(random_state = 1)
g_search = GridSearchCV(estimator = sgd_,
                        param_grid = param_grid,
                        scoring = 'neg_mean_absolute_error',
                        cv = 3,
                        n_jobs = -1,
                        verbose = 0,
                        return_train_score = True)

In [72]:
# Run the search on the standardized training split, then report the
# best-scoring parameter combination
g_search.fit(x_train, y_train)
print(g_search.best_params_)



KeyboardInterrupt: 

In [None]:
### New model