In [None]:
import imageio
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Read data
# XYtr and Xte are the training and test tables; pred is the frame that the
# final cell copies and fills with predictions.
XYtr, Xte, pred = (
    pd.read_csv(csv_name) for csv_name in ('XYtr.csv', 'Xte.csv', 'pred.csv')
)

In [None]:
# Convert date to numeric, extract numeric columns
def _days_since_epoch(dates):
    """Return `dates` as float days elapsed since 1970-01-01.

    The values are parsed with ``pd.to_datetime`` and the offset from the
    epoch is divided by a one-day ``timedelta64``.  Unlike dividing the raw
    integer representation by a nanoseconds-per-day constant, this does not
    depend on the resolution (ns/us/ms/s) of the parsed ``datetime64`` array.
    Missing dates (NaT) come out as NaN.
    """
    stamps = pd.to_datetime(dates).values
    return (stamps - np.datetime64('1970-01-01')) / np.timedelta64(1, 'D')


cdate = XYtr['cdate']
# Work on copies so the raw frames keep their original 'cdate' strings.
XYtr1 = XYtr.copy()
XYtr1['cdate'] = _days_since_epoch(cdate)
# Same conversion for the test frame.
Xte1 = Xte.copy()
Xte1['cdate'] = _days_since_epoch(Xte1['cdate'])
# Untouched copy of the prediction frame, used later to write the output.
pred0 = pred.copy()

In [None]:
# Processing version for XYtr
# Missing versions become their own "None" category before one-hot encoding.
XYtr['version'] = XYtr['version'].fillna("None")
versions_XYtr = pd.get_dummies(XYtr['version'])
# Processing version for Xte
Xte['version'] = Xte['version'].fillna("None")
# Encode the test versions against the *training* columns: categories seen
# only in the test set are dropped and categories missing from the test set
# are added as all-zero columns, so both dummy frames have the same columns
# in the same order when they are later concatenated and scaled.
versions_Xte = pd.get_dummies(Xte['version']).reindex(
    columns=versions_XYtr.columns, fill_value=0
)

In [None]:
# Make corpus and vocab

# Number of LDA topics for the description text (K) and the symbol text (K2).
K = 500
K2 = 20
# Missing text is replaced by the placeholder token "NAN" so CountVectorizer
# receives a string for every row.
XYtr['symbol'] = XYtr['symbol'].fillna("NAN")
XYtr['description'] = XYtr['description'].fillna("NAN")
Xte['description'] = Xte['description'].fillna("NAN")
Xte['symbol'] = Xte['symbol'].fillna("NAN")
# Training rows first, then test rows: later cells rely on this order to
# split the topic matrix back into train and test parts.
corpus = list(XYtr['description'])+list(Xte['description'])
corpus2 =list(XYtr['symbol'])+list(Xte['symbol'])
# transform a given text into a vector on the basis of the frequency (count) of each word that occurs in the entire text.

vectorizer = CountVectorizer()

# The same vectorizer object is refit for each corpus; after this cell it
# holds the vocabulary of the symbol corpus only.
corpus = vectorizer.fit_transform(corpus)
corpus2 = vectorizer.fit_transform(corpus2)
# LDA fitting is stochastic; a fixed seed makes the topic features (and
# everything derived from them) reproducible across kernel restarts.
# 440 is the seed already used in this notebook's commented-out SGD call.
lda = LatentDirichletAllocation(n_components = K, random_state = 440)
lda2 = LatentDirichletAllocation(n_components = K2, random_state = 440)
lda.fit(corpus)
lda2.fit(corpus2)
# Show the size of both document-term matrices (a bare expression is only
# displayed when it is the last line of the cell).
corpus.shape, corpus2.shape

In [None]:
# Per-document topic weights for the description corpus (topics) and the
# symbol corpus (topics2); rows follow the corpus order (train, then test).
topics, topics2 = lda.transform(corpus), lda2.transform(corpus2)
topics2

In [None]:
# Number of training rows; also the offset of the first test row in the
# combined topic matrices.
N = len(XYtr)

In [None]:
# Text Processing
def _write_topic_features(path, ids, topic_rows, n_topics):
    """Write one CSV of topic weights.

    The file has a header ``id,FT0000,...`` followed by one line per row:
    the id and the first `n_topics` weights of that row, each formatted
    with ``%f``.

    path       -- output file name
    ids        -- iterable of row ids, in the same order as `topic_rows`
    topic_rows -- 2-D array-like; row j holds the topic weights of ids[j]
    n_topics   -- number of topic columns to write
    """
    # The context manager closes the file even if a write fails.
    with open(path, 'w') as fp:
        fp.write('id')
        for k in range(n_topics):
            fp.write(',FT%04d' % k)
        fp.write('\n')
        for row_id, weights in zip(ids, topic_rows):
            fp.write('%s' % row_id)
            for k in range(n_topics):
                fp.write(',%f' % weights[k])
            fp.write('\n')


# The topic matrices were built from the training rows followed by the test
# rows, so the first n_train rows are train and the next n_test rows are
# test.  The test files use the test row count, not the training row count.
n_train = len(XYtr)
n_test = len(Xte)
assert topics.shape[0] == n_train + n_test
assert topics2.shape[0] == n_train + n_test

_write_topic_features('XYtr_ft.csv', XYtr['id'], topics[:n_train], K)
_write_topic_features('Xte_ft.csv', Xte['id'], topics[n_train:n_train + n_test], K)

# Text Processing2
_write_topic_features('XYtr_ft2.csv', XYtr['id'], topics2[:n_train], K2)
_write_topic_features('Xte_ft2.csv', Xte['id'], topics2[n_train:n_train + n_test], K2)

In [None]:
# Image Processing
def _image_features(path):
    """Return the seven image features (fi1..fi7) for the image at `path`.

    fi1/fi2 are the number of rows/columns, fi3 the mean over all pixels and
    channels, fi4 the minimum pixel value and fi5..fi7 the means of channels
    0, 1 and 2.  If the image cannot be read or does not have three indexable
    dimensions, every feature is NaN (best effort: one bad image must not
    abort the run).
    """
    try:
        pic = imageio.imread(path)
        # number of rows
        fi1 = pic.shape[0]
        # number of columns
        fi2 = pic.shape[1]
        # find the mean value of rgb
        fi3 = np.mean(pic[:,:,:])
        # find the min pixel
        fi4 = pic.min()
        fi5 = np.mean(pic[:,:,0])
        fi6 = np.mean(pic[:,:,1])
        fi7 = np.mean(pic[:,:,2])
        return (fi1, fi2, fi3, fi4, fi5, fi6, fi7)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / kernel interrupt still works.
        return (np.nan,) * 7


def _write_image_features(out_path, frame):
    """Write ``id,fi1..fi7`` for every row of `frame` to `out_path`.

    The image file of a row is ``images/images/<id><ext>``, built from the
    row's 'id' and 'ext' columns.
    """
    # The context manager closes the file even if a row fails.
    with open(out_path, 'w') as fp:
        fp.write('id,fi1,fi2,fi3,fi4,fi5,fi6,fi7\n')
        for image_id, ext in zip(frame['id'], frame['ext']):
            feats = _image_features('images/images/' + image_id + ext)
            fp.write('%s,%f,%f,%f,%f,%f,%f,%f\n' % ((image_id,) + feats))


_write_image_features('XYtr_fi.csv', XYtr)
_write_image_features('Xte_fi.csv', Xte)

In [None]:
# Add both Image and Text and Version for XYtr
# Reload the image and topic feature files and strip their id column so only
# feature columns are concatenated.
XYtr_fi = pd.read_csv('XYtr_fi.csv')
XYtr_fi1 = XYtr_fi.drop(columns=["id"])

XYtr_ft = pd.read_csv('XYtr_ft.csv')
XYtr_ft1 = XYtr_ft.drop(columns=["id"])

XYtr_ft2 = pd.read_csv('XYtr_ft2.csv').drop(columns=["id"])

# Column-wise join: numeric base columns, image features, both topic sets
# and the version dummies.
New_XYtr = pd.concat(
    [
        XYtr1[['X.sales', 'cdate', 'fee1', 'fee2']],
        XYtr_fi1,
        XYtr_ft1,
        XYtr_ft2,
        versions_XYtr,
    ],
    axis="columns",
)
New_XYtr

In [None]:
# Add both Image and Text and Version for Xte
# Reload the image and topic feature files and strip their id column so only
# feature columns are concatenated.
Xte_fi = pd.read_csv('Xte_fi.csv')
Xte_fi1 = Xte_fi.drop(columns=["id"])

Xte_ft = pd.read_csv('Xte_ft.csv')
Xte_ft1 = Xte_ft.drop(columns=["id"])

Xte_ft2 = pd.read_csv('Xte_ft2.csv').drop(columns=["id"])

# Column-wise join in the same order as the training matrix.
New_Xte = pd.concat(
    [
        Xte1[['X.sales', 'cdate', 'fee1', 'fee2']],
        Xte_fi1,
        Xte_ft1,
        Xte_ft2,
        versions_Xte,
    ],
    axis="columns",
)
New_Xte

In [None]:

# Training features, training target ('total' column) and test features,
# copied so later cells do not modify the assembled frames.
Xtr_final = New_XYtr.copy()
Ytr_final = XYtr1['total'].copy()
Xte_final = New_Xte.copy()

In [None]:
# Display the training feature matrix.
Xtr_final

In [None]:
# Display the training target.
Ytr_final

In [None]:
# Display the test feature matrix.
Xte_final

In [None]:
# Column dtypes of the training features before the cast to float64.
Xtr_final.dtypes

In [None]:
# Cast every training feature column to float64, then show the dtypes again.
Xtr_final = Xtr_final.astype(np.float64)
Xtr_final.dtypes

In [None]:
# Replace every missing training feature value with 0.
Xtr_final = Xtr_final.fillna(0)

In [None]:
# Column dtypes of the test features before the cast to float64.
Xte_final.dtypes

In [None]:
# Cast every test feature column to float64, then show the dtypes again.
Xte_final = Xte_final.astype(np.float64)
Xte_final.dtypes

In [None]:
# Replace every missing test feature value with 0, as done for the training features.
Xte_final = Xte_final.fillna(0)

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training features only, then apply
# that same transformation to both the training and the test features.
scaler = StandardScaler().fit(Xtr_final)
Xtr_scale = scaler.transform(Xtr_final)
Xte_scale = scaler.transform(Xte_final)

In [None]:
from sklearn.linear_model import SGDRegressor

# Linear model trained by SGD with the epsilon-insensitive loss; epsilon = 0
# and alpha = 0 (no regularisation term) are passed explicitly.
# SGD shuffles the training data, so random_state is fixed to make the fitted
# coefficients and the written predictions reproducible between runs.
model = SGDRegressor(
    loss = 'epsilon_insensitive', alpha = 0, epsilon = 0, random_state = 440
).fit(Xtr_scale, Ytr_final)

In [None]:
# For add both Image and Text
# Fill the 'total' column of a copy of the prediction frame with the model
# output for the scaled test features and save it without the index.
pred_IT = pred0.assign(total=model.predict(Xte_scale))
pred_IT.to_csv('pred_ITVS.csv', index = False)