In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
import data_load

In [2]:
import gc
gc.collect()

0

In [3]:
max_sequence_length = 1024

In [4]:
from gensim.models import KeyedVectors  # used to load the GloVe word embeddings
# Load the pre-trained GloVe word embeddings.
# Replace the path below with the actual location of the GloVe file.
# The file name indicates the glove.6B release with 100-dimensional vectors.
# The previous note on this line read "Common Crawl (42B tokens, 1.9M vocab,
# uncased, 300d vectors, 1.75 GB download; Vocabulary Size: 400000,
# Embedding Dimension: 100d)", which mixes two different releases; only the
# "400000 / 100d" part is consistent with this file name.
# NOTE(review): load_word2vec_format expects a word2vec-style header line
# ("<count> <dim>"); raw GloVe .txt files do not have one. Presumably this file
# was converted beforehand (or needs no_header=True on gensim >= 4) - confirm.
glove_file = 'glove.6B/glove.6B.100d.txt'
glove_model = KeyedVectors.load_word2vec_format(glove_file, binary=False)
print('Loaded GloVe!')

Loaded GloVe!


In [5]:
def get_embedding_vector_for_word(word):
    """Look ``word`` up in the notebook-level ``glove_model``.

    Returns the stored vector when the word is present, otherwise ``None``.
    """
    return glove_model[word] if word in glove_model else None

def review_preprocessing(X_train, X_validation, X_test, max_length=None):
    """Turn raw review text into fixed-length sequences of vocabulary indices.

    The vocabulary is built from the ``reviewText`` column of all three splits
    (train, validation and test), in order of first appearance. Index 0 is
    reserved for padding. As a side effect, an embedding matrix aligned with
    that vocabulary is written to ``embedding_matrix.npy`` in the current
    working directory; the matrix itself is not returned.

    Parameters
    ----------
    X_train, X_validation, X_test : pandas.DataFrame
        Frames containing a ``reviewText`` column. They are not modified.
    max_length : int, optional
        Length every sequence is padded to (with trailing zeros) or truncated
        to. Defaults to the notebook-level ``max_sequence_length``.

    Returns
    -------
    tuple of (list of list of int, list of list of int)
        Padded index sequences for train followed by validation, and for test.
    """
    if max_length is None:
        # Fall back to the notebook-wide setting so existing calls keep working.
        max_length = max_sequence_length

    def split_review(review):
        # Whitespace tokenisation. A missing review (None/NaN) yields no tokens
        # instead of raising AttributeError on ``.split``.
        if isinstance(review, str):
            return review.split()
        if pd.isna(review):
            return []
        return str(review).split()

    train_reviews = X_train['reviewText'].tolist() + X_validation['reviewText'].tolist()
    test_reviews = X_test['reviewText'].tolist()

    # Build the vocabulary manually; indices start at 1 because 0 is the pad value.
    word_index = {}
    for reviews in (train_reviews, test_reviews):
        for review in reviews:
            for word in split_review(review):
                if word not in word_index:
                    word_index[word] = len(word_index) + 1
    vocab_size = len(word_index) + 1
    print('Vocabulary size:', vocab_size)

    # Size the matrix from the loaded vectors when they expose their
    # dimensionality (gensim KeyedVectors.vector_size); otherwise keep 100.
    embedding_dim = getattr(glove_model, 'vector_size', 100)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Rows of words without a pre-trained vector stay all-zero.
    for word, i in word_index.items():
        embedding_vector = get_embedding_vector_for_word(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    print('Embedding matrix shape:', embedding_matrix.shape)
    # Save embedding matrix to file
    np.save('embedding_matrix.npy', embedding_matrix)

    def encode_and_pad(reviews):
        # Every token is in the vocabulary (it was built from these same
        # reviews), so truncating the tokens first is equivalent to truncating
        # the encoded sequence.
        padded = []
        for review in reviews:
            sequence = [word_index[word] for word in split_review(review)[:max_length]]
            padded.append(sequence + [0] * (max_length - len(sequence)))
        return padded

    x_train_combined_padded = encode_and_pad(train_reviews)
    x_test_padded = encode_and_pad(test_reviews)
    # Guard the sanity print so an empty training split does not raise IndexError.
    print('sample review:', len(x_train_combined_padded[0]) if x_train_combined_padded else 0)
    print('Text preprocessing done!')
    return x_train_combined_padded, x_test_padded


In [6]:
def features_preprocessing(X_train, X_validation, X_test):
    """Standardise the numeric feature columns of the three splits.

    The ``reviewText`` column is excluded, train and validation are stacked
    row-wise, and a ``StandardScaler`` fitted on that stack is applied to the
    test split as well.

    The input frames are NOT modified: ``reviewText`` is dropped on copies, so
    the cell can be re-run and can be executed before or after
    ``review_preprocessing`` without raising ``KeyError``. The caller's frames
    therefore still contain ``reviewText`` afterwards.

    Parameters
    ----------
    X_train, X_validation, X_test : pandas.DataFrame
        Frames holding ``reviewText`` plus the feature columns to scale.

    Returns
    -------
    tuple
        ``(scaled train+validation features, scaled test features)`` as
        returned by ``fit_transform`` / ``transform`` of the scaler.

    Raises
    ------
    KeyError
        If any of the frames has no ``reviewText`` column.
    """
    text_column = 'reviewText'
    train_features = X_train.drop(columns=[text_column])
    validation_features = X_validation.drop(columns=[text_column])
    test_features = X_test.drop(columns=[text_column])

    x_train_features_combined = pd.concat([train_features, validation_features], axis=0)

    # Numeric feature transformer: rescales every feature to zero mean and unit
    # standard deviation. Statistics come from train+validation only.
    scaler = StandardScaler()
    x_train_features_combined_features = scaler.fit_transform(x_train_features_combined)
    x_test_features = scaler.transform(test_features)
    print('Features preprocessing done!')
    return x_train_features_combined_features, x_test_features

In [7]:
def data_processing(x_train_combined_padded, x_test_padded, x_train_features_combined_features,
                    x_test_features, y_train, y_validation, y_test):
    """Assemble the final model inputs.

    Padded text sequences and scaled features are joined column-wise for both
    the train(+validation) rows and the test rows; the train and validation
    targets are joined row-wise. ``y_test`` is handed back untouched.

    Returns ``(x_train_combined, y_train_combined, x_test, y_test)``.
    """
    train_parts = (x_train_combined_padded, x_train_features_combined_features)
    test_parts = (x_test_padded, x_test_features)

    train_matrix = np.concatenate(train_parts, axis=1)
    train_targets = np.concatenate((y_train, y_validation), axis=0)
    test_matrix = np.concatenate(test_parts, axis=1)

    print('x_train_combined.shape:', train_matrix.shape)

    return train_matrix, train_targets, test_matrix, y_test

## Model

In [8]:
import time
from sklearn.svm import NuSVR
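# NuSVR: Nu Support Vector Regression
#   - Described here as suitable for large datasets.
#     NOTE(review): "large" is relative - the text-only cell-phone run recorded
#     further down needed about 630 s to fit 28,533 rows.
#   - High-dimensional data: NuSVR can handle data in high-dimensional feature
#     spaces effectively, so it suits a large number of features.
#   - Controlling the number of support vectors: the parameter nu controls the
#     number of support vectors in the model, which reduces memory and compute
#     requirements on large datasets.
#   - Modelling non-linear relationships: NuSVR supports kernel functions for
#     non-linear relationships, which is useful for complex data patterns.
#   - Scalability: NuSVR can handle large datasets because training uses
#     optimisations such as the shrinking heuristic and selecting a subset of
#     support vectors.
#     NOTE(review): model_build below passes shrinking=False, so the shrinking
#     heuristic is not active in this notebook.
#   - Generalisation: with appropriate parameter settings, NuSVR usually
#     generalises well to unseen data.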
#
#

In [9]:

# def find_best_parameters(X_train_combined, y_train_combined):
#     nu_svm_model = NuSVR()
#     # GridSearchCV to find the best hyperparameters
#     param_grid = {
#         'nu': [0.3,0.5],  # range of nu
#         'gamma': ['scale', 'auto'],  # range of gamma
#         'cache_size': [80,100,200] , # cache size in MB: 原数据大小：min 165, max 1500,report里写80-1000。这里这样是因为取了1/10
#         'shrinking': [True, False], # whether to use the shrinking heuristic
#         'max_iter': [-1], # the maximum number of iterations to be run,100，1000，2000时是无法收敛，ConvergenceWarning: Solver terminated early (max_iter=100,1000).
#         'kernel':['rbf','poly']
#         # epsilon, nuSVR does not support the parameter epsilon
#     }
#     start=time.time()
#     # build GridSearchCV instance
#     grid_search = GridSearchCV(nu_svm_model, param_grid, cv=2, scoring='neg_mean_squared_error', verbose=3, n_jobs=-1)
#
#     # do grid search on training set
#     grid_search.fit(X_train_combined, y_train_combined)
#
#     end = time.time() - start
#     # get the best hyperparameters
#     best_params = grid_search.best_params_
#     print('Best parameters for nu-svy_predr: ', best_params)
#
#     #  best: {'cache_size': 80, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'nu': 0.5,'shrinking': False}
#     best_svr = grid_search.best_estimator_
#     # best_estimator_: 一步到位找到最佳的模型
#
#     with open('results/evaluation_result_SVR.txt', 'a') as f:
#         f.write('training time: '+str(end) + '\n')
#         f.write('Best parameters for nu-svy_predr: '+ str(best_params) + '\n')
#
#
#     return best_svr


In [10]:
def model_build(x_train_combined, y_train_combined, log_path='results/evaluation_result_SVR.txt'):
    """Fit a NuSVR model and append the training time to a log file.

    The hyper-parameters are fixed to the combination recorded as "best" in the
    commented-out grid-search cell of this notebook.

    Parameters
    ----------
    x_train_combined : array-like
        Training inputs, passed straight to ``NuSVR.fit``.
    y_train_combined : array-like
        Training targets, passed straight to ``NuSVR.fit``.
    log_path : str, optional
        File the line ``training time: <seconds>`` is appended to. Defaults to
        the results file used throughout the notebook.

    Returns
    -------
    The fitted ``NuSVR`` estimator.
    """
    import os

    # Create the log directory BEFORE fitting. Fitting can take many minutes,
    # and a missing directory would otherwise only surface afterwards, throwing
    # away the trained model.
    log_dir = os.path.dirname(log_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    start = time.time()
    best_svm_model = NuSVR(cache_size=80, gamma='scale', kernel='rbf', max_iter=-1, nu=0.5, shrinking=False)
    # Train the model on the training set.
    best_svm_model.fit(x_train_combined, y_train_combined)
    end = time.time() - start
    print('training time:', end)
    with open(log_path, 'a') as f:
        f.write('training time: ' + str(end) + '\n')
    return best_svm_model


In [11]:
# X is the feature matrix, y is the target variable (helpfulness score)
def predict(X_test, y_ture, x_test_original, best_svm_model, saved_file):
    """Predict on ``X_test`` and dump the predictions next to the true values.

    ``x_test_original`` is modified in place: it gains a ``y_pred`` and a
    ``y_true`` column, and is then written to ``saved_file`` as CSV without the
    index. The number of predictions and the first ten values are printed.

    Returns the predictions produced by ``best_svm_model.predict``.
    """
    predictions = best_svm_model.predict(X_test)
    print(len(predictions))
    print('y_pred:', predictions[:10])

    new_columns = (('y_pred', predictions), ('y_true', y_ture))
    for column_name, column_values in new_columns:
        x_test_original[column_name] = column_values

    x_test_original.to_csv(saved_file, index=False)
    return predictions

In [12]:
import evaluation as ev
def evaluation_svr(y_pred, y_ture, path):
    """Score predictions with RMSE, PCC and NDCG and append them to ``path``.

    ``y_pred`` is flattened to one dimension first. Each metric comes from the
    ``ev`` module, is called as ``metric(y_ture, y_pred)`` and rounded to three
    decimals. The scores are printed and then appended to ``path`` as
    ``RMSE: ..``, ``PCC: ..``, ``NDCG: ..`` followed by two blank lines.
    Nothing is returned.
    """
    # Flatten the predictions to shape (n,).
    y_pred = np.reshape(y_pred, (y_pred.shape[0],))
    print('y_pred.shape:', y_pred.shape)

    # (attribute on ``ev`` / label printed, label written to the file)
    metric_names = (('rmse', 'RMSE'), ('pcc', 'PCC'), ('ndcg', 'NDCG'))

    scores = []
    for attr, file_label in metric_names:
        value = round(getattr(ev, attr)(y_ture, y_pred), 3)
        scores.append((attr, file_label, value))

    for attr, _, value in scores:
        print(attr + ':', value)

    # Append the scores to the text file.
    with open(path, 'a') as f:
        for _, file_label, value in scores:
            f.write(file_label + ': ' + str(value) + '\n')
        f.write('\n\n')

## Main

# 1. video_games

## 1.1 all features

In [13]:
x_train_video_games, x_validation_video_games, x_test_video_games, y_train_video_games, y_validation_video_games, y_test_video_games = data_load.data_load(tag='all',table_name='features_video_games')
with open('results/evaluation_result_SVR.txt', 'a') as f:
    f.write('===========video_games===========' + '\n\n')
    f.write('====All Features====' + '\n')

  df = pd.read_sql(query, conn)


load features_video_games from MySQL: (55120, 11)!
features selection finished!
No null values found in the dataset.
shape of X_train: (39686, 10)
shape of X_validation: (4410, 10)
shape of X_test: (11024, 10)
shape of y_train: (39686,)
shape of y_validation: (4410,)
shape of y_test: (11024,)
Index(['reviewText', 'overall', 'review_length', 'num_sentences',
       'avg_sentence_length', 'subjectivity', 'readability', 'timeline',
       'topic', 'review_volume'],
      dtype='object')


In [14]:
x_train_combined_padded, x_test_padded=review_preprocessing(x_train_video_games, x_validation_video_games, x_test_video_games)

Vocabulary size: 374733
Embedding matrix shape: (374733, 100)
sample review: 1024
Text preprocessing done!


In [15]:
x_train_features_combined_features,x_test_features=features_preprocessing(x_train_video_games, x_validation_video_games, x_test_video_games)

Features preprocessing done!


In [16]:
x_train_combined,y_train_combined,x_test,y_test=data_processing(x_train_combined_padded, x_test_padded,x_train_features_combined_features,x_test_features,y_train_video_games, y_validation_video_games,y_test_video_games)

x_train_combined.shape: (44096, 1033)


In [17]:
# best_svm_model=find_best_parameters_randomized(x_train_combined, y_train_combined)

In [None]:
best_svm_model=model_build(x_train_combined,y_train_combined)

In [None]:
y_pred_video_games=predict(x_test,y_test,x_test_video_games,best_svm_model,'results/svr_video_games_all.csv')

In [None]:
evaluation_svr(y_pred_video_games,y_test_video_games,'results/evaluation_result_SVR.txt')

In [None]:
del x_train_video_games
del x_test
del x_test_features
del x_train_features_combined_features
del x_train_combined

## Gini: only gini features

In [None]:
x_train_video_games, x_validation_video_games, x_test_video_games, y_train_video_games, y_validation_video_games, y_test_video_games = data_load.data_load(tag='Gini',table_name='features_video_games')
with open('results/evaluation_result_SVR.txt', 'a') as f:
    f.write('====Gini Features====' + '\n')

In [None]:
x_train_combined_padded, x_test_padded=review_preprocessing(x_train_video_games, x_validation_video_games, x_test_video_games)

In [None]:
x_train_features_combined_features,x_test_features=features_preprocessing(x_train_video_games, x_validation_video_games, x_test_video_games)

In [None]:
x_train_combined,y_train_combined,x_test,y_test=data_processing(x_train_combined_padded, x_test_padded,x_train_features_combined_features,x_test_features,y_train_video_games, y_validation_video_games,y_test_video_games)

In [None]:
best_svm_model=model_build(x_train_combined,y_train_combined)

In [None]:
y_pred_video_games=predict(x_test,y_test,x_test_video_games,best_svm_model,'results/svr_video_games_Gini.csv')

In [None]:
evaluation_svr(y_pred_video_games,y_test_video_games,'results/evaluation_result_SVR.txt')

## PCC: only PCC features

In [None]:
x_train_video_games, x_validation_video_games, x_test_video_games, y_train_video_games, y_validation_video_games, y_test_video_games = data_load.data_load(tag='PCC',table_name='features_video_games')
with open('results/evaluation_result_SVR.txt', 'a') as f:
    f.write('====PCC Features====' + '\n')

In [None]:
x_train_combined_padded, x_test_padded=review_preprocessing(x_train_video_games, x_validation_video_games, x_test_video_games)

In [None]:
x_train_features_combined_features,x_test_features=features_preprocessing(x_train_video_games, x_validation_video_games, x_test_video_games)

In [None]:
x_train_combined,y_train_combined,x_test,y_test=data_processing(x_train_combined_padded, x_test_padded,x_train_features_combined_features,x_test_features,y_train_video_games, y_validation_video_games,y_test_video_games)

In [None]:
best_svm_model=model_build(x_train_combined,y_train_combined)

In [None]:
y_pred_video_games=predict(x_test,y_test,x_test_video_games,best_svm_model,'results/svr_video_games_PCC.csv')

In [None]:
evaluation_svr(y_pred_video_games,y_test_video_games,'results/evaluation_result_SVR.txt')

## SPC: only SPC features

In [None]:
x_train_video_games, x_validation_video_games, x_test_video_games, y_train_video_games, y_validation_video_games, y_test_video_games = data_load.data_load(tag='SPC',table_name='features_video_games')
with open('results/evaluation_result_SVR.txt', 'a') as f:
    f.write('====SPC Features====' + '\n')

In [None]:
x_train_combined_padded, x_test_padded=review_preprocessing(x_train_video_games, x_validation_video_games, x_test_video_games)

In [None]:
x_train_features_combined_features,x_test_features=features_preprocessing(x_train_video_games, x_validation_video_games, x_test_video_games)

In [None]:
x_train_combined,y_train_combined,x_test,y_test=data_processing(x_train_combined_padded, x_test_padded,x_train_features_combined_features,x_test_features,y_train_video_games, y_validation_video_games,y_test_video_games)

In [None]:
best_svm_model=model_build(x_train_combined,y_train_combined)

In [None]:
y_pred_video_games=predict(x_test,y_test,x_test_video_games,best_svm_model,'results/svr_video_games_SPC.csv')

In [None]:
evaluation_svr(y_pred_video_games,y_test_video_games,'results/evaluation_result_SVR.txt')

## None features: Only reviewText

In [14]:
x_train_video_games, x_validation_video_games, x_test_video_games, y_train_video_games, y_validation_video_games, y_test_video_games = data_load.data_load(tag='none',table_name='features_video_games')
with open('results/evaluation_result_SVR.txt', 'a') as f:
    f.write('====None Features====' + '\n')

  df = pd.read_sql(query, conn)


load features_video_games from MySQL: (18374, 11)!
features selection finished!
No null values found in the dataset.
shape of X_train: (13229, 1)
shape of X_validation: (1470, 1)
shape of X_test: (3675, 1)
shape of y_train: (13229,)
shape of y_validation: (1470,)
shape of y_test: (3675,)
Index(['reviewText'], dtype='object')


In [15]:
x_train_combined_padded, x_test_padded=review_preprocessing(x_train_video_games, x_validation_video_games, x_test_video_games)

Vocabulary size: 173529
Embedding matrix shape: (173529, 100)
sample review: 1024
Text preprocessing done!


In [19]:
# Text-only run: the loaded frames contain just reviewText, so there are no
# numeric features to append. Stack the train and validation targets.
y_train_combined = np.concatenate((y_train_video_games, y_validation_video_games), axis=0)
# Convert the padded word-index sequences to DataFrames; they are the model inputs.
x_train_combined=pd.DataFrame(x_train_combined_padded)
x_test=pd.DataFrame(x_test_padded)
# Alias so the following predict cell can use the generic name y_test.
y_test=y_test_video_games
print(x_test.shape)

(3675, 1024)


In [17]:
best_svm_model=model_build(x_train_combined,y_train_combined)

training time: 133.63129591941833


In [20]:
y_pred_video_games=predict(x_test,y_test,x_test_video_games,best_svm_model,'results/svr_video_games_none.csv')

3675
y_pred: [0.6225422  0.6527349  0.59775787 0.65590853 0.67687925 0.48571723
 0.73145672 0.37565835 0.46239102 0.58702385]


In [21]:
evaluation_svr(y_pred_video_games,y_test_video_games,'results/evaluation_result_SVR.txt')

y_pred.shape: (3675,)
rmse: 0.334
pcc: 0.238
ndcg: 0.936


In [22]:
del x_train_video_games
del x_validation_video_games
del x_test_video_games
del y_train_video_games
del y_validation_video_games
del y_test_video_games
del x_train_combined_padded
del x_test_padded
del y_train_combined
del x_test
del y_test
del best_svm_model
del y_pred_video_games

# 2. Books

## 2.1 all features

In [None]:
x_train_books, x_validation_books, x_test_books, y_train_books, y_validation_books, y_test_books = data_load.data_load(tag='all',table_name='features_books')
with open('results/evaluation_result_SVR.txt', 'a') as f:
    f.write('===========books===========' + '\n\n')
    f.write('====All Features====' + '\n')

In [None]:
x_train_combined_padded, x_test_padded=review_preprocessing(x_train_books, x_validation_books, x_test_books)

In [None]:
x_train_features_combined_features,x_test_features=features_preprocessing(x_train_books, x_validation_books, x_test_books)

In [None]:
x_train_combined,y_train_combined,x_test,y_test=data_processing(x_train_combined_padded, x_test_padded,x_train_features_combined_features,x_test_features,y_train_books, y_validation_books,y_test_books)

In [None]:
best_svm_model=model_build(x_train_combined,y_train_combined)

In [None]:
y_pred_books=predict(x_test,y_test,x_test_books,best_svm_model,'results/svr_books_all.csv')

In [None]:
evaluation_svr(y_pred_books,y_test_books,'results/evaluation_result_SVR.txt')

In [None]:
del x_train_books
del x_validation_books
del x_test
del x_test_features
del x_train_combined_padded
del x_train_combined
del y_pred_books
del y_test_books
del best_svm_model
del x_test_padded
del x_train_features_combined_features

## Gini: only gini features

In [None]:
x_train_books, x_validation_books, x_test_books, y_train_books, y_validation_books, y_test_books = data_load.data_load(tag='Gini',table_name='features_books')
with open('results/evaluation_result_SVR.txt', 'a') as f:
    f.write('====Gini Features====' + '\n')

In [None]:
x_train_combined_padded, x_test_padded=review_preprocessing(x_train_books, x_validation_books, x_test_books)

In [None]:
x_train_features_combined_features,x_test_features=features_preprocessing(x_train_books, x_validation_books, x_test_books)

In [None]:
x_train_combined,y_train_combined,x_test,y_test=data_processing(x_train_combined_padded, x_test_padded,x_train_features_combined_features,x_test_features,y_train_books, y_validation_books,y_test_books)

In [None]:
best_svm_model=model_build(x_train_combined,y_train_combined)

In [None]:
y_pred_books=predict(x_test,y_test,x_test_books,best_svm_model,'results/svr_books_Gini.csv')

In [None]:
evaluation_svr(y_pred_books,y_test_books,'results/evaluation_result_SVR.txt')

In [None]:
del x_train_books
del x_validation_books
del x_test
del x_test_features
del x_train_combined_padded
del x_train_combined
del y_pred_books
del y_test_books
del best_svm_model
del x_test_padded
del x_train_features_combined_features

## PCC: only PCC features

In [None]:
x_train_books, x_validation_books, x_test_books, y_train_books, y_validation_books, y_test_books = data_load.data_load(tag='PCC',table_name='features_books')
with open('results/evaluation_result_SVR.txt', 'a') as f:
    f.write('====PCC Features====' + '\n')

In [None]:
x_train_combined_padded, x_test_padded=review_preprocessing(x_train_books, x_validation_books, x_test_books)

In [None]:
x_train_features_combined_features,x_test_features=features_preprocessing(x_train_books, x_validation_books, x_test_books)

In [None]:
x_train_combined,y_train_combined,x_test,y_test=data_processing(x_train_combined_padded, x_test_padded,x_train_features_combined_features,x_test_features,y_train_books, y_validation_books,y_test_books)

In [None]:
best_svm_model=model_build(x_train_combined,y_train_combined)

In [None]:
y_pred_books=predict(x_test,y_test,x_test_books,best_svm_model,'results/svr_books_PCC.csv')

In [None]:
evaluation_svr(y_pred_books,y_test_books,'results/evaluation_result_SVR.txt')

In [None]:
del x_train_books
del x_validation_books
del x_test
del x_test_features
del x_train_combined_padded
del x_train_combined
del y_pred_books
del y_test_books
del best_svm_model
del x_test_padded
del x_train_features_combined_features

## SPC: only SPC features

In [None]:
x_train_books, x_validation_books, x_test_books, y_train_books, y_validation_books, y_test_books = data_load.data_load(tag='SPC',table_name='features_books')
with open('results/evaluation_result_SVR.txt', 'a') as f:
    f.write('====SPC Features====' + '\n')


In [None]:
x_train_combined_padded, x_test_padded=review_preprocessing(x_train_books, x_validation_books, x_test_books)

In [None]:
x_train_features_combined_features,x_test_features=features_preprocessing(x_train_books, x_validation_books, x_test_books)

In [None]:
x_train_combined,y_train_combined,x_test,y_test=data_processing(x_train_combined_padded, x_test_padded,x_train_features_combined_features,x_test_features,y_train_books, y_validation_books,y_test_books)

In [None]:
best_svm_model=model_build(x_train_combined,y_train_combined)

In [None]:
y_pred_books=predict(x_test,y_test,x_test_books,best_svm_model,'results/svr_books_SPC.csv')

In [None]:
evaluation_svr(y_pred_books,y_test_books,'results/evaluation_result_SVR.txt')

In [None]:
del x_train_books
del x_validation_books
del x_test
del x_test_features
del x_train_combined_padded
del x_train_combined
del y_pred_books
del y_test_books
del best_svm_model
del x_test_padded
del x_train_features_combined_features

## None features: Only reviewText

In [21]:
x_train_books, x_validation_books, x_test_books, y_train_books, y_validation_books, y_test_books = data_load.data_load(tag='none',table_name='features_books')

with open('results/evaluation_result_SVR.txt', 'a') as f:
    f.write('====None Features====' + '\n')

  df = pd.read_sql(query, conn)


load features_books from MySQL: (16240, 11)!
features selection finished!
No null values found in the dataset.
shape of X_train: (11692, 1)
shape of X_validation: (1300, 1)
shape of X_test: (3248, 1)
shape of y_train: (11692,)
shape of y_validation: (1300,)
shape of y_test: (3248,)
Index(['reviewText'], dtype='object')


In [22]:
x_train_combined_padded, x_test_padded=review_preprocessing(x_train_books, x_validation_books, x_test_books)

Vocabulary size: 207954
Embedding matrix shape: (207954, 100)
sample review: 1024
Text preprocessing done!


In [23]:
y_train_combined = np.concatenate((y_train_books, y_validation_books), axis=0)
x_train_combined=pd.DataFrame(x_train_combined_padded)
x_test=pd.DataFrame(x_test_padded)
print(x_train_combined.shape)

(12992, 1024)


In [24]:
best_svm_model=model_build(x_train_combined,y_train_combined)

training time: 87.33634400367737


In [26]:
y_pred_books=predict(x_test,y_test_books,x_test_books,best_svm_model,'results/svr_books_none.csv')

3248
y_pred: [0.55353394 0.74053495 0.69906644 0.6993805  0.68932229 0.64537149
 0.69818853 0.7385824  0.5886254  0.72143682]


In [27]:
evaluation_svr(y_pred_books,y_test_books,'results/evaluation_result_SVR.txt')

y_pred.shape: (3248,)
rmse: 0.292
pcc: 0.198
ndcg: 0.96


In [28]:
gc.collect()

544

# 3. cell phone

In [None]:
x_train_cell_phone, x_validation_cell_phone, x_test_cell_phone, y_train_cell_phone, y_validation_cell_phone, y_test_cell_phone = data_load.data_load(tag='all',table_name='features_cell_phones')
with open('results/evaluation_result_SVR.txt', 'a') as f:
    f.write('===========cell_phone===========' + '\n\n')
    f.write('====All Features====' + '\n')

In [None]:
x_train_combined_padded, x_test_padded=review_preprocessing(x_train_cell_phone, x_validation_cell_phone, x_test_cell_phone)

In [None]:
x_train_features_combined_features,x_test_features=features_preprocessing(x_train_cell_phone, x_validation_cell_phone, x_test_cell_phone)

In [None]:
x_train_combined,y_train_combined,x_test,y_test=data_processing(x_train_combined_padded, x_test_padded,x_train_features_combined_features,x_test_features,y_train_cell_phone, y_validation_cell_phone,y_test_cell_phone)

In [None]:
best_svm_model=model_build(x_train_combined,y_train_combined)

In [None]:
y_pred_cell_phone=predict(x_test,y_test,x_test_cell_phone,best_svm_model,'results/svr_cell_phone_all.csv')

In [None]:
evaluation_svr(y_pred_cell_phone,y_test_cell_phone,'results/evaluation_result_SVR.txt')

In [None]:
del x_train_cell_phone
del x_validation_cell_phone
del x_test
del x_test_features
del x_train_combined_padded
del x_train_combined
del y_pred_cell_phone
del y_test_cell_phone
del best_svm_model

## Gini: only gini features

In [None]:
x_train_cell_phone, x_validation_cell_phone, x_test_cell_phone, y_train_cell_phone, y_validation_cell_phone, y_test_cell_phone = data_load.data_load(tag='Gini',table_name='features_cell_phones')

In [None]:
with open('results/evaluation_result_SVR.txt', 'a') as f:
    f.write('====Gini Features====' + '\n')

In [None]:
x_train_combined_padded, x_test_padded=review_preprocessing(x_train_cell_phone, x_validation_cell_phone, x_test_cell_phone)

In [None]:
x_train_features_combined_features,x_test_features=features_preprocessing(x_train_cell_phone, x_validation_cell_phone, x_test_cell_phone)

In [None]:
x_train_combined,y_train_combined,x_test,y_test=data_processing(x_train_combined_padded, x_test_padded,x_train_features_combined_features,x_test_features,y_train_cell_phone, y_validation_cell_phone,y_test_cell_phone)

In [None]:
best_svm_model=model_build(x_train_combined,y_train_combined)

In [None]:
y_pred_cell_phone=predict(x_test,y_test,x_test_cell_phone,best_svm_model,'results/svr_cell_phone_Gini.csv')

In [None]:
evaluation_svr(y_pred_cell_phone,y_test_cell_phone,'results/evaluation_result_SVR.txt')

In [None]:
del x_train_cell_phone
del x_validation_cell_phone
del x_test
del x_test_features
del x_train_combined_padded
del x_train_combined
del y_pred_cell_phone
del y_test_cell_phone
del best_svm_model

## PCC: only PCC features

In [None]:
x_train_cell_phone, x_validation_cell_phone, x_test_cell_phone, y_train_cell_phone, y_validation_cell_phone, y_test_cell_phone = data_load.data_load(tag='PCC',table_name='features_cell_phones')
with open('results/evaluation_result_SVR.txt', 'a') as f:
    f.write('====PCC Features====' + '\n')

In [None]:
x_train_combined_padded, x_test_padded=review_preprocessing(x_train_cell_phone, x_validation_cell_phone, x_test_cell_phone)

In [None]:
x_train_features_combined_features,x_test_features=features_preprocessing(x_train_cell_phone, x_validation_cell_phone, x_test_cell_phone)

In [None]:
x_train_combined,y_train_combined,x_test,y_test=data_processing(x_train_combined_padded, x_test_padded,x_train_features_combined_features,x_test_features,y_train_cell_phone, y_validation_cell_phone,y_test_cell_phone)

In [None]:
best_svm_model=model_build(x_train_combined,y_train_combined)

In [None]:
y_pred_cell_phone=predict(x_test,y_test,x_test_cell_phone,best_svm_model,'results/svr_cell_phone_PCC.csv')

In [None]:
evaluation_svr(y_pred_cell_phone,y_test_cell_phone,'results/evaluation_result_SVR.txt')

In [None]:
del x_train_cell_phone
del x_validation_cell_phone
del x_test
del x_test_features
del x_train_combined_padded
del x_train_combined
del y_pred_cell_phone
del y_test_cell_phone
del best_svm_model

## SPC: only SPC features

In [None]:
x_train_cell_phone, x_validation_cell_phone, x_test_cell_phone, y_train_cell_phone, y_validation_cell_phone, y_test_cell_phone = data_load.data_load(tag='SPC',table_name='features_cell_phones')

In [None]:
with open('results/evaluation_result_SVR.txt', 'a') as f:
    f.write('====SPC Features====' + '\n')

In [None]:
x_train_combined_padded, x_test_padded=review_preprocessing(x_train_cell_phone, x_validation_cell_phone, x_test_cell_phone)

In [None]:
x_train_features_combined_features,x_test_features=features_preprocessing(x_train_cell_phone, x_validation_cell_phone, x_test_cell_phone)

In [None]:
x_train_combined,y_train_combined,x_test,y_test=data_processing(x_train_combined_padded, x_test_padded,x_train_features_combined_features,x_test_features,y_train_cell_phone, y_validation_cell_phone,y_test_cell_phone)

In [None]:
best_svm_model=model_build(x_train_combined,y_train_combined)

In [None]:
y_pred_cell_phone=predict(x_test,y_test,x_test_cell_phone,best_svm_model,'results/svr_cell_phone_SPC.csv')

In [None]:
evaluation_svr(y_pred_cell_phone,y_test_cell_phone,'results/evaluation_result_SVR.txt')

In [None]:
del x_train_cell_phone
del x_validation_cell_phone
del x_test
del x_test_features
del x_train_combined_padded
del x_train_combined
del y_pred_cell_phone
del y_test_cell_phone
del best_svm_model

## None features: Only reviewText

In [13]:
x_train_cell_phone, x_validation_cell_phone, x_test_cell_phone, y_train_cell_phone, y_validation_cell_phone, y_test_cell_phone = data_load.data_load(tag='none',table_name='features_cell_phones')

  df = pd.read_sql(query, conn)


load features_cell_phones from MySQL: (35667, 11)!
features selection finished!
No null values found in the dataset.
shape of X_train: (25679, 1)
shape of X_validation: (2854, 1)
shape of X_test: (7134, 1)
shape of y_train: (25679,)
shape of y_validation: (2854,)
shape of y_test: (7134,)
Index(['reviewText'], dtype='object')


In [14]:
with open('results/evaluation_result_SVR.txt', 'a') as f:
    # End the dataset tag with a newline; without it the tag and the
    # '====None Features====' header end up fused on one line of the log.
    f.write('features_cell_phones' + '\n')
    f.write('====None Features====' + '\n')

In [15]:
x_train_combined_padded, x_test_padded=review_preprocessing(x_train_cell_phone, x_validation_cell_phone, x_test_cell_phone)

Vocabulary size: 156827
Embedding matrix shape: (156827, 100)
sample review: 1024
Text preprocessing done!


In [16]:
y_train_combined = np.concatenate((y_train_cell_phone, y_validation_cell_phone), axis=0)
x_train_combined=pd.DataFrame(x_train_combined_padded)
x_test=pd.DataFrame(x_test_padded)
print(x_train_combined.shape)

(28533, 1024)


In [17]:
best_svm_model=model_build(x_train_combined,y_train_combined)

training time: 629.8269200325012


In [19]:
y_pred_cell_phone=predict(x_test,y_test_cell_phone,x_test_cell_phone,best_svm_model,'results/svr_cell_phone_none.csv')

7134
y_pred: [0.79166356 0.78737279 0.67120789 0.69179194 0.79470353 0.72442176
 0.79804711 0.69566366 0.69493791 0.76973687]


In [20]:
evaluation_svr(y_pred_cell_phone,y_test_cell_phone,'results/evaluation_result_SVR.txt')

y_pred.shape: (7134,)
rmse: 0.319
pcc: -0.01
ndcg: 0.957


In [21]:
gc.collect()

3346

# 4. electronics

In [None]:
X_train_electronics, X_validation_electronics, X_test_electronics, y_train_electronics, y_validation_electronics, y_test_electronics = data_load.data_load(tag='all', table_name='features_electronics')
with open('results/evaluation_result_SVR.txt', 'a') as f:
    f.write('===========electronics===========' + '\n\n')
    f.write('====All Features====' + '\n')

In [None]:
x_train_combined_padded, x_test_padded=review_preprocessing(X_train_electronics, X_validation_electronics, X_test_electronics)

In [None]:
x_train_features_combined_features,x_test_features=features_preprocessing(X_train_electronics, X_validation_electronics, X_test_electronics)

In [None]:
x_train_combined,y_train_combined,x_test,y_test=data_processing(x_train_combined_padded, x_test_padded,x_train_features_combined_features,x_test_features,y_train_electronics, y_validation_electronics,y_test_electronics)

In [None]:
best_svm_model=model_build(x_train_combined,y_train_combined)

In [None]:
y_pred_electronics=predict(x_test,y_test,X_test_electronics,best_svm_model,'results/svr_electronics_all.csv')

In [None]:
evaluation_svr(y_pred_electronics,y_test_electronics,'results/evaluation_result_SVR.txt')

In [None]:
del x_train_combined
del x_test
del x_train_features_combined_features
del x_test_features
del x_train_combined_padded
del x_test_padded
del y_pred_electronics
del y_test_electronics
del best_svm_model

## Gini: only gini features

In [None]:
X_train_electronics, X_validation_electronics, X_test_electronics, y_train_electronics, y_validation_electronics, y_test_electronics = data_load.data_load(tag='Gini', table_name='features_electronics')

In [None]:
with open('results/evaluation_result_SVR.txt', 'a') as f:
    f.write('====Gini Features====' + '\n')

In [None]:
x_train_combined_padded, x_test_padded=review_preprocessing(X_train_electronics, X_validation_electronics, X_test_electronics)

In [None]:
x_train_features_combined_features,x_test_features=features_preprocessing(X_train_electronics, X_validation_electronics, X_test_electronics)

In [None]:
x_train_combined,y_train_combined,x_test,y_test=data_processing(x_train_combined_padded, x_test_padded,x_train_features_combined_features,x_test_features,y_train_electronics, y_validation_electronics,y_test_electronics)

In [None]:
# Train the model on the combined training inputs and targets
# (model_build is defined earlier in this notebook).
best_svm_model = model_build(
    x_train_combined,
    y_train_combined,
)

In [None]:
# Predict on the held-out test split; the last argument is the CSV path
# handed to predict() for this feature set.
y_pred_electronics = predict(
    x_test,
    y_test,
    X_test_electronics,
    best_svm_model,
    'results/svr_electronics_Gini.csv',
)

In [None]:
# Score the predictions against the test targets; the path is the shared
# SVR results file that the section headers are appended to.
evaluation_svr(
    y_pred_electronics,
    y_test_electronics,
    'results/evaluation_result_SVR.txt',
)

In [None]:
# Drop this run's intermediates before the next feature-set run.
# Targets are deleted left to right, exactly as separate `del` statements would.
del (
    x_train_combined,
    x_test,
    x_train_features_combined_features,
    x_test_features,
    x_train_combined_padded,
    x_test_padded,
    y_pred_electronics,
    y_test_electronics,
    best_svm_model,
)

## PCC: only PCC features

In [None]:
# Reload the electronics splits for the 'PCC' feature-selection tag.
(
    X_train_electronics,
    X_validation_electronics,
    X_test_electronics,
    y_train_electronics,
    y_validation_electronics,
    y_test_electronics,
) = data_load.data_load(
    tag='PCC',
    table_name='features_electronics',
)

In [None]:
# Append this run's section header to the shared results file.
with open('results/evaluation_result_SVR.txt', 'a') as f:
    f.writelines(['====PCC Features====' + '\n'])

In [None]:
# Convert the reviewText of the three splits into model-ready sequences;
# review_preprocessing merges the train and validation reviews internally.
x_train_combined_padded, x_test_padded = review_preprocessing(
    X_train_electronics,
    X_validation_electronics,
    X_test_electronics,
)

In [None]:
# Build the train and test feature matrices from the same three splits
# (features_preprocessing is defined earlier in this notebook).
x_train_features_combined_features, x_test_features = features_preprocessing(
    X_train_electronics,
    X_validation_electronics,
    X_test_electronics,
)

In [None]:
# Assemble the final train/test inputs and targets from the text sequences,
# the feature matrices and the three target vectors.
(
    x_train_combined,
    y_train_combined,
    x_test,
    y_test,
) = data_processing(
    x_train_combined_padded,
    x_test_padded,
    x_train_features_combined_features,
    x_test_features,
    y_train_electronics,
    y_validation_electronics,
    y_test_electronics,
)

In [None]:
# Train the model on the combined training inputs and targets
# (model_build is defined earlier in this notebook).
best_svm_model = model_build(
    x_train_combined,
    y_train_combined,
)

In [None]:
# Predict on the held-out test split; the last argument is the CSV path
# handed to predict() for this feature set.
y_pred_electronics = predict(
    x_test,
    y_test,
    X_test_electronics,
    best_svm_model,
    'results/svr_electronics_PCC.csv',
)

In [None]:
# Score the predictions against the test targets; the path is the shared
# SVR results file that the section headers are appended to.
evaluation_svr(
    y_pred_electronics,
    y_test_electronics,
    'results/evaluation_result_SVR.txt',
)

In [None]:
# Drop this run's intermediates before the next feature-set run.
# Targets are deleted left to right, exactly as separate `del` statements would.
del (
    x_train_combined,
    x_test,
    x_train_features_combined_features,
    x_test_features,
    x_train_combined_padded,
    x_test_padded,
    y_pred_electronics,
    y_test_electronics,
    best_svm_model,
)

## SPC: only SPC features

In [None]:
# Reload the electronics splits for the 'SPC' feature-selection tag.
(
    X_train_electronics,
    X_validation_electronics,
    X_test_electronics,
    y_train_electronics,
    y_validation_electronics,
    y_test_electronics,
) = data_load.data_load(
    tag='SPC',
    table_name='features_electronics',
)

In [None]:
# Append this run's section header to the shared results file.
with open('results/evaluation_result_SVR.txt', 'a') as f:
    f.writelines(['====SPC Features====' + '\n'])

In [None]:
# Convert the reviewText of the three splits into model-ready sequences;
# review_preprocessing merges the train and validation reviews internally.
x_train_combined_padded, x_test_padded = review_preprocessing(
    X_train_electronics,
    X_validation_electronics,
    X_test_electronics,
)

In [None]:
# Build the train and test feature matrices from the same three splits
# (features_preprocessing is defined earlier in this notebook).
x_train_features_combined_features, x_test_features = features_preprocessing(
    X_train_electronics,
    X_validation_electronics,
    X_test_electronics,
)

In [None]:
# Assemble the final train/test inputs and targets from the text sequences,
# the feature matrices and the three target vectors.
(
    x_train_combined,
    y_train_combined,
    x_test,
    y_test,
) = data_processing(
    x_train_combined_padded,
    x_test_padded,
    x_train_features_combined_features,
    x_test_features,
    y_train_electronics,
    y_validation_electronics,
    y_test_electronics,
)

In [None]:
# Train the model on the combined training inputs and targets
# (model_build is defined earlier in this notebook).
best_svm_model = model_build(
    x_train_combined,
    y_train_combined,
)

In [None]:
# Predict on the held-out test split; the last argument is the CSV path
# handed to predict() for this feature set.
y_pred_electronics = predict(
    x_test,
    y_test,
    X_test_electronics,
    best_svm_model,
    'results/svr_electronics_SPC.csv',
)

In [None]:
# Score the predictions against the test targets; the path is the shared
# SVR results file that the section headers are appended to.
evaluation_svr(
    y_pred_electronics,
    y_test_electronics,
    'results/evaluation_result_SVR.txt',
)

In [None]:
# Drop this run's intermediates before the next feature-set run.
# Targets are deleted left to right, exactly as separate `del` statements would.
del (
    x_train_combined,
    x_test,
    x_train_features_combined_features,
    x_test_features,
    x_train_combined_padded,
    x_test_padded,
    y_pred_electronics,
    y_test_electronics,
    best_svm_model,
)

## None features: Only reviewText

In [13]:
# Reload the electronics splits with tag='none'; per the recorded output the
# X frames then carry only the reviewText column.
(
    X_train_electronics,
    X_validation_electronics,
    X_test_electronics,
    y_train_electronics,
    y_validation_electronics,
    y_test_electronics,
) = data_load.data_load(
    tag='none',
    table_name='features_electronics',
)

  df = pd.read_sql(query, conn)


load features_electronics from MySQL: (12916, 11)!
features selection finished!
No null values found in the dataset.
shape of X_train: (9298, 1)
shape of X_validation: (1034, 1)
shape of X_test: (2584, 1)
shape of y_train: (9298,)
shape of y_validation: (1034,)
shape of y_test: (2584,)
Index(['reviewText'], dtype='object')


In [14]:
# Append the table name and this run's section header to the shared results file.
with open('results/evaluation_result_SVR.txt', 'a') as f:
    f.writelines((
        'features_electronics\n',
        '====None Features====' + '\n',
    ))

In [15]:
# Convert the reviewText of the three splits into model-ready sequences;
# review_preprocessing merges the train and validation reviews internally.
x_train_combined_padded, x_test_padded = review_preprocessing(
    X_train_electronics,
    X_validation_electronics,
    X_test_electronics,
)

Vocabulary size: 108387
Embedding matrix shape: (108387, 100)
sample review: 1024
Text preprocessing done!


In [16]:
# Text-only run: there are no extra feature matrices, so the inputs are built
# here directly instead of going through data_processing.
# Targets: train followed by validation, matching the merged review order.
y_train_combined = np.concatenate(
    [y_train_electronics, y_validation_electronics],
    axis=0,
)
# Wrap the preprocessed text arrays in DataFrames for the model.
x_train_combined = pd.DataFrame(x_train_combined_padded)
x_test = pd.DataFrame(x_test_padded)
print(x_train_combined.shape)

(10332, 1024)


In [17]:
# Train the model on the text-only training inputs and targets
# (model_build is defined earlier in this notebook).
best_svm_model = model_build(
    x_train_combined,
    y_train_combined,
)

training time: 46.249162912368774


In [18]:
# Predict on the held-out test split. The raw test targets are passed here
# because this run does not go through data_processing; the last argument is
# the CSV path handed to predict() for this feature set.
y_pred_electronics = predict(
    x_test,
    y_test_electronics,
    X_test_electronics,
    best_svm_model,
    'results/svr_electronics_none.csv',
)

2584
y_pred: [0.71304371 0.70966094 0.74205532 0.65046189 0.76437643 0.7403033
 0.66033558 0.7406425  0.56740973 0.6030238 ]


In [19]:
# Score the predictions against the test targets; the path is the shared
# SVR results file that the section headers are appended to.
evaluation_svr(
    y_pred_electronics,
    y_test_electronics,
    'results/evaluation_result_SVR.txt',
)

y_pred.shape: (2584,)
rmse: 0.329
pcc: 0.124
ndcg: 0.953


In [20]:
# Force a full garbage-collection pass after the run; the value displayed by
# the cell is gc.collect()'s return value (the number of unreachable objects found).
gc.collect()

41