In [92]:
# Import Key Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [94]:
# Read book_ratings.csv
book_ratings = pd.read_csv("book_ratings.csv")

In [96]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jason\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jason\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [98]:
# Lowercasing
book_ratings['review/summary'] = book_ratings['review/summary'].astype(str).str.lower()
book_ratings['review/text'] = book_ratings['review/text'].astype(str).str.lower()

In [100]:
# Tokenization
book_ratings['review/summary'] = book_ratings['review/summary'].apply(word_tokenize)
book_ratings['review/text'] = book_ratings['review/text'].apply(word_tokenize)

In [101]:
# Removing Punctuation
book_ratings['review/summary'] = book_ratings['review/summary'].apply(lambda x: [word for word in x if word.isalnum()])
book_ratings['review/text'] = book_ratings['review/text'].apply(lambda x: [word for word in x if word.isalnum()])

In [104]:
# Removing Stopwords
stop_words = set(stopwords.words('english'))
book_ratings['review/summary'] = book_ratings['review/summary'].apply(lambda x: [word for word in x if word not in stop_words])
book_ratings['review/text'] = book_ratings['review/text'].apply(lambda x: [word for word in x if word not in stop_words])

In [106]:
# Stemming
ps = PorterStemmer()
book_ratings['review/summary'] = book_ratings['review/summary'].apply(lambda x: [ps.stem(word) for word in x])
book_ratings['review/text'] = book_ratings['review/text'].apply(lambda x: [ps.stem(word) for word in x])

In [107]:
# Removing Special Characters and Numbers
book_ratings['review/summary'] = book_ratings['review/summary'].apply(lambda x: [re.sub(r'[^a-zA-Z]', '', str(word)) for word in x])
book_ratings['review/text'] = book_ratings['review/text'].apply(lambda x: [re.sub(r'[^a-zA-Z]', '', str(word)) for word in x])

In [108]:
custom_stop_words = {"is", "in", "would", "may", "must", "one", "upon", "might", "shall", "could"}
stop_words = set(stopwords.words('english'))
stop_words = stop_words.union(custom_stop_words)

book_ratings['review/summary'] = book_ratings['review/summary'].apply(lambda x: [word for word in x if word not in stop_words])
book_ratings['review/text'] = book_ratings['review/text'].apply(lambda x: [word for word in x if word not in stop_words])


In [112]:
from gensim import corpora, models
import pandas as pd
from nltk.tokenize import word_tokenize

# Read the CSV file
book_ratings = pd.read_csv("book_ratings.csv")

# Create corpus for "review/summary" and "review/text"
corpus_text = [doc.split() for doc in book_ratings["review/text"]]

In [114]:
# Create dictionary for "review/text"
text_dictionary = corpora.Dictionary(corpus_text)

# Filter out terms that appear in fewer than 2 documents or more than 75% of the documents
text_dictionary.filter_extremes(no_below=2, no_above=0.75)

# Convert "review/text" corpus into document-term matrix
text_corpus = [text_dictionary.doc2bow(tokens) for tokens in corpus_text]

In [116]:
import gensim
from gensim import corpora, models

# Train TF-IDF model
tfidf = models.TfidfModel(text_corpus)
text_corpus_tfidf = tfidf[text_corpus]

# Apply SVD to extract 5 components
n_SVD = 5
SVD_model = models.LsiModel(text_corpus_tfidf, id2word=text_dictionary, num_topics=n_SVD)
SVD = SVD_model[text_corpus_tfidf]

import pandas as pd

# Load the book_ratings.csv file
book_ratings = pd.read_csv("book_ratings.csv")

# Extract the non-text columns
non_text_columns = book_ratings[['User_id', 'review/helpfulness']]


In [118]:
# convert results into array
svd_array = gensim.matutils.corpus2csc(SVD).T.toarray()

# convert results to data frame
svd_df = pd.DataFrame(svd_array)

# show SVD results - reduced vector representation of the documents
svd_df

Unnamed: 0,0,1,2,3,4
0,-0.044458,0.017166,0.006512,0.022264,-0.001194
1,-0.227860,0.026546,-0.086918,0.008854,-0.010232
2,-0.172844,-0.017331,-0.020711,0.064680,-0.010724
3,-0.135117,0.054652,-0.000707,-0.017435,-0.054039
4,-0.219155,-0.005209,-0.078767,0.008632,0.007763
...,...,...,...,...,...
995,-0.163187,-0.086378,-0.092925,0.057052,-0.025490
996,-0.097399,-0.125608,0.007111,-0.013534,-0.027389
997,-0.143504,0.028947,0.056614,0.041186,-0.074167
998,-0.084141,0.004337,-0.006905,0.017827,-0.040530


In [120]:
import pandas as pd
import numpy as np

# Load the book_ratings.csv file
book_ratings = pd.read_csv("book_ratings.csv")

# Select the non-text columns
non_text_columns = book_ratings[['Price', 'review/helpfulness']]

# Convert non-text columns to arrays
non_text_array = non_text_columns.values

# Combine non-text columns with SVD components
combined_array = np.concatenate((non_text_array, svd_array), axis=1)

# Convert the combined array to a DataFrame
combined_df = pd.DataFrame(combined_array)

# Display the combined DataFrame
print(combined_df)

         0         1         2         3         4         5         6
0    18.96  0.142857 -0.044458  0.017166  0.006512  0.022264 -0.001194
1    14.04  0.950000 -0.227860  0.026546 -0.086918  0.008854 -0.010232
2    19.57       NaN -0.172844 -0.017331 -0.020711  0.064680 -0.010724
3    43.94  1.000000 -0.135117  0.054652 -0.000707 -0.017435 -0.054039
4    15.27  0.666667 -0.219155 -0.005209 -0.078767  0.008632  0.007763
..     ...       ...       ...       ...       ...       ...       ...
995  17.89  1.000000 -0.163187 -0.086378 -0.092925  0.057052 -0.025490
996  14.95  1.000000 -0.097399 -0.125608  0.007111 -0.013534 -0.027389
997  12.95       NaN -0.143504  0.028947  0.056614  0.041186 -0.074167
998  10.79       NaN -0.084141  0.004337 -0.006905  0.017827 -0.040530
999  10.95  1.000000 -0.142592  0.056084  0.071676 -0.001183 -0.097222

[1000 rows x 7 columns]


In [122]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Extract the "review/score" column
y = book_ratings["review/score"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_df, y, test_size=0.2, random_state=42)

# Train the decision tree model
Model_1 = DecisionTreeRegressor()
Model_1.fit(X_train, y_train)

# Make predictions
y_pred = Model_1.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 1.1417805334337736


In [128]:
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Discretize the "review/score" into categories
y_train_class = pd.cut(y_train, bins=[-np.inf, 2.5, 5.0, np.inf], labels=['Low', 'Medium', 'High'])

# Train the decision tree model for classification
Model_1_class = DecisionTreeClassifier()
Model_1_class.fit(X_train, y_train_class)

# Make predictions
y_pred_class = Model_1_class.predict(X_test)
y_test_class = pd.cut(y_test, bins=[-np.inf, 2.5, 5.0, np.inf], labels=['Low', 'Medium', 'High'])

# Evaluate the model using confusion matrix
conf_matrix = confusion_matrix(y_test_class, y_pred_class)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[  0  17]
 [  0 183]]


In [132]:
import gensim
from gensim import corpora, models

# Train TF-IDF model
tfidf = models.TfidfModel(text_corpus)
text_corpus_tfidf = tfidf[text_corpus]

# Apply SVD to extract 8 components
n_SVD = 8
SVD_model = models.LsiModel(text_corpus_tfidf, id2word=text_dictionary, num_topics=n_SVD)
SVD = SVD_model[text_corpus_tfidf]

import pandas as pd

# Load the book_ratings.csv file
book_ratings = pd.read_csv("book_ratings.csv")

# Extract the non-text columns
non_text_columns = book_ratings[['User_id', 'review/helpfulness']]



In [134]:
# convert results into array
svd_array = gensim.matutils.corpus2csc(SVD).T.toarray()

# convert results to data frame
svd_df = pd.DataFrame(svd_array)

# show SVD results - reduced vector representation of the documents
svd_df

Unnamed: 0,0,1,2,3,4,5,6,7
0,-0.044431,-0.021650,0.002172,-0.020095,-0.002945,0.034956,0.009291,-0.023217
1,-0.227810,-0.024358,-0.086751,-0.015460,-0.025954,0.017204,-0.019817,0.020377
2,-0.172796,0.017253,-0.017151,-0.063272,-0.016479,-0.043693,0.054208,-0.030815
3,-0.135180,-0.061913,0.000099,0.013374,-0.051935,0.059716,-0.036561,0.012752
4,-0.219112,0.002204,-0.071628,0.004215,0.006191,0.022200,-0.029799,0.010689
...,...,...,...,...,...,...,...,...
995,-0.163194,0.083621,-0.095028,-0.059430,-0.023682,-0.030185,0.053305,-0.003794
996,-0.097285,0.125400,0.003377,0.007289,-0.027124,0.067648,0.040329,-0.047133
997,-0.143568,-0.024855,0.056297,-0.047158,-0.088821,-0.051233,0.011546,-0.006587
998,-0.084143,-0.005409,0.004295,-0.022515,-0.053508,0.002776,-0.003666,-0.005082


In [136]:
import pandas as pd
import numpy as np

# Load the book_ratings.csv file
book_ratings = pd.read_csv("book_ratings.csv")

# Select the non-text columns
non_text_columns = book_ratings[['Price', 'review/helpfulness']]

# Convert non-text columns to arrays
non_text_array = non_text_columns.values

# Combine non-text columns with SVD components
combined_array = np.concatenate((non_text_array, svd_array), axis=1)

# Convert the combined array to a DataFrame
combined_df = pd.DataFrame(combined_array)

# Display the combined DataFrame
print(combined_df)

         0         1         2         3         4         5         6  \
0    18.96  0.142857 -0.044431 -0.021650  0.002172 -0.020095 -0.002945   
1    14.04  0.950000 -0.227810 -0.024358 -0.086751 -0.015460 -0.025954   
2    19.57       NaN -0.172796  0.017253 -0.017151 -0.063272 -0.016479   
3    43.94  1.000000 -0.135180 -0.061913  0.000099  0.013374 -0.051935   
4    15.27  0.666667 -0.219112  0.002204 -0.071628  0.004215  0.006191   
..     ...       ...       ...       ...       ...       ...       ...   
995  17.89  1.000000 -0.163194  0.083621 -0.095028 -0.059430 -0.023682   
996  14.95  1.000000 -0.097285  0.125400  0.003377  0.007289 -0.027124   
997  12.95       NaN -0.143568 -0.024855  0.056297 -0.047158 -0.088821   
998  10.79       NaN -0.084143 -0.005409  0.004295 -0.022515 -0.053508   
999  10.95  1.000000 -0.142545 -0.060386  0.068624 -0.009436 -0.102657   

            7         8         9  
0    0.034956  0.009291 -0.023217  
1    0.017204 -0.019817  0.020377  
2  

In [142]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Extract the "review/score" column
y = book_ratings["review/score"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_df, y, test_size=0.2, random_state=42)

# Train the decision tree model
Model_2 = DecisionTreeRegressor()
Model_2.fit(X_train, y_train)

# Make predictions
y_pred = Model_1.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 1.0134984876999307


In [140]:
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Discretize the "review/score" into categories
y_train_class = pd.cut(y_train, bins=[-np.inf, 2.5, 5.0, np.inf], labels=['Low', 'Medium', 'High'])

# Train the decision tree model for classification
Model_1_class = DecisionTreeClassifier()
Model_1_class.fit(X_train, y_train_class)

# Make predictions
y_pred_class = Model_1_class.predict(X_test)
y_test_class = pd.cut(y_test, bins=[-np.inf, 2.5, 5.0, np.inf], labels=['Low', 'Medium', 'High'])

# Evaluate the model using confusion matrix
conf_matrix = confusion_matrix(y_test_class, y_pred_class)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[  1  16]
 [ 10 173]]
