# RoboReviews Project
#### The new product review aggregator

## Phase 1: Setting up the environment

#### Loading and Exploring the Dataset

In [None]:
import pandas as pd

# Load the combined Amazon reviews CSV (relative path, resolved against the working directory)
dataset_path = r'combined_amazon_reviews.csv'
df = pd.read_csv(dataset_path)

# Preview the first rows to sanity-check that the file was parsed as expected
print(df.head())

# List the available column names so the relevant ones can be selected later
print(df.columns)

In [None]:
# Show the column index of the loaded DataFrame
print(df.columns)


In [None]:
# Display the first rows of the DataFrame as the cell output
df.head()

#### Removing unnecessary columns

In [None]:
# Narrow the frame to product identity, category, rating and review text
columns_to_keep = ['name', 'brand', 'categories', 'reviews.rating', 'reviews.text']
df = df[columns_to_keep]
df.head()

## Phase 2: Data Preprocessing

#### Split the data

In [9]:
from sklearn.model_selection import train_test_split

# NOTE(review): this split is taken from `df` before the dropna and text-preprocessing
# cells further down, so the resulting splits can still contain missing values and raw text.
# NOTE(review): `X` is rebound to the TF-IDF matrix in the feature-extraction cell.
X = df[['reviews.text']]  # feature column (review text), kept as a one-column DataFrame
y = df['reviews.rating']  # target column (rating)

# 70% train; the remaining 30% is then halved into validation and test (15% each).
# random_state=42 makes both shuffles repeatable.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

### Data Cleaning

Not removing duplicates since we want to see which products have the most reviews

In [None]:
# Count missing values per column before cleaning
print(df.isnull().sum())

# Drop rows that are missing the review text, the rating, or the categories.
# .copy() makes df_cleaned an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning.
df_cleaned = df.dropna(subset=['reviews.text', 'reviews.rating', 'categories']).copy()


# Confirm the cleaning worked: print the per-column missing counts, not the whole frame
print("Missing values after cleaning:")
print(df_cleaned.isnull().sum())


Text Preprocessing

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Fetch the NLTK resources used below (each call is a no-op when already installed).
nltk.download('punkt')      # tokenizer models used by word_tokenize on older NLTK releases
nltk.download('punkt_tab')  # tokenizer data required by word_tokenize on newer NLTK releases
nltk.download('stopwords')  # English stopword list
nltk.download('wordnet')    # lexicon used by WordNetLemmatizer

# Shared resources for preprocess_text. The stopword set and the lemmatizer are
# created lazily on the first call so they are built once, not once per token/review.
_STOP_WORDS = None
_LEMMATIZER = None
_PUNCTUATION = frozenset(string.punctuation)


def preprocess_text(text):
    """Normalise one review into a space-separated string of lowercase lemmas.

    Steps, in order: tokenize, lowercase, drop English stopwords and
    punctuation-only tokens, lemmatize, and join with single spaces.

    Args:
        text: The raw review text as a ``str``.

    Returns:
        The cleaned text as a single string; empty if every token was removed.
    """
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        # A set gives O(1) membership tests; the corpus is read only once.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()

    # Lowercase BEFORE filtering: the stopword list is lowercase, so "The" or
    # "This" would otherwise slip through, and the lemmatizer would be handed
    # capitalised tokens.
    tokens = [token.lower() for token in word_tokenize(text)]

    # Drop stopwords and any token made up solely of punctuation characters.
    # A per-character check also catches multi-character tokens such as "..."
    # and "''", which a substring test against string.punctuation misses.
    tokens = [
        token for token in tokens
        if token not in _STOP_WORDS
        and not all(char in _PUNCTUATION for char in token)
    ]

    # Lemmatize and join the tokens back into a string
    return ' '.join(_LEMMATIZER.lemmatize(token) for token in tokens)

# Run every review through preprocess_text and write the result back to the same column
processed_reviews = df_cleaned['reviews.text'].map(preprocess_text)
df_cleaned['reviews.text'] = processed_reviews


Feature extraction

In [12]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF vectorizer with default settings
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the preprocessed reviews and encode
# them as a TF-IDF matrix with one row per review (this rebinds `X`)
review_texts = df_cleaned['reviews.text']
X = vectorizer.fit_transform(review_texts)

### Sentiment Labeling

In [None]:
def _rating_to_sentiment(rating):
    """Map a rating to 'Positive' (>= 4), 'Neutral' (== 3) or 'Negative' (anything else)."""
    if rating >= 4:
        return 'Positive'
    if rating == 3:
        return 'Neutral'
    return 'Negative'


# Store the sentiment label derived from each review's rating in a new column
df_cleaned['sentiment'] = df_cleaned['reviews.rating'].apply(_rating_to_sentiment)

### Category Clustering
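K-Means is used for this step. It is a popular unsupervised learning algorithm that operates on numeric feature vectors, so it is fitted on the TF-IDF matrix built from the review text rather than on the raw categorical values.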

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Create a KMeans model with 4 clusters (intended groups: eBook readers, batteries,
# accessories, non-electronics).
# random_state pins the centroid initialisation so cluster ids are reproducible between
# runs (same seed as the train/test split). n_init is set explicitly because its default
# differs across scikit-learn releases.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)

# Fit the KMeans model to the TF-IDF matrix of the review text
kmeans.fit(X)

# labels_ holds one cluster id per row of X, which was built from df_cleaned in row order
df_cleaned['cluster'] = kmeans.labels_

## Phase 3: Build Models