In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load the dataset.
# Replace '/content/Amazon Product Review Database.zip' with your actual file path.
data = pd.read_csv('/content/Amazon Product Review Database.zip')

# Display the first few rows to understand the data
print(data.head())

# Check for missing values (count of nulls per column)
print(data.isnull().sum())

# Drop columns that are not needed for analysis
columns_to_drop = [
    'id', 'asins', 'dateUpdated', 'dimension', 'ean', 'keys',
    'manufacturerNumber', 'reviews.sourceURLs', 'upc', 'weight',
]
data = data.drop(columns=columns_to_drop)

# Handling missing values:
# drop rows that lack a value in any of the critical review columns
data = data.dropna(subset=['reviews.date', 'reviews.rating', 'reviews.text'])

# Convert 'reviews.date' to datetime format
data['reviews.date'] = pd.to_datetime(data['reviews.date'])

# Handling duplicates
data = data.drop_duplicates()

# Text cleaning for 'reviews.text' column
data['reviews.text'] = data['reviews.text'].astype(str)  # Convert to string
# Replace every non-alphabetic character with a space.
# regex=True must be explicit: since pandas 2.0 the default is regex=False,
# which would treat '[^a-zA-Z]' as a literal string and leave the text
# unchanged (older versions emit a FutureWarning without it).
data['reviews.text'] = data['reviews.text'].str.replace(r'[^a-zA-Z]', ' ', regex=True)

# Encoding categorical variables
# One-hot encoding for 'categories' column
data = pd.get_dummies(data, columns=['categories'])

# Scale numerical features (if needed)
# Example:
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# data[['reviews.rating', 'reviews.numHelpful']] = scaler.fit_transform(data[['reviews.rating', 'reviews.numHelpful']])

# Prepare for modeling
# Define X (features) and y (target variable)
X = data.drop(['reviews.rating'], axis=1)  # Assuming 'reviews.rating' is the target variable
y = data['reviews.rating']

# Split the dataset into training and testing sets
# (80% train / 20% test; fixed random_state for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further preprocessing steps and model fitting would be done here.
# NOTE: X still contains non-numeric columns at this point (e.g. the cleaned
# 'reviews.text' strings and the 'reviews.date' datetimes); they must be
# vectorized/encoded or dropped before calling model.fit.

# For instance, fitting a model using RandomForestClassifier
model = RandomForestClassifier()




                     id       asins   brand                  categories  \
0  AVpe7AsMilAPnD_xQ78G  B00QJDU3KY  Amazon  Amazon Devices,mazon.co.uk   
1  AVpe7AsMilAPnD_xQ78G  B00QJDU3KY  Amazon  Amazon Devices,mazon.co.uk   
2  AVpe7AsMilAPnD_xQ78G  B00QJDU3KY  Amazon  Amazon Devices,mazon.co.uk   
3  AVpe7AsMilAPnD_xQ78G  B00QJDU3KY  Amazon  Amazon Devices,mazon.co.uk   
4  AVpe7AsMilAPnD_xQ78G  B00QJDU3KY  Amazon  Amazon Devices,mazon.co.uk   

  colors             dateAdded           dateUpdated  \
0    NaN  2016-03-08T20:21:53Z  2017-07-18T23:52:58Z   
1    NaN  2016-03-08T20:21:53Z  2017-07-18T23:52:58Z   
2    NaN  2016-03-08T20:21:53Z  2017-07-18T23:52:58Z   
3    NaN  2016-03-08T20:21:53Z  2017-07-18T23:52:58Z   
4    NaN  2016-03-08T20:21:53Z  2017-07-18T23:52:58Z   

                  dimension  ean                         keys  ...  \
0  169 mm x 117 mm x 9.1 mm  NaN  kindlepaperwhite/b00qjdu3ky  ...   
1  169 mm x 117 mm x 9.1 mm  NaN  kindlepaperwhite/b00qjdu3ky  ...   
2 

  data['reviews.text'] = data['reviews.text'].str.replace('[^a-zA-Z]', ' ')  # Remove non-alphabetic characters
