In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming each download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<url-encoded download URL>" entries; the
# download loop below splits on ',' and ':' and URL-decodes the second part.
# NOTE(review): the URL carries X-Goog-Date=20240904T070744Z and
# X-Goog-Expires=259200, so it is presumably a time-limited signed link that has
# to be regenerated once it expires -- confirm before relying on this cell.
DATA_SOURCE_MAPPING = 'amazon-books-reviews:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2476732%2F4200454%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240904%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240904T070744Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D63734674518450559419f2a4efa8d962e101eae6ff601eb6722e120856e001f20dc34d82f4bc0999088ed7dc13ce77c42629842d3b3cf7b2196d78636f23c7d561bf9d3bb7cf40689029da97f4a4b16786ab44b54cce836526e81fea0583b73d1438bbaeded19dcaf9a3257fe2c952174910758e429f9f54c4debb04872b20c198872786bc559e01cf5dd47528b95e18e4fdfae368e570cf9fa084b2d19da3dc2679ca6c541f6aa1c3be0a6f8b8ebe5e60bd253e635fc4fbdc33baecefdaf2c14716ea11aeb2ade10e789f27bf6f2a0cc2813305474af958146705a9386ec7493c532f5b15051fcd38918378fd4e00e80629735cd757722c0e99009f6b654cf2'

# Directory that receives one sub-directory per downloaded data source.
KAGGLE_INPUT_PATH='/kaggle/input'
# Directory created alongside the input directory for notebook outputs.
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced anywhere else in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into KAGGLE_INPUT_PATH/<directory> and
# unpack it there. A source that cannot be fetched is reported and skipped so the
# remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a literal colon in the URL part can never
    # break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            content_length = fileres.headers['content-length']
            # The header may be absent or malformed; in that case fall back to a
            # plain byte counter instead of crashing on int(None).
            try:
                total_length = int(content_length)
            except (TypeError, ValueError):
                total_length = 0
            size_label = content_length if content_length is not None else 'unknown'
            print(f'Downloading {directory}, {size_label} bytes compressed')

            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                if total_length > 0:
                    # Clamp so an understated content-length cannot overflow the bar.
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()

            # Push buffered bytes to disk before the archive readers look at the
            # file; otherwise the tail of the archive may be missing.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Read from the already-open handle (rewound to the start) and bind
                # the archive to its own name so the `tarfile` module is not
                # shadowed for the rest of the session.
                tfile.seek(0)
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw review table from the directory the import cell extracts the
# 'amazon-books-reviews' source into, then preview the first rows.
ratings = pd.read_csv('/kaggle/input/amazon-books-reviews/Books_rating.csv')
ratings.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


In [None]:
# Show the dtype pandas inferred for each column of the raw table.
ratings.dtypes

Id                     object
Title                  object
Price                 float64
User_id                object
profileName            object
review/helpfulness     object
review/score          float64
review/time             int64
review/summary         object
review/text            object
dtype: object

In [None]:
# Remove the identifier/metadata columns, then count missing values per column.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps the cell
# idempotent: running it a second time no longer raises KeyError because the
# columns are already gone.
ratings = ratings.drop(columns=['Id', 'User_id', 'profileName', 'review/time'], errors='ignore')
ratings.isnull().sum()

Title                     208
Price                 2518829
review/helpfulness          0
review/score                0
review/summary            407
review/text                 8
dtype: int64

In [None]:
# (rows, columns) before rows with missing text fields are removed.
ratings.shape

(3000000, 6)

# Preprocessing

In [None]:
# Keep only the rows where the title, the review summary and the review text are
# all present, then re-check the per-column missing-value counts.
ratings = ratings.dropna(subset=['Title', 'review/summary', 'review/text'])
ratings.isna().sum()

Title                       0
Price                 2518250
review/helpfulness          0
review/score                0
review/summary              0
review/text                 0
dtype: int64

In [None]:
# (rows, columns) after the rows with missing text fields were removed.
ratings.shape

(2999377, 6)

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import numpy as np

# Single Porter stemmer instance shared by every preprocessing call.
stemmer = PorterStemmer()


def preprocess_text_vectorized(text_series):
    """Normalise a Series of raw text into space-joined, stemmed tokens.

    Steps, in order: lowercase, remove every character that is neither a word
    character nor whitespace, remove digit runs, tokenize with NLTK's
    ``word_tokenize``, drop NLTK English stop words, Porter-stem the remaining
    tokens and join them with single spaces.

    Parameters
    ----------
    text_series : pandas.Series
        Text values to clean. Missing values are treated as empty strings so
        they yield ``''`` instead of raising inside ``word_tokenize``.

    Returns
    -------
    pandas.Series
        One cleaned string per input element, on the same index.
    """
    # Guard against NaN/None and non-string values: `.str` methods would turn
    # them into NaN and the tokenizer would then fail on a float.
    text_series = text_series.fillna('').astype(str)

    text_series = text_series.str.lower()
    text_series = text_series.str.replace(r'[^\w\s]', '', regex=True)
    text_series = text_series.str.replace(r'\d+', '', regex=True)
    text_series = text_series.apply(word_tokenize)

    stop_words = set(stopwords.words('english'))
    # NOTE(review): punctuation is stripped before this filter runs, so stop
    # words containing an apostrophe (e.g. "don't") arrive here as "dont" and
    # are kept. Left unchanged because fixing it alters every cleaned column --
    # confirm whether that is intended.

    # Remove stop words and apply stemming
    text_series = text_series.apply(
        lambda tokens: [stemmer.stem(word) for word in tokens if word not in stop_words]
    )

    return text_series.str.join(' ')

# Split the dataset into chunks (adjust the number of splits as needed)
n_splits = 30  # You can adjust this number based on the size of your data and available memory

# Partition row positions instead of the DataFrame itself: calling
# np.array_split on a DataFrame goes through the deprecated DataFrame.swapaxes
# path in recent pandas and emits a FutureWarning. Splitting np.arange(len(...))
# yields the same chunk boundaries.
chunk_positions = np.array_split(np.arange(len(ratings)), n_splits)

# Process each chunk and save the result separately
for i, positions in enumerate(chunk_positions):
    # Work on an independent copy so the new columns are not assigned onto a
    # slice of `ratings`, and so only one processed chunk is held in memory.
    chunk = ratings.iloc[positions].copy()
    chunk['title'] = preprocess_text_vectorized(chunk['Title'])
    chunk['review/Summary'] = preprocess_text_vectorized(chunk['review/summary'])
    chunk['review/Text'] = preprocess_text_vectorized(chunk['review/text'])

    # Save each processed chunk to a separate CSV file
    chunk.to_csv(f"Cleaned_ratings_chunk_{i}.csv", index=False)
    print(f"Processed and saved chunk {i + 1} of {n_splits}")

# Read the per-chunk CSV files back in chunk order.
processed_chunks = [
    pd.read_csv(f"Cleaned_ratings_chunk_{i}.csv") for i in range(n_splits)
]

# Stack them into one frame with a fresh 0..n-1 index and persist the result.
# NOTE(review): with read_csv defaults, a cleaned field that ended up as an empty
# string presumably comes back as NaN after this CSV round trip -- verify before
# feeding the text columns to downstream steps.
processed_ratings = pd.concat(processed_chunks, ignore_index=True)
processed_ratings.to_csv("Cleaned_ratings_combined.csv", index=False)

print("All chunks combined and saved as 'Cleaned_ratings_combined.csv'")

  return bound(*args, **kwds)


Processed and saved chunk 1 of 30
Processed and saved chunk 2 of 30
Processed and saved chunk 3 of 30
Processed and saved chunk 4 of 30
Processed and saved chunk 5 of 30
Processed and saved chunk 6 of 30
Processed and saved chunk 7 of 30
Processed and saved chunk 8 of 30
Processed and saved chunk 9 of 30
Processed and saved chunk 10 of 30
Processed and saved chunk 11 of 30
Processed and saved chunk 12 of 30
Processed and saved chunk 13 of 30
Processed and saved chunk 14 of 30
Processed and saved chunk 15 of 30
Processed and saved chunk 16 of 30
Processed and saved chunk 17 of 30
Processed and saved chunk 18 of 30
Processed and saved chunk 19 of 30
Processed and saved chunk 20 of 30
Processed and saved chunk 21 of 30
Processed and saved chunk 22 of 30
Processed and saved chunk 23 of 30
Processed and saved chunk 24 of 30
Processed and saved chunk 25 of 30
Processed and saved chunk 26 of 30
Processed and saved chunk 27 of 30
Processed and saved chunk 28 of 30
Processed and saved chunk 29 

# Sentiment Analysis

In [None]:
import numpy as np
import pandas as pd

# Reload the combined, cleaned reviews. The preprocessing section writes this file
# relative to the current working directory, so read it from the same relative
# path; the absolute /kaggle/working path only matched when the notebook happened
# to run with that directory as its working directory.
ratings = pd.read_csv('Cleaned_ratings_combined.csv')
ratings.head()

Unnamed: 0,Title,Price,review/helpfulness,review/score,review/summary,review/text,title,review/Summary,review/Text
0,Its Only Art If Its Well Hung!,,7/7,4.0,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...,art well hung,nice collect juli strain imag,juli strain fan collect photo page worth nice ...
1,Dr. Seuss: American Icon,,10/10,5.0,Really Enjoyed It,I don't care much for Dr. Seuss but after read...,dr seuss american icon,realli enjoy,dont care much dr seuss read philip nel book c...
2,Dr. Seuss: American Icon,,10/11,5.0,Essential for every personal and Public Library,"If people become the books they read and if ""t...",dr seuss american icon,essenti everi person public librari,peopl becom book read child father man dr seus...
3,Dr. Seuss: American Icon,,7/7,4.0,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D...",dr seuss american icon,phlip nel give silli seuss seriou treatment,theodor seuss geisel aka quotdr seussquot one ...
4,Dr. Seuss: American Icon,,3/3,4.0,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...,dr seuss american icon,good academ overview,philip nel dr seuss american iconthi basic aca...


In [None]:
# Drop the cleaned title, the raw summary/text and the helpfulness column, leaving
# Title, Price, review/score and the cleaned review/Summary and review/Text.
# Reassigning with errors='ignore' keeps the cell re-runnable: a second execution
# no longer raises KeyError for columns that were already removed.
ratings = ratings.drop(
    columns=['title', 'review/summary', 'review/text', 'review/helpfulness'],
    errors='ignore',
)
ratings.shape

(2999377, 5)

In [None]:
# Remove duplicate rows
ratings = ratings.drop_duplicates(keep = 'first')

# Save the cleaned DataFrame to a new CSV file (optional)
ratings.to_csv("Cleaned_ratings_no_duplicates.csv", index=False)

# Display the first few rows of the cleaned DataFrame
ratings.head()

Unnamed: 0,Title,Price,review/score,review/Summary,review/Text
0,Its Only Art If Its Well Hung!,,4.0,nice collect juli strain imag,juli strain fan collect photo page worth nice ...
1,Dr. Seuss: American Icon,,5.0,realli enjoy,dont care much dr seuss read philip nel book c...
2,Dr. Seuss: American Icon,,5.0,essenti everi person public librari,peopl becom book read child father man dr seus...
3,Dr. Seuss: American Icon,,4.0,phlip nel give silli seuss seriou treatment,theodor seuss geisel aka quotdr seussquot one ...
4,Dr. Seuss: American Icon,,4.0,good academ overview,philip nel dr seuss american iconthi basic aca...


In [None]:
# (rows, columns) after duplicate rows were removed.
ratings.shape

(2647307, 5)