In [1]:
import pandas as pd 
df = pd.read_csv("../data/tripadvisor_hotel_reviews.csv")
df

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5
20487,great location price view hotel great quick pl...,4
20488,"ok just looks nice modern outside, desk staff ...",2
20489,hotel theft ruined vacation hotel opened sept ...,1


In [2]:
import sys
from pathlib import Path
sys.path.append(str(Path("../utils/preprocessing.py").resolve().parent.parent / "utils"))
from preprocessing import preprocess_text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB


In [4]:
df["Rating"].unique()

array([4, 2, 3, 5, 1])

In [5]:
df.duplicated().sum() 

np.int64(0)

In [6]:
df.isna().sum()

Review    0
Rating    0
dtype: int64

In [7]:
df['Review'] = df['Review'].astype(str)

In [8]:
df['cleaned_review'] = df['Review'].apply(preprocess_text)
df

Unnamed: 0,Review,Rating,cleaned_review
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice room experience hotel monaco seattle good...
3,"unique, great stay, wonderful time hotel monac...",5,unique great stay wonderful time hotel monaco ...
4,"great stay great stay, went seahawk game aweso...",5,great stay great stay went seahawk game awesom...
...,...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5,best kept secret rd time staying charm star ca...
20487,great location price view hotel great quick pl...,4,great location price view hotel great quick pl...
20488,"ok just looks nice modern outside, desk staff ...",2,ok look nice modern outside desk staff nt part...
20489,hotel theft ruined vacation hotel opened sept ...,1,hotel theft ruined vacation hotel opened sept ...


In [None]:
from nltk.tokenize import sent_tokenize

def chunk_text(text, max_words=500):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ''
    
    for sentence in sentences:
        if len(current_chunk.split()) + len(sentence.split()) <= max_words:
            current_chunk += ' ' + sentence
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

chunk_list = []

for idx, row in df.iterrows():
    review_chunks = chunk_text(row['cleaned_review'])
    for chunk in review_chunks:
        chunk_list.append({
            'review_chunk': chunk,
            'original_index': idx,
            'rating': row['Rating']
        })

chunk_df = pd.DataFrame(chunk_list)

In [10]:
chunk_df.to_csv('../data/processed_chunks.csv', index=False)

In [13]:
data = pd.read_csv("../data/processed_chunks.csv")
data 

Unnamed: 0,review_chunk,original_index,rating
0,nice hotel expensive parking got good deal sta...,0,4
1,ok nothing special charge diamond member hilto...,1,2
2,nice room experience hotel monaco seattle good...,2,3
3,unique great stay wonderful time hotel monaco ...,3,5
4,great stay great stay went seahawk game awesom...,4,5
...,...,...,...
20658,great location price view hotel great quick pl...,20487,4
20659,ok look nice modern outside desk staff nt part...,20488,2
20660,,20489,1
20661,hotel theft ruined vacation hotel opened sept ...,20489,1


In [14]:
data.isna().sum()

review_chunk      172
original_index      0
rating              0
dtype: int64

In [16]:
data.dropna(inplace = True)
data.isna().sum()

review_chunk      0
original_index    0
rating            0
dtype: int64

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20491 entries, 0 to 20662
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   review_chunk    20491 non-null  object
 1   original_index  20491 non-null  int64 
 2   rating          20491 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 640.3+ KB


In [17]:
data.to_csv('../data/processed_chunks.csv', index=False)