In [1]:
import pandas as pd
from my_functions import *
from collections import Counter

import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load the raw training CSV. Column names are supplied explicitly via `names`,
# and rows containing NaN are dropped straight away.
train_csv_path = 'D:\\Tensorflow\\yelp_reviews\\yelp_polarity_train.csv'
df_train = pd.read_csv(train_csv_path, names=["sentiment", "review"]).dropna()

# Null check. NOTE: this is not the cell's last expression, so its result is
# computed and discarded rather than displayed.
df_train.isnull().any()

# Recode the labels 1 -> 0 and 2 -> 1
# (the notebook treats these as negative -> 0 and positive -> 1).
mapping = {1: 0, 2: 1}
df_train['sentiment'] = df_train['sentiment'].replace(mapping)

print("Train Dataset:", len(df_train))

# 'clean_review' starts out as the lower-cased string form of 'review';
# the original 'review' column is left untouched for later comparison.
df_train['clean_review'] = df_train['review'].astype('str').str.lower()
df_train



Train Dataset: 560000


Unnamed: 0,sentiment,review,clean_review
0,0,"Unfortunately, the frustration of being Dr. Go...","unfortunately, the frustration of being dr. go..."
1,1,Been going to Dr. Goldberg for over 10 years. ...,been going to dr. goldberg for over 10 years. ...
2,0,I don't know what Dr. Goldberg was like before...,i don't know what dr. goldberg was like before...
3,0,I'm writing this review to give you a heads up...,i'm writing this review to give you a heads up...
4,1,All the food is great here. But the best thing...,all the food is great here. but the best thing...
...,...,...,...
559995,1,Ryan was as good as everyone on yelp has claim...,ryan was as good as everyone on yelp has claim...
559996,1,Professional \nFriendly\nOn time AND affordabl...,professional \nfriendly\non time and affordabl...
559997,0,Phone calls always go to voicemail and message...,phone calls always go to voicemail and message...
559998,0,Looks like all of the good reviews have gone t...,looks like all of the good reviews have gone t...


In [3]:
# Text-cleaning helpers (star-imported from my_functions), listed in the exact
# order they are applied. Order matters: each step consumes the previous
# step's output.
cleaning_steps = [
    delete_newlines_and_tabs,
    strip_html_tags,
    remove_links,
    remove_dates_and_numbers,
    remove_special_characters,
    reduce_character_repeatation,
    remove_accented_characters,
    remove_whitespace,
    expand_contractions,
    remove_stopwords,
    remove_dots_commas,
]

# Run one full pass over the column per helper, overwriting 'clean_review'
# after every step.
for cleaning_step in cleaning_steps:
    df_train['clean_review'] = df_train['clean_review'].apply(cleaning_step)

df_train



Unnamed: 0,sentiment,review,clean_review
0,0,"Unfortunately, the frustration of being Dr. Go...",unfortunately frustration dr goldberg patient ...
1,1,Been going to Dr. Goldberg for over 10 years. ...,going dr goldberg years think one patients sta...
2,0,I don't know what Dr. Goldberg was like before...,know dr goldberg like moving arizona let tell ...
3,0,I'm writing this review to give you a heads up...,writing review give heads see doctor office st...
4,1,All the food is great here. But the best thing...,food great best thing wings wings simply fanta...
...,...,...,...
559995,1,Ryan was as good as everyone on yelp has claim...,ryan good everyone yelp claimed courteous know...
559996,1,Professional \nFriendly\nOn time AND affordabl...,professional friendly time affordable definite...
559997,0,Phone calls always go to voicemail and message...,phone calls always go voicemail messages retur...
559998,0,Looks like all of the good reviews have gone t...,looks like good reviews gone head place jason ...


In [4]:
# Count every whitespace-separated token across all cleaned reviews and
# report the number of distinct tokens (the vocabulary size).
results = Counter()
for review_tokens in df_train['clean_review'].str.split():
    results.update(review_tokens)
print(len(results))

280821


In [5]:
# Download the stopwords corpus
nltk.download('stopwords')

# Tokenize every review exactly once and keep the per-review token lists.
# The same tokens are used for both the frequency count and the filtering
# step below, so the two can never disagree, and no single giant string of
# all reviews has to be built and tokenized.
# NOTE(review): word_tokenize needs NLTK's Punkt tokenizer data, which the
# 'stopwords' download above does not provide - confirm it is installed.
tokenized_reviews = df_train['clean_review'].apply(word_tokenize)

# Corpus-wide frequency distribution over all tokens
fdist = FreqDist(token for tokens in tokenized_reviews for token in tokens)

# Define the threshold for uncommon words (tokens seen at most this many
# times in the whole corpus are dropped)
threshold = 6

# Get the set of uncommon words
uncommon_words = {word for word, freq in fdist.items() if freq <= threshold}

# Remove uncommon words from the review column, re-joining the surviving
# tokens with single spaces
df_train['clean_review'] = tokenized_reviews.apply(
    lambda tokens: ' '.join(token for token in tokens if token not in uncommon_words)
)

# Display the modified DataFrame
df_train

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nasra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,sentiment,review,clean_review
0,0,"Unfortunately, the frustration of being Dr. Go...",unfortunately frustration dr goldberg patient ...
1,1,Been going to Dr. Goldberg for over 10 years. ...,going dr goldberg years think one patients sta...
2,0,I don't know what Dr. Goldberg was like before...,know dr goldberg like moving arizona let tell ...
3,0,I'm writing this review to give you a heads up...,writing review give heads see doctor office st...
4,1,All the food is great here. But the best thing...,food great best thing wings wings simply fanta...
...,...,...,...
559995,1,Ryan was as good as everyone on yelp has claim...,ryan good everyone yelp claimed courteous know...
559996,1,Professional \nFriendly\nOn time AND affordabl...,professional friendly time affordable definite...
559997,0,Phone calls always go to voicemail and message...,phone calls always go voicemail messages retur...
559998,0,Looks like all of the good reviews have gone t...,looks like good reviews gone head place jason ...


In [7]:
# Check the first 10 samples (raw vs. processed).
# head(10) selects rows by position, so this keeps working when dropna() has
# removed some of the index labels 0-9 (a label lookup such as
# df_train['review'][i] would raise KeyError) and when the frame holds fewer
# than 10 rows.
preview = df_train[['review', 'clean_review']].head(10)
for raw_text, clean_text in preview.itertuples(index=False, name=None):
    print('Raw Text:', raw_text)
    print()
    print('Clean Text:', clean_text)
    print('-----------')
    print()

Raw Text: Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars.

Clean Text: unfortunately frustration dr goldberg patient repeat experience many doctors nyc good doctor terrible staff seems staff simply never answers phone usually takes hours repeated calling get answer time wants deal run problem many doctors get office workers patients medical needs anyone answering phone incomprehensible work aggravatio

In [11]:
# Replace the raw text with the cleaned text: discard the original 'review'
# column, then let 'clean_review' take over the 'review' name.
df_train = (
    df_train
    .drop(columns=['review'])
    .rename(columns={"clean_review": "review"})
)
df_train

Unnamed: 0,sentiment,review
0,0,unfortunately frustration dr goldberg patient ...
1,1,going dr goldberg years think one patients sta...
2,0,know dr goldberg like moving arizona let tell ...
3,0,writing review give heads see doctor office st...
4,1,food great best thing wings wings simply fanta...
...,...,...
559995,1,ryan good everyone yelp claimed courteous know...
559996,1,professional friendly time affordable definite...
559997,0,phone calls always go voicemail messages retur...
559998,0,looks like good reviews gone head place jason ...


In [12]:
# Persist the cleaned frame as CSV, leaving the row index out of the file.
clean_csv_path = 'D:\\Tensorflow\\yelp_reviews\\yelp_polarity_train_clean.csv'
df_train.to_csv(clean_csv_path, index=False)