In [1]:
# importing dependencies
import pandas as pd
import zipfile as zf

In [2]:
# Read 'data_ready_for_PySpark_NLP.csv' out of its zip archive into a DataFrame.
# Both the archive and the member stream are opened with context managers so
# their file handles are released as soon as the frame has been parsed.
with zf.ZipFile('PySpark_DataFile/data_ready_for_PySpark_NLP.zip') as zip_file:
    with zip_file.open('data_ready_for_PySpark_NLP.csv') as csv_member:
        hotel_reviews = pd.read_csv(csv_member)
hotel_reviews

Unnamed: 0,Review,Reviewer_Score
0,I am so angry that i made this post available...,2.9
1,No real complaints the hotel was great great ...,7.5
2,Rooms are nice but for elderly a bit difficul...,7.1
3,My room was dirty and I was afraid to walk ba...,3.8
4,You When I booked with your company on line y...,6.7
...,...,...
512338,no trolly or staff to help you take the lugga...,7.0
512339,The hotel looks like 3 but surely not 4 Brea...,5.8
512340,The ac was useless It was a hot week in vienn...,2.5
512341,The rooms are enormous and really comfortable...,8.8


# Challenge 
* Choosing 3 categories improved the accuracy of our NLP algorithm to 77.63 %; now trying to improve it further

# Solution: 
* Adding a new column 'Reviewer_Sentiment' that will have the values 'positive' and 'negative'
* Sentiment 'Positive' = Reviewer_Score >= 5
* Sentiment 'Negative' = Reviewer_Score <= 4.9

In [3]:
# Add an empty-string placeholder column 'Reviewer_Sentiment'; the
# 'positive' / 'negative' labels are written into it in the next cell.
hotel_reviews = hotel_reviews.assign(Reviewer_Sentiment='')
hotel_reviews

Unnamed: 0,Review,Reviewer_Score,Reviewer_Sentiment
0,I am so angry that i made this post available...,2.9,
1,No real complaints the hotel was great great ...,7.5,
2,Rooms are nice but for elderly a bit difficul...,7.1,
3,My room was dirty and I was afraid to walk ba...,3.8,
4,You When I booked with your company on line y...,6.7,
...,...,...,...
512338,no trolly or staff to help you take the lugga...,7.0,
512339,The hotel looks like 3 but surely not 4 Brea...,5.8,
512340,The ac was useless It was a hot week in vienn...,2.5,
512341,The rooms are enormous and really comfortable...,8.8,


In [4]:
# Label every review from its score:
#   Reviewer_Score <  5.0 -> 'negative'
#   Reviewer_Score >= 5.0 -> 'positive'
# The negative mask is the strict complement of the positive threshold, so a
# score between 4.9 and 5.0 (e.g. 4.95) cannot fall through both conditions
# and keep the empty placeholder value.
# Rows whose score is missing (NaN) match neither mask and are left unchanged.
is_positive = hotel_reviews["Reviewer_Score"] >= 5.0
is_negative = hotel_reviews["Reviewer_Score"] < 5.0

hotel_reviews.loc[is_negative, "Reviewer_Sentiment"] = "negative"
hotel_reviews.loc[is_positive, "Reviewer_Sentiment"] = "positive"

hotel_reviews

Unnamed: 0,Review,Reviewer_Score,Reviewer_Sentiment
0,I am so angry that i made this post available...,2.9,negative
1,No real complaints the hotel was great great ...,7.5,positive
2,Rooms are nice but for elderly a bit difficul...,7.1,positive
3,My room was dirty and I was afraid to walk ba...,3.8,negative
4,You When I booked with your company on line y...,6.7,positive
...,...,...,...
512338,no trolly or staff to help you take the lugga...,7.0,positive
512339,The hotel looks like 3 but surely not 4 Brea...,5.8,positive
512340,The ac was useless It was a hot week in vienn...,2.5,negative
512341,The rooms are enormous and really comfortable...,8.8,positive


In [5]:
# Remove the numeric 'Reviewer_Score' column (the sentiment label replaces it)
# and give the index the name 'Review_Id'.
hotel_reviews = (
    hotel_reviews
    .drop(columns='Reviewer_Score')
    .rename_axis('Review_Id')
)
hotel_reviews

Unnamed: 0_level_0,Review,Reviewer_Sentiment
Review_Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,I am so angry that i made this post available...,negative
1,No real complaints the hotel was great great ...,positive
2,Rooms are nice but for elderly a bit difficul...,positive
3,My room was dirty and I was afraid to walk ba...,negative
4,You When I booked with your company on line y...,positive
...,...,...
512338,no trolly or staff to help you take the lugga...,positive
512339,The hotel looks like 3 but surely not 4 Brea...,positive
512340,The ac was useless It was a hot week in vienn...,negative
512341,The rooms are enormous and really comfortable...,positive


In [6]:
# Write the labelled reviews as a single CSV inside a zip archive.
# The index is written too, so 'Review_Id' becomes the first CSV column.
csv_member_name = 'hotel_reviews_for_NLP4_Positive_Negative_review_score.csv'
zip_target = 'PySpark_DataFile/hotel_reviews_for_NLP4_Positive_Negative_review_score.zip'
compression_opts = {'method': 'zip', 'archive_name': csv_member_name}
hotel_reviews.to_csv(
    zip_target,
    index=True,
    compression=compression_opts,
)