In [1]:
# importing dependencies
import pandas as pd
import zipfile as zf
import numpy as np

In [2]:
# Load the integer-score hotel reviews CSV from its zip archive into a DataFrame.
# The context managers close both the archive and the member stream as soon as
# the CSV has been read, instead of leaving the file handles open in the kernel.
with zf.ZipFile('PySpark_DataFile/hotel_reviews_for_NLP2_int_review_score.zip') as zip_file:
    with zip_file.open('hotel_reviews_for_NLP2_int_review_score.csv') as csv_member:
        hotel_reviews = pd.read_csv(csv_member)
hotel_reviews

Unnamed: 0,Review,Reviewer_Score_Int
0,I am so angry that i made this post available...,3
1,No real complaints the hotel was great great ...,8
2,Rooms are nice but for elderly a bit difficul...,7
3,My room was dirty and I was afraid to walk ba...,4
4,You When I booked with your company on line y...,7
...,...,...
512338,no trolly or staff to help you take the lugga...,7
512339,The hotel looks like 3 but surely not 4 Brea...,6
512340,The ac was useless It was a hot week in vienn...,2
512341,The rooms are enormous and really comfortable...,9


# Challenge 
* Storing Reviewer_Score as an integer increased the accuracy; however, at 39.62% the accuracy is still low

# Solution: 
* Convert the values in the column 'Reviewer_Score_Int' to a 5-star rating (1 to 5)

## Categories
* Star Rating 1: Integer Scores 0-2
* Star Rating 2: Integer Scores 3-4
* Star Rating 3: Integer Scores 5-6
* Star Rating 4: Integer Scores 7-8
* Star Rating 5: Integer Scores 9-10

In [3]:
# Add a 'Star_Rating' column in which every row starts out as an empty string
hotel_reviews.loc[:, 'Star_Rating'] = ''
hotel_reviews

Unnamed: 0,Review,Reviewer_Score_Int,Star_Rating
0,I am so angry that i made this post available...,3,
1,No real complaints the hotel was great great ...,8,
2,Rooms are nice but for elderly a bit difficul...,7,
3,My room was dirty and I was afraid to walk ba...,4,
4,You When I booked with your company on line y...,7,
...,...,...,...
512338,no trolly or staff to help you take the lugga...,7,
512339,The hotel looks like 3 but surely not 4 Brea...,6,
512340,The ac was useless It was a hot week in vienn...,2,
512341,The rooms are enormous and really comfortable...,9,


In [4]:
# Translate each integer score into its star-rating label:
#   2 or less -> '1', 3-4 -> '2', 5-6 -> '3', 7-8 -> '4', 9 or more -> '5'
score = hotel_reviews['Reviewer_Score_Int']
star_masks = {
    '1': score <= 2,
    '2': score.between(3, 4),  # between() includes both end points
    '3': score.between(5, 6),
    '4': score.between(7, 8),
    '5': score >= 9,
}
# The masks never overlap, so the order in which they are applied is irrelevant
for star, mask in star_masks.items():
    hotel_reviews.loc[mask, 'Star_Rating'] = star
hotel_reviews

Unnamed: 0,Review,Reviewer_Score_Int,Star_Rating
0,I am so angry that i made this post available...,3,2
1,No real complaints the hotel was great great ...,8,4
2,Rooms are nice but for elderly a bit difficul...,7,4
3,My room was dirty and I was afraid to walk ba...,4,2
4,You When I booked with your company on line y...,7,4
...,...,...,...
512338,no trolly or staff to help you take the lugga...,7,4
512339,The hotel looks like 3 but surely not 4 Brea...,6,3
512340,The ac was useless It was a hot week in vienn...,2,1
512341,The rooms are enormous and really comfortable...,9,5


In [5]:
# Remove the integer score column 'Reviewer_Score_Int' from the frame
hotel_reviews = hotel_reviews.drop('Reviewer_Score_Int', axis='columns')
hotel_reviews

Unnamed: 0,Review,Star_Rating
0,I am so angry that i made this post available...,2
1,No real complaints the hotel was great great ...,4
2,Rooms are nice but for elderly a bit difficul...,4
3,My room was dirty and I was afraid to walk ba...,2
4,You When I booked with your company on line y...,4
...,...,...
512338,no trolly or staff to help you take the lugga...,4
512339,The hotel looks like 3 but surely not 4 Brea...,3
512340,The ac was useless It was a hot week in vienn...,1
512341,The rooms are enormous and really comfortable...,5


In [6]:
# Write the frame as a single CSV member inside a zip archive, leaving out the index
compression_opts = {
    'method': 'zip',
    'archive_name': 'hotel_reviews_for_NLP4_Star_Rating.csv',
}
hotel_reviews.to_csv(
    'PySpark_DataFile/hotel_reviews_for_NLP4_Star_Rating.zip',
    compression=compression_opts,
    index=False,
)