In [1]:
# importing dependencies
import pandas as pd
import zipfile as zf
import numpy as np

In [2]:
# Read the CSV member of the zip archive
# 'hotel_reviews_for_NLP2_int_review_score.zip' into a DataFrame.
# Both the archive and the member stream are opened in `with` blocks so
# their file handles are closed as soon as the data has been parsed
# (the original left both open for the lifetime of the kernel).
with zf.ZipFile('PySpark_DataFile/hotel_reviews_for_NLP2_int_review_score.zip') as zip_file:
    with zip_file.open('hotel_reviews_for_NLP2_int_review_score.csv') as csv_member:
        hotel_reviews = pd.read_csv(csv_member)
hotel_reviews

Unnamed: 0,Review,Reviewer_Score_Int
0,I am so angry that i made this post available...,3
1,No real complaints the hotel was great great ...,8
2,Rooms are nice but for elderly a bit difficul...,7
3,My room was dirty and I was afraid to walk ba...,4
4,You When I booked with your company on line y...,7
...,...,...
512338,no trolly or staff to help you take the lugga...,7
512339,The hotel looks like 3 but surely not 4 Brea...,6
512340,The ac was useless It was a hot week in vienn...,2
512341,The rooms are enormous and really comfortable...,9


# Challenge 
* Storing Reviewer_Score as an int increased the accuracy; however, at 39.62% the accuracy is still low

# Solution: 
* Convert the values in the column 'Reviewer_Score_Int' into 3 categories - Bad, Average, and Good

## Categories
* Category 1: "Bad" - Score <= 4 
* Category 2: "Average" - Score 5-7
* Category 3: "Good" - Score >= 8

In [3]:
# Add a 'Review_Category' column holding an empty string in every row;
# the frame is modified in place and then displayed
hotel_reviews.loc[:, 'Review_Category'] = ''
hotel_reviews

Unnamed: 0,Review,Reviewer_Score_Int,Review_Category
0,I am so angry that i made this post available...,3,
1,No real complaints the hotel was great great ...,8,
2,Rooms are nice but for elderly a bit difficul...,7,
3,My room was dirty and I was afraid to walk ba...,4,
4,You When I booked with your company on line y...,7,
...,...,...,...
512338,no trolly or staff to help you take the lugga...,7,
512339,The hotel looks like 3 but surely not 4 Brea...,6,
512340,The ac was useless It was a hot week in vienn...,2,
512341,The rooms are enormous and really comfortable...,9,


In [4]:
# Label each row from its integer score:
#   'Bad'     when score <= 4
#   'Good'    when score >= 8
#   'Average' when 4 < score < 8
# Rows matching none of the conditions keep whatever value they already had.
score = hotel_reviews['Reviewer_Score_Int']
for mask, label in (
    (score <= 4, 'Bad'),
    (score >= 8, 'Good'),
    ((score > 4) & (score < 8), 'Average'),
):
    hotel_reviews.loc[mask, 'Review_Category'] = label
hotel_reviews

Unnamed: 0,Review,Reviewer_Score_Int,Review_Category
0,I am so angry that i made this post available...,3,Bad
1,No real complaints the hotel was great great ...,8,Good
2,Rooms are nice but for elderly a bit difficul...,7,Average
3,My room was dirty and I was afraid to walk ba...,4,Bad
4,You When I booked with your company on line y...,7,Average
...,...,...,...
512338,no trolly or staff to help you take the lugga...,7,Average
512339,The hotel looks like 3 but surely not 4 Brea...,6,Average
512340,The ac was useless It was a hot week in vienn...,2,Bad
512341,The rooms are enormous and really comfortable...,9,Good


In [5]:
# Remove the 'Reviewer_Score_Int' column and display the resulting frame
hotel_reviews = hotel_reviews.drop('Reviewer_Score_Int', axis='columns')
hotel_reviews

Unnamed: 0,Review,Review_Category
0,I am so angry that i made this post available...,Bad
1,No real complaints the hotel was great great ...,Good
2,Rooms are nice but for elderly a bit difficul...,Average
3,My room was dirty and I was afraid to walk ba...,Bad
4,You When I booked with your company on line y...,Average
...,...,...
512338,no trolly or staff to help you take the lugga...,Average
512339,The hotel looks like 3 but surely not 4 Brea...,Average
512340,The ac was useless It was a hot week in vienn...,Bad
512341,The rooms are enormous and really comfortable...,Good


In [6]:
# Write the frame as a CSV member inside a zip archive; the DataFrame
# index is not written to the file
compression_opts = {
    'method': 'zip',
    'archive_name': 'hotel_reviews_for_NLP3_3categories_review_score.csv',
}
hotel_reviews.to_csv(
    'PySpark_DataFile/hotel_reviews_for_NLP3_3categories_review_score.zip',
    compression=compression_opts,
    index=False,
)