In [1]:
# importing dependencies
import pandas as pd
import zipfile as zf
import numpy as np

In [2]:
# Load the review data from 'data_ready_for_PySpark_NLP.zip' into a DataFrame.
# The context managers close the archive and the member stream as soon as the
# CSV has been parsed, so no file handles stay open for the rest of the session.
with zf.ZipFile('PySpark_DataFile/data_ready_for_PySpark_NLP.zip') as zip_file:
    with zip_file.open('data_ready_for_PySpark_NLP.csv') as csv_file:
        hotel_reviews = pd.read_csv(csv_file)
# NOTE(review): the 'Unnamed: 0' column looks like a row index written by an
# earlier export -- confirm whether anything downstream needs it.
hotel_reviews

Unnamed: 0,Review,Reviewer_Score
0,I am so angry that i made this post available...,2.9
1,No real complaints the hotel was great great ...,7.5
2,Rooms are nice but for elderly a bit difficul...,7.1
3,My room was dirty and I was afraid to walk ba...,3.8
4,You When I booked with your company on line y...,6.7
...,...,...
512338,no trolly or staff to help you take the lugga...,7.0
512339,The hotel looks like 3 but surely not 4 Brea...,5.8
512340,The ac was useless It was a hot week in vienn...,2.5
512341,The rooms are enormous and really comfortable...,8.8


# Challenge 
* Keeping Reviewer_Score as a float produced too wide a range of output values for the PySpark algorithm, which resulted in very low model accuracy: 20.77%

# Solution: 
* Round Reviewer_Score and store it as an int

In [3]:
# Add 'Reviewer_Score_Int': the review score rounded to zero decimals, cast to int.
# As the displayed rows show, exact halves go to the nearest even value
# (2.5 -> 2 while 7.5 -> 8).
rounded_score = hotel_reviews['Reviewer_Score'].round(0)
hotel_reviews['Reviewer_Score_Int'] = rounded_score.astype(int)
hotel_reviews

Unnamed: 0,Review,Reviewer_Score,Reviewer_Score_Int
0,I am so angry that i made this post available...,2.9,3
1,No real complaints the hotel was great great ...,7.5,8
2,Rooms are nice but for elderly a bit difficul...,7.1,7
3,My room was dirty and I was afraid to walk ba...,3.8,4
4,You When I booked with your company on line y...,6.7,7
...,...,...,...
512338,no trolly or staff to help you take the lugga...,7.0,7
512339,The hotel looks like 3 but surely not 4 Brea...,5.8,6
512340,The ac was useless It was a hot week in vienn...,2.5,2
512341,The rooms are enormous and really comfortable...,8.8,9


In [4]:
# Remove the float 'Reviewer_Score' column; only the integer score is kept.
hotel_reviews = hotel_reviews.drop('Reviewer_Score', axis=1)
hotel_reviews


Unnamed: 0,Review,Reviewer_Score_Int
0,I am so angry that i made this post available...,3
1,No real complaints the hotel was great great ...,8
2,Rooms are nice but for elderly a bit difficul...,7
3,My room was dirty and I was afraid to walk ba...,4
4,You When I booked with your company on line y...,7
...,...,...
512338,no trolly or staff to help you take the lugga...,7
512339,The hotel looks like 3 but surely not 4 Brea...,6
512340,The ac was useless It was a hot week in vienn...,2
512341,The rooms are enormous and really comfortable...,9


In [6]:
# Write the frame, without the pandas index, as a single CSV inside a zip archive.
compression_opts = {
    'method': 'zip',
    'archive_name': 'hotel_reviews_for_NLP2_int_review_score.csv',
}
hotel_reviews.to_csv(
    'PySpark_DataFile/hotel_reviews_for_NLP2_int_review_score.zip',
    compression=compression_opts,
    index=False,
)