In [12]:
# !pip install -r requirements.txt

In [13]:
import pandas as pd

#read the book data from json file
books_df = pd.read_json('Books_small_10000.json', lines=True)
print('Number of rows  books: ', len(books_df))
#print unique values of the column 'overall'
print('Unique values of the column overall: ', books_df['overall'].unique())
books_df = books_df[['reviewText', 'overall']]

books_df.head()

Number of rows  books:  10000
Unique values of the column overall:  [5 3 4 2 1]


Unnamed: 0,reviewText,overall
0,"I bought both boxed sets, books 1-5. Really a...",5
1,I enjoyed this short book. But it was way way ...,3
2,I love Nicholas Sparks. I&#8217;ve read everyt...,4
3,I really enjoyed this adventure and look forwa...,4
4,It was a decent read.. typical story line. Not...,3


In [14]:
#read the Restaurant data from tsv file
restaurants_df = pd.read_csv('reviews.csv')
print('Number of rows in restaurants dataset: ', len(restaurants_df))
print('Unique values in Liked column: ', restaurants_df['Review'].unique())
#drop the rows with missing values NaN values at Review  column
restaurants_df = restaurants_df.dropna(subset=['Review'])

#drop the Recommends column

restaurants_df = restaurants_df.drop(['Recommends'], axis=1)

restaurants_df.head()

Number of rows in restaurants dataset:  16597
Unique values in Liked column:  [nan  5.  4.  3.  1.  2.]


Unnamed: 0,Review Text,Review
2,The man who is foodie like me for him arabian ...,5.0
4,This place is too much comfortable & food is d...,4.0
6,I check it out like a second home of mine...fe...,3.0
8,"you guys are awesome & I just love your ""offer...",5.0
10,Went there after referred by a friend. Tried t...,5.0


In [15]:
import sqlite3
 
#read the movies data from db file
connect = sqlite3.connect('IMDB_Movies_2021.db')

query = 'SELECT REVIEW,RATING FROM REVIEWS'
movies_df = pd.read_sql_query(query,connect)

#print the number of rows 
print('Number of rows in books dataset: ', len(books_df))
#print the unique values in the sentiment column
print('Unique values in sentiment column: ', movies_df['RATING'].unique())

#remove rows with nan values in RATING column
movies_df = movies_df.dropna(subset=['RATING'])

#rescale the ratings between 0 and 5 and rouding up to integer
movies_df['RATING'] = round(movies_df['RATING'] * (5/10))

print('Unique values in sentiment column: ', movies_df['RATING'].unique())

movies_df.head()

Number of rows in books dataset:  10000
Unique values in sentiment column:  [ 5.  8.  4.  6.  9.  7.  3.  1.  2. nan 10.]
Unique values in sentiment column:  [2. 4. 3. 0. 1. 5.]


Unnamed: 0,REVIEW,RATING
0,I don't get all the terrible reviews for this ...,2.0
1,I cannot believe anyone could give this film l...,4.0
2,Great White is not the worst way to spend 90 m...,2.0
3,Great White is as basic of a killer shark film...,2.0
4,"Terrible story, dialogue and CGI. The film has...",2.0


In [16]:
#create a pie chart to show the distribution of the data use plotly

import plotly.graph_objects as go

labels = ['Books', 'Restaurants', 'Movies']
values = [len(books_df), len(restaurants_df), len(movies_df)]


fig = go.Figure(data=[go.Pie(labels=labels, values=values)])

#create and center the title
fig.update_layout(title_text='Distribution of the data', title_x=0.5)

#update the legend position to be in the middle below the title
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="center",
    x=0.5
))


fig.show()

In [17]:
# drop all the column in the books_df except the reviewText and overall columns
# reviewerID	asin	reviewerName	helpful	reviewText	overall	summary	unixReviewTime	reviewTime


def convert_rating_to_sentiment(rating):
    
    #conv to int
    rating = int(rating)

    if rating <= 2:
        return 'negative'
    elif rating == 3:
        return 'neutral'
    else:
        return 'positive'


In [18]:

#convert the ratings to sentiment
books_df['overall'] = books_df['overall'].apply(convert_rating_to_sentiment)
restaurants_df['Review'] = restaurants_df['Review'].apply(convert_rating_to_sentiment)
movies_df['RATING'] = movies_df['RATING'].apply(convert_rating_to_sentiment)

In [19]:

#combine the dataframes by stacking them on top of each other with new header name review and sentiment

books_df = books_df.rename(columns={'reviewText': 'review', 'overall': 'sentiment'})
restaurants_df = restaurants_df.rename(columns={'Review Text': 'review', 'Review': 'sentiment'})
movies_df = movies_df.rename(columns={'REVIEW': 'review', 'RATING': 'sentiment'})

#add a new column to the dataframes to identify the source of the data
books_df['source'] = 'books'
restaurants_df['source'] = 'restaurants'
movies_df['source'] = 'movies'

#combine the dataframes

df_comb = pd.concat([books_df, restaurants_df, movies_df], ignore_index=True)
df_comb.head()

Unnamed: 0,review,sentiment,source
0,"I bought both boxed sets, books 1-5. Really a...",positive,books
1,I enjoyed this short book. But it was way way ...,neutral,books
2,I love Nicholas Sparks. I&#8217;ve read everyt...,positive,books
3,I really enjoyed this adventure and look forwa...,positive,books
4,It was a decent read.. typical story line. Not...,neutral,books


In [20]:
#save the combined dataframe to csv file
df_comb.to_csv('combined_train_sentiment.csv', index=False)

In [35]:
#visualize the df_comb dataframe using plotly and bar chart

import plotly.express as px

#create a bar chart to show the distribution of sentiment and source
fig = px.histogram(df_comb, x="sentiment", color="source", barmode="group", title='Distribution of sentiment and source', text_auto='percent')


fig.show()