In [1]:
# Import dependencies
import pandas as pd
import numpy as np

In [2]:
# Read in vine table as dataframe
vine_df = pd.read_csv('vine_table.csv')
vine_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
0,REAKC26P07MDN,5.0,0.0,0.0,N,Y
1,R3NU7OMZ4HQIEG,2.0,0.0,1.0,N,Y
2,R14QJW3XF8QO1P,5.0,0.0,0.0,N,Y
3,R2HB7AX0394ZGY,5.0,0.0,0.0,N,Y
4,RGKMPDQGSAHR3,5.0,0.0,0.0,N,Y


In [3]:
# Check datatypes
vine_df.dtypes

review_id             object
star_rating          float64
helpful_votes        float64
total_votes          float64
vine                  object
verified_purchase     object
dtype: object

In [4]:
# Get all rows with total votes greater or equal to 20
filter_vine_df = vine_df[vine_df['total_votes']>=20]

In [5]:
# Find the percentage of total votes that are helpful

filter_vine_df['helpful/total'] = filter_vine_df['helpful_votes']/filter_vine_df['total_votes']
filter_vine_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase,helpful/total
128,R21KC552Y6HL8X,1.0,27.0,31.0,N,Y,0.870968
161,RX9WC9FTIR1XR,5.0,25.0,25.0,N,Y,1.0
256,RGDCOU1KBHMNG,3.0,29.0,31.0,N,Y,0.935484
267,RVTYWID2TPMMY,2.0,35.0,42.0,N,Y,0.833333
719,R2CMPZ5VESGRLY,4.0,27.0,28.0,N,Y,0.964286


In [6]:
# Find rows where helpful votes divided by total votes is greater than or equal to 50%
filter_vine_df = filter_vine_df[filter_vine_df['helpful/total'] >= 0.5]
filter_vine_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase,helpful/total
128,R21KC552Y6HL8X,1.0,27.0,31.0,N,Y,0.870968
161,RX9WC9FTIR1XR,5.0,25.0,25.0,N,Y,1.0
256,RGDCOU1KBHMNG,3.0,29.0,31.0,N,Y,0.935484
267,RVTYWID2TPMMY,2.0,35.0,42.0,N,Y,0.833333
719,R2CMPZ5VESGRLY,4.0,27.0,28.0,N,Y,0.964286


In [7]:
# Remove the extra column
filter_vine_df.pop('helpful/total')

128        0.870968
161        1.000000
256        0.935484
267        0.833333
719        0.964286
             ...   
2643577    0.770492
2643591    0.983051
2643614    0.941176
2643616    0.857143
2643618    0.850000
Name: helpful/total, Length: 38010, dtype: float64

In [8]:
# Find all reviews that are not part of vine program
not_vine_df = filter_vine_df[filter_vine_df['vine'] == 'N']
not_vine_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
128,R21KC552Y6HL8X,1.0,27.0,31.0,N,Y
161,RX9WC9FTIR1XR,5.0,25.0,25.0,N,Y
256,RGDCOU1KBHMNG,3.0,29.0,31.0,N,Y
267,RVTYWID2TPMMY,2.0,35.0,42.0,N,Y
719,R2CMPZ5VESGRLY,4.0,27.0,28.0,N,Y


In [9]:
# Find all reviews that are part of vine program
vine_df  = filter_vine_df[filter_vine_df['vine'] == 'Y']
vine_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
8547,R3A71VR1JZD8WF,2.0,27.0,30.0,Y,N
10246,R16OMUJIGI18JZ,5.0,72.0,72.0,Y,N
25168,R3TS8ZP2FHQ9XR,5.0,39.0,42.0,Y,N
46422,R2MHP919VZN7DI,5.0,29.0,30.0,Y,N
66446,RD2BCTVS59A5L,2.0,20.0,20.0,Y,N


In [10]:
# Count the total number of votes for both data sets
vine_total = vine_df['total_votes'].count()
not_vine_total = not_vine_df['total_votes'].count()

In [11]:
print(vine_total)

170


In [12]:
print(not_vine_total)

37840


In [13]:
# Count up 5 star reviews for both data frames
vine5star = len(vine_df[vine_df['star_rating']==5])
notvine5star = len(not_vine_df[not_vine_df['star_rating']==5]) 

In [14]:
print(vine5star)

65


In [16]:
print(notvine5star)

20612


In [17]:
# Find percentage of 5 star reviews for vine reviews 
print(vine5star/vine_total)

0.38235294117647056


In [18]:
# Find percentage of 5 star reviews for unpaid reviews 
print(notvine5star/not_vine_total)

0.5447145877378435
