In [1]:
import pandas as pd
import altair as alt
import numpy as np
import math
import matplotlib.pyplot as plt

In [2]:
# Load the review data; the relative path is resolved against the notebook's working directory.
df = pd.read_csv('all_reviews.csv')
# Preview the first rows. Only a cell's final expression is rendered, so inspect
# `df.columns` in a cell of its own when the full column list is needed.
df.head()

Unnamed: 0,teacher_id,attendance,clarityColor,easyColor,helpColor,helpCount,id,notHelpCount,onlineClass,quality,...,rTimestamp,rWouldTakeAgain,sId,takenForCredit,teacher,teacherGrade,teacherRatingTags,unUsefulGrouping,usefulGrouping,overall_rating
0,1000064,,good,poor,good,1,25262261,0,,awesome,...,1445096719000,,1452,Yes,,,"['Inspirational', 'Amazing lectures ']",people,person,5.0
1,1000064,,good,average,good,1,13252233,0,,awesome,...,1181323024000,,1452,,,,[],people,person,5.0
2,1007241,Mandatory,average,good,average,3,27179836,0,,poor,...,1479119749000,No,1452,Yes,,,"['Tough grader', 'Lecture heavy']",people,people,2.7
3,1007241,,poor,good,poor,0,18307613,2,,awful,...,1300970532000,,1452,,,,[],people,people,2.7
4,1007241,,good,average,good,0,17041715,0,,awesome,...,1271948269000,,1452,,,,[],people,people,2.7


In [4]:
# Print the distinct values found in each column, one column per line, to get a
# quick feel for which fields are categorical and which contain NaN.
for column_name, column_values in df.items():
    print(column_name, ": ", column_values.unique())

teacher_id :  [1000064 1007241 1007871 ...   99449   99765   99965]
attendance :  [nan 'Mandatory' 'Not Mandatory']
clarityColor :  ['good' 'average' 'poor']
easyColor :  ['poor' 'average' 'good']
helpColor :  ['good' 'average' 'poor']
helpCount :  [  1   3   0   2   5   6   7  11  24  22  10  12   8   9  16   4  23  28
  26  14  19  15  13  39  17  36  27  38  18 160  20  21  31  34  33  76
  58  25  55  30  40 354  43  32  44  29  64 230  48  49  37  57  61  41
  42  46  50  67  45  35 105  65 179  88  60  72  54  75  69  70  92  66]
id :  [25262261 13252233 27179836 ...   538291   446406   435980]
notHelpCount :  [  0   2   1   3   4  11   8   5   6   9  13   7  17  18  20  14  16  12
  10  34  15  19  25  22  21 105 173 117  26  45  74  24  78 103  30  33
  29  39  23  38 112  41 141  35  44  36  28  31  37  27  72  32  47  42
  46 197  55  49  43  58  70  52  60  63  75  51  59 114  48 249 118  71
  85  96  91]
onlineClass :  [nan 'online']
quality :  ['awesome' 'poor' 'awful' 'go

In [6]:
# Turn the helpColor label into a numeric base weight for each review
# (good > average > poor) so it can be used later for prediction.
# Series.map leaves NaN for any label that is not a key of dict_map.
dict_map = {
    'good': 3,
    'average': 2,
    'poor': 1,
}
df['helpColor_weight'] = df['helpColor'].map(dict_map)
df

Unnamed: 0,teacher_id,attendance,clarityColor,easyColor,helpColor,helpCount,id,notHelpCount,onlineClass,quality,...,sId,takenForCredit,teacher,teacherGrade,teacherRatingTags,unUsefulGrouping,usefulGrouping,overall_rating,review_weight,helpColor_weight
0,1000064,,good,poor,good,1,25262261,0,,awesome,...,1452,Yes,,,"['Inspirational', 'Amazing lectures ']",people,person,5.0,3,3
1,1000064,,good,average,good,1,13252233,0,,awesome,...,1452,,,,[],people,person,5.0,3,3
2,1007241,Mandatory,average,good,average,3,27179836,0,,poor,...,1452,Yes,,,"['Tough grader', 'Lecture heavy']",people,people,2.7,2,2
3,1007241,,poor,good,poor,0,18307613,2,,awful,...,1452,,,,[],people,people,2.7,1,1
4,1007241,,good,average,good,0,17041715,0,,awesome,...,1452,,,,[],people,people,2.7,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63946,99965,,good,good,good,0,1408578,0,,good,...,1452,,,,[],people,people,3.0,3,3
63947,99965,,poor,average,average,0,642117,0,,average,...,1452,,,,[],people,people,3.0,2,2
63948,99965,,poor,average,poor,0,538291,0,,poor,...,1452,,,,[],people,people,3.0,1,1
63949,99965,,average,average,average,0,446406,0,,good,...,1452,,,,[],people,people,3.0,2,2


In [8]:
# Going one step further: helpCount and notHelpCount are taken to be the number of
# people who found a particular review helpful and unhelpful, respectively.
# Combining them with the base weight gives the overall review weight:
#
#   review_weight = helpColor_weight + helpCount - notHelpCount
#
# Example cases:
#   helpColor = good, helpCount = 0, notHelpCount = 0  ->  3 + 0 - 0 =  3
#   helpColor = poor, helpCount = 0, notHelpCount = 2  ->  1 + 0 - 2 = -1
#
# NOTE(review): the result can be negative (second example) and is not bounded
# above by the helpColor scale -- confirm downstream steps accept that.
df['review_weight'] = (
    df['helpColor_weight']
    .add(df['helpCount'])
    .sub(df['notHelpCount'])
)
df

Unnamed: 0,teacher_id,attendance,clarityColor,easyColor,helpColor,helpCount,id,notHelpCount,onlineClass,quality,...,sId,takenForCredit,teacher,teacherGrade,teacherRatingTags,unUsefulGrouping,usefulGrouping,overall_rating,review_weight,helpColor_weight
0,1000064,,good,poor,good,1,25262261,0,,awesome,...,1452,Yes,,,"['Inspirational', 'Amazing lectures ']",people,person,5.0,4,3
1,1000064,,good,average,good,1,13252233,0,,awesome,...,1452,,,,[],people,person,5.0,4,3
2,1007241,Mandatory,average,good,average,3,27179836,0,,poor,...,1452,Yes,,,"['Tough grader', 'Lecture heavy']",people,people,2.7,5,2
3,1007241,,poor,good,poor,0,18307613,2,,awful,...,1452,,,,[],people,people,2.7,-1,1
4,1007241,,good,average,good,0,17041715,0,,awesome,...,1452,,,,[],people,people,2.7,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63946,99965,,good,good,good,0,1408578,0,,good,...,1452,,,,[],people,people,3.0,3,3
63947,99965,,poor,average,average,0,642117,0,,average,...,1452,,,,[],people,people,3.0,2,2
63948,99965,,poor,average,poor,0,538291,0,,poor,...,1452,,,,[],people,people,3.0,1,1
63949,99965,,average,average,average,0,446406,0,,good,...,1452,,,,[],people,people,3.0,2,2
