In [1]:
import pandas as pd

In [18]:
# Load the raw review table; the bare last expression displays its row count
# before any cleaning is applied.
review = pd.read_csv(filepath_or_buffer='review-sentiment.csv')
review.shape[0]

92354

In [19]:
# Remove every row that has a missing value in any column (dropna's defaults,
# spelled out), mutating `review` in place, then display the remaining row count.
review.dropna(axis=0, how='any', inplace=True)
review.shape[0]

92325

In [20]:
import torch.nn.functional as F 
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Tokenizer and classifier are loaded from the same SST-2 fine-tuned checkpoint,
# so the name is written once and shared.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [21]:
# Plain Python list of the cleanedText column, in row order, for the inference loop.
data = review['cleanedText'].tolist()

In [22]:
# Score every cleaned review with the DistilBERT SST-2 classifier, one text at a time.
# `res` collects one softmax row (one probability per class) per review, in `data` order.
import torch
from tqdm import tqdm

model.eval()  # explicit: inference must run with dropout disabled
res = []
# Inference only: without no_grad() every forward pass records an autograd graph
# that is never used, which costs memory and time over ~92k reviews.
with torch.no_grad():
    for each in tqdm(data):
        # Texts longer than max_length tokens are truncated by the tokenizer.
        encoded = tokenizer(each, return_tensors="pt", padding=True, truncation=True, max_length=512)
        logits = model(input_ids=encoded.input_ids, attention_mask=encoded.attention_mask).logits
        # dim=1 normalises across the class axis of the (1, num_classes) logits.
        res.extend(F.softmax(logits, dim=1).tolist())

100%|██████████| 92325/92325 [32:18<00:00, 47.62it/s]  


In [25]:
import numpy as np

# Preview the collected class probabilities as a 2-D array (one row per review).
np.asarray(res)

array([[0.99207002, 0.00792995],
       [0.99525428, 0.00474573],
       [0.75462365, 0.2453763 ],
       ...,
       [0.9965564 , 0.0034436 ],
       [0.99223912, 0.00776081],
       [0.99709976, 0.00290033]])

In [26]:
# Sanity check: number of probability rows produced by the inference loop.
len(res)

92325

In [27]:
# Sanity check: row count of the review frame; it must equal len(res) for the
# probabilities to be attached as columns row-for-row.
len(review)

92325

In [32]:
review['distilbert_positive'] = np.array(res)[:, 0]

In [34]:
review['distilbert_negative'] = np.array(res)[:, 1]

In [36]:
# Preview the first rows, including the two DistilBERT probability columns.
review.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,cleanedText,nltk,distilbert_positive,distilbert_negative
0,A361M14PU2GUEG,700099867,"Angry Ryan ""Ryan A. Forrest""","[2, 2]",i had dirt 2 on xbox 360 and it was an okay ga...,4,DIRT 3,1308009600,2011-06-14,dirt xbox 360 okay game started playing games ...,0.9136,0.99207,0.00793
1,A1QJJU33VNC4S7,700099867,D@rkFX,"[0, 1]",i initially gave this one star because it was ...,4,A great game ruined by Microsoft's account man...,1352851200,2012-11-14,initially gave one star crashing constantly re...,0.4019,0.995254,0.004746
2,ANW6EGY12V5XS,700099867,Henri Savin,"[0, 0]",i have been playing car racing games since the...,5,A 5 stars winner!,1318982400,2011-10-19,playing car racing games since early beginning...,0.9926,0.754624,0.245376
3,AHT34BRYFBFT1,700099867,hewimp,"[0, 1]",dirt 3 on dvdi collect racing games so had to ...,5,Cars,1388275200,2013-12-29,dirt dvdi collect racing games add collections...,-0.34,0.988628,0.011372
4,A248LSBZT4P38V,700099867,Joseph R. Kennedy,"[0, 0]",i bought this and the key didn't work. it was...,1,"It might have been a good game, but I never fo...",1404086400,2014-06-30,bought key work gift recipient able solve prob...,0.7651,0.932582,0.067418


In [38]:
# Persist the scored reviews; the frame index is not written.
# NOTE(review): the target name has no ".csv" extension — confirm that downstream
# readers really expect a file called 'review_total'.
review.to_csv('review_total', index=False)

## Split Dataset

In [42]:
# Number of distinct reviewers; the split below samples rows per reviewer.
review['reviewerID'].nunique()

4939

In [43]:
# Renumber the rows 0..n-1 (discarding the old index) and expose that position
# as an explicit 'index' column, used later to separate train rows from test rows.
total = review.reset_index(drop=True).reset_index()
total.head()

Unnamed: 0,index,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,cleanedText,nltk,distilbert_positive,distilbert_negative
0,0,A361M14PU2GUEG,700099867,"Angry Ryan ""Ryan A. Forrest""","[2, 2]",i had dirt 2 on xbox 360 and it was an okay ga...,4,DIRT 3,1308009600,2011-06-14,dirt xbox 360 okay game started playing games ...,0.9136,0.99207,0.00793
1,1,A1QJJU33VNC4S7,700099867,D@rkFX,"[0, 1]",i initially gave this one star because it was ...,4,A great game ruined by Microsoft's account man...,1352851200,2012-11-14,initially gave one star crashing constantly re...,0.4019,0.995254,0.004746
2,2,ANW6EGY12V5XS,700099867,Henri Savin,"[0, 0]",i have been playing car racing games since the...,5,A 5 stars winner!,1318982400,2011-10-19,playing car racing games since early beginning...,0.9926,0.754624,0.245376
3,3,AHT34BRYFBFT1,700099867,hewimp,"[0, 1]",dirt 3 on dvdi collect racing games so had to ...,5,Cars,1388275200,2013-12-29,dirt dvdi collect racing games add collections...,-0.34,0.988628,0.011372
4,4,A248LSBZT4P38V,700099867,Joseph R. Kennedy,"[0, 0]",i bought this and the key didn't work. it was...,1,"It might have been a good game, but I never fo...",1404086400,2014-06-30,bought key work gift recipient able solve prob...,0.7651,0.932582,0.067418


In [44]:
# Hold out 4 reviews per reviewer as the test set.
# - random_state pins the draw so the train/test split is reproducible after a
#   kernel restart (the original sample() call was unseeded).
# - DataFrameGroupBy.sample returns the sampled rows with all columns intact,
#   avoiding groupby.apply, whose handling of the grouping column is deprecated.
# Note: as before, this raises ValueError if any reviewer has fewer than 4 reviews.
test = (
    total
    .groupby('reviewerID')
    .sample(n=4, random_state=42)
    .reset_index(drop=True)
)
len(test)

19756

In [46]:
# Every row whose positional 'index' value was not drawn into the test set
# becomes training data.
held_out = total['index'].isin(test['index'])
train = total.loc[~held_out]
len(train)

72569

In [50]:
# Write both splits to disk. The helper 'index' column is dropped first, and the
# frame index itself is not written.
test.drop(columns=['index']).to_csv('test.csv', index=False)
train.drop(columns=['index']).to_csv('train.csv', index=False)

In [51]:
# Display the training split (the rendering is truncated for a frame this long).
train

Unnamed: 0,index,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,cleanedText,nltk,distilbert_positive,distilbert_negative
0,0,A361M14PU2GUEG,0700099867,"Angry Ryan ""Ryan A. Forrest""","[2, 2]",i had dirt 2 on xbox 360 and it was an okay ga...,4,DIRT 3,1308009600,2011-06-14,dirt xbox 360 okay game started playing games ...,0.9136,0.992070,0.007930
1,1,A1QJJU33VNC4S7,0700099867,D@rkFX,"[0, 1]",i initially gave this one star because it was ...,4,A great game ruined by Microsoft's account man...,1352851200,2012-11-14,initially gave one star crashing constantly re...,0.4019,0.995254,0.004746
2,2,ANW6EGY12V5XS,0700099867,Henri Savin,"[0, 0]",i have been playing car racing games since the...,5,A 5 stars winner!,1318982400,2011-10-19,playing car racing games since early beginning...,0.9926,0.754624,0.245376
3,3,AHT34BRYFBFT1,0700099867,hewimp,"[0, 1]",dirt 3 on dvdi collect racing games so had to ...,5,Cars,1388275200,2013-12-29,dirt dvdi collect racing games add collections...,-0.3400,0.988628,0.011372
4,4,A248LSBZT4P38V,0700099867,Joseph R. Kennedy,"[0, 0]",i bought this and the key didn't work. it was...,1,"It might have been a good game, but I never fo...",1404086400,2014-06-30,bought key work gift recipient able solve prob...,0.7651,0.932582,0.067418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92318,92318,AJQB6I00X8WAI,B00KAI3KW2,SmokeyNYY,"[3, 6]",wow this really is an all in one gaming consol...,5,Just got it and really like it so far,1403568000,2014-06-24,wow really one gaming console multimedia devic...,0.9801,0.002039,0.997961
92320,92320,A2IGEPJJYKMOWK,B00KAI3KW2,UziHorowitz,"[2, 4]",xbox one was initially announced and revealed ...,5,An Incredibly Reliable & Advanced Game Console,1405728000,2014-07-19,xbox one initially announced revealed past jun...,0.9920,0.956405,0.043595
92321,92321,A1M19BGTJ5PI1I,B00KAI3KW2,V. T.,"[24, 113]",the titanfall bundle has been as low as $450. ...,2,"No kinect, worse value",1402272000,2014-06-09,titanfall bundle low 450 getting game kinect s...,0.9605,0.970660,0.029340
92322,92322,A1ICREREXO9J81,B00KHECZXO,Frustrated gamer,"[0, 1]",funny people on here are rating sellers that a...,5,this is for rating the system not the seller,1405814400,2014-07-20,funny people rating sellers ripping supposedly...,0.8608,0.996556,0.003444
