In [19]:
!pip install -U -q PyDrive
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# choose a local (colab) directory to store the data.
local_download_path = os.path.expanduser('~/data')
try:
  os.makedirs(local_download_path)
except: pass

# 2. Auto-iterate using the query syntax
#    https://developers.google.com/drive/v2/web/search-parameters
# Upload the text file to your google drive
# For example the link to your google drive is this https://drive.google.com/drive/u/1/folders/18nPCEXEO6s2baDkEzRljoTo0OLC1QAOX
# Copy the last portion "18nPCEXEO6s2baDkEzRljoTo0OLC1QAOX" and paste it  {'q': "'18nPCEXEO6s2baDkEzRljoTo0OLC1QAOX' in parents"}).GetList() here
file_list = drive.ListFile(
    {'q': "'18nPCEXEO6s2baDkEzRljoTo0OLC1QAOX' in parents"}).GetList() 

for f in file_list:
  # 3. Create & download by id.
  print('title: %s, id: %s' % (f['title'], f['id']))
  fname = os.path.join(local_download_path, f['title'])
  print('downloading to {}'.format(fname))
  f_ = drive.CreateFile({'id': f['id']})
  f_.GetContentFile(fname)


with open(fname, 'r') as f:
  corpus = f.read()
  print(corpus)

title: Movies_TV.txt, id: 1yDGJOC-HmelzTqIzVwI1fF5yHqPaH4dx
downloading to /root/data/Movies_TV.txt
Domain	Label	Rating	Review
Movies_TV	POS	5	my boy love this film . sometime my youngest get scared of Captain Hook . the story be great , and I think little boy can really connect to it . beautiful , fun music as well .
Movies_TV	NEU	3	on my disk the last scene of episode 2 : New Earth be miss . instead the disk cut to a gory scene from what appear to be `` the Texas Chainsaw Massacure '' . look at the dvd menue I see that the last scene be title `` Time to die '' . I can only guess that the manufacturer of the dvd insert a simularly title scene from another movie . parent be warn , the scene in question be NOT for child .
Movies_TV	POS	4	I have a 4yr old son and he love this cartoon , and I too i buy it because of the story its so intresting .
Movies_TV	POS	5	this sequal be wonderful . the animation be excellent , though other may comment that it be not correct to the original style . I

In [20]:
corpus = corpus.lower()
corpus



In [21]:
docs = corpus.split('\n')
docs

['domain\tlabel\trating\treview',
 'movies_tv\tpos\t5\tmy boy love this film . sometime my youngest get scared of captain hook . the story be great , and i think little boy can really connect to it . beautiful , fun music as well .',
 "movies_tv\tneu\t3\ton my disk the last scene of episode 2 : new earth be miss . instead the disk cut to a gory scene from what appear to be `` the texas chainsaw massacure '' . look at the dvd menue i see that the last scene be title `` time to die '' . i can only guess that the manufacturer of the dvd insert a simularly title scene from another movie . parent be warn , the scene in question be not for child .",
 'movies_tv\tpos\t4\ti have a 4yr old son and he love this cartoon , and i too i buy it because of the story its so intresting .',
 "movies_tv\tpos\t5\tthis sequal be wonderful . the animation be excellent , though other may comment that it be not correct to the original style . i hate to be the one to point this out , but we be no longer in the 

In [22]:
docs.remove(docs[0])

In [24]:
docs.remove(docs[-1])

In [27]:
len(docs)


1000

In [28]:
docs[0]

'movies_tv\tpos\t5\tmy boy love this film . sometime my youngest get scared of captain hook . the story be great , and i think little boy can really connect to it . beautiful , fun music as well .'

In [31]:
X = []
label = []
rating = []

for doc in docs:
    _, l, r, t = doc.split('\t')
    X.append(t)
    label.append(l)
    rating.append(r)

In [35]:
set(rating)
print(X)
print(label)
print(rating)

['pos', 'neu', 'pos', 'pos', 'pos', 'neu', 'pos', 'pos', 'neu', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'neu', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'neu', 'neg', 'neu', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'neu', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'neu', 'neg', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'neu', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'neu', 'pos', 'neg', 'neg', 'pos', 'pos', 'pos', 'neu', 'pos', 'pos', 'neu', 'pos', 'pos', 'neu', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'pos', 'neg', 'neu', 'pos', 'pos', 'pos', 'pos'

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
matrix_X = vec.fit_transform(X)

In [39]:
matrix_X

<1000x7800 sparse matrix of type '<class 'numpy.int64'>'
	with 74549 stored elements in Compressed Sparse Row format>

In [40]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB 

knn = KNeighborsClassifier(n_neighbors = 1)
dt = DecisionTreeClassifier()
nb = MultinomialNB()

In [41]:
knn.fit(matrix_X[:-200], rating[:-200])
dt.fit(matrix_X[:-200], rating[:-200])
nb.fit(matrix_X[:-200], rating[:-200])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [46]:
rating_knn = knn.predict(matrix_X[-200:])
rating_dt = dt.predict(matrix_X[-200:])
rating_nb = nb.predict(matrix_X[-200:])

In [47]:
from sklearn.metrics import accuracy_score

In [48]:
acc_knn = accuracy_score(rating_knn, rating[-200:])
acc_dt = accuracy_score(rating_dt, rating[-200:])
acc_nb = accuracy_score(rating_nb, rating[-200:])

In [49]:
print('KNN Accuracy: ', acc_knn)
print('DT Accuracy: ', acc_dt)
print('NB Accuracy: ', acc_nb)

KNN Accuracy:  0.455
DT Accuracy:  0.625
NB Accuracy:  0.665


In [50]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [51]:
pc_knn = precision_score(rating_knn, rating[-200:], average = 'macro')
pc_dt = precision_score(rating_dt, rating[-200:], average = 'macro')
pc_nb = precision_score(rating_nb, rating[-200:], average = 'macro')

rc_knn = recall_score(rating_knn, rating[-200:], average = 'macro')
rc_dt = recall_score(rating_dt, rating[-200:], average = 'macro')
rc_nb = recall_score(rating_nb, rating[-200:], average = 'macro')

f1_knn = f1_score(rating_knn, rating[-200:], average = 'macro')
f1_dt = f1_score(rating_dt, rating[-200:], average = 'macro')
f1_nb = f1_score(rating_nb, rating[-200:], average = 'macro')

  _warn_prf(average, modifier, msg_start, len(result))


In [52]:
print('KNN Precision: ', pc_knn)
print('DT Precision: ', pc_dt)
print('NB Precision: ', pc_nb)

print('KNN Recall: ', rc_knn)
print('DT Recall: ', rc_dt)
print('NB Recall: ', rc_nb)

print('KNN F1-score: ', f1_knn)
print('DT F1-score: ', f1_dt)
print('NB F1-score: ', f1_nb)

KNN Precision:  0.17255926074728806
DT Precision:  0.2871034456369933
NB Precision:  0.2516271595018079
KNN Recall:  0.19479166666666664
DT Recall:  0.4671276918410676
NB Recall:  0.22062717770034843
KNN F1-score:  0.18202554202554203
DT F1-score:  0.28892156862745094
NB F1-score:  0.2339447411191084


In [53]:
from sklearn.metrics import confusion_matrix

In [55]:
confusion_matrix(rating_knn, rating[-200:])

array([[ 0,  0,  0,  3, 27],
       [ 0,  0,  0,  1,  1],
       [ 5,  0,  0,  2,  6],
       [ 1,  0,  2,  9, 15],
       [ 8,  5, 10, 23, 82]])

In [56]:
confusion_matrix(rating_dt, rating[-200:])

array([[  1,   0,   0,   0,   0],
       [  2,   1,   0,   0,   4],
       [  1,   0,   1,   3,   0],
       [  4,   3,   2,   8,  13],
       [  6,   1,   9,  27, 114]])

In [57]:
confusion_matrix(rating_nb, rating[-200:])

array([[  0,   0,   0,   0,   1],
       [  0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0],
       [  5,   2,   5,  13,  10],
       [  9,   3,   7,  25, 120]])