In [1]:
import nltk

In [2]:
nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [3]:
import os
examples = []
for sentiment in ("neg", "pos"):
    for filename in os.listdir(os.path.join(nltk.corpus.movie_reviews.root.path, sentiment)):
        with open(os.path.join(nltk.corpus.movie_reviews.root.path, sentiment, filename), "r", encoding="utf-8") as file:
            text = file.read().strip()
            examples.append({"text": text, "sentiment": int(sentiment=="pos")})

In [4]:
import pandas as pd
examples_df = pd.DataFrame(examples)
examples_df

Unnamed: 0,text,sentiment
0,"plot : two teen couples go to a church party ,...",0
1,the happy bastard's quick movie review \ndamn ...,0
2,it is movies like these that make a jaded movi...,0
3,""" quest for camelot "" is warner bros . ' first...",0
4,synopsis : a mentally unstable man undergoing ...,0
...,...,...
1995,wow ! what a movie . \nit's everything a movie...,1
1996,"richard gere can be a commanding actor , but h...",1
1997,"glory--starring matthew broderick , denzel was...",1
1998,steven spielberg's second epic film on world w...,1


In [5]:
examples_df = examples_df.sample(frac=1)
train_df = examples_df.sample(frac=0.7)
test_df = examples_df.drop(index=train_df.index) 
train_texts, train_labels = train_df.text.values, train_df.sentiment.values
test_texts, test_labels = test_df.text.values, test_df.sentiment.values

**train data**

In [6]:
train_df

Unnamed: 0,text,sentiment
895,2 days in the valley is more or less a pulp fi...,0
1505,"some critics , including siskel & ebert , are ...",1
1614,mickey mouse had better watch his back -- ther...,1
1612,"in the wake of the smashing success of "" rumbl...",1
1984,"now that "" boogie nights "" has made disco resp...",1
...,...,...
807,"and i thought "" stigmata "" would be the worst ...",0
439,you know the plot : a dimwit with a shady past...,0
962,humanities quest for knowledge never ends . \n...,0
1066,ingredients : starving artist lusting after a ...,1


**test data**

In [7]:
test_df

Unnamed: 0,text,sentiment
721,tim burton has now completed his evolution fro...,0
663,what hath kevin williamson wrought ? \nwhile t...,0
757,"one night , during a torrential downpour that ...",0
1509,as i write the review for the new hanks/ryan r...,1
506,this film is extraordinarily horrendous and i'...,0
...,...,...
626,""" tina ! ! ! fetch me the axe ! ! ! "" \na favo...",0
1141,""" very bad things , "" is the most delightfully...",1
1418,capsule : the world will come to an end at mid...,1
947,what were they thinking ? \nnostalgia for the ...,0


In [8]:
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

**corpora**

In [9]:
corpus = pd.DataFrame(examples_df.text.values)
corpus

Unnamed: 0,0
0,capsule : lesbianism examined in the same hush...
1,warning ! : may contain some mild spoilers and...
2,tim burton has now completed his evolution fro...
3,what hath kevin williamson wrought ? \nwhile t...
4,"one night , during a torrential downpour that ..."
...,...
1995,what were they thinking ? \nnostalgia for the ...
1996,"`the skulls' is a laughably bad thriller , a t..."
1997,a couple of criminals ( mario van peebles and ...
1998,the rich man's wife is one of those movies lik...


In [10]:
corpus_list = list(corpus[0])

**dictionary** + preprocessing

In [11]:
# Tokenize the docs
tokenized_list = [simple_preprocess(doc) for doc in corpus_list]

# Create the Corpus
dictionary = corpora.Dictionary()

#allow_update=True - add new words to dictionary
bow_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in tokenized_list]
###print(bow_corpus)

len(dictionary.token2id), dictionary.token2id

(38864,
 {'about': 0,
  'absorbing': 1,
  'adult': 2,
  'after': 3,
  'aids': 4,
  'all': 5,
  'almost': 6,
  'also': 7,
  'always': 8,
  'amazing': 9,
  'an': 10,
  'and': 11,
  'another': 12,
  'any': 13,
  'anyone': 14,
  'are': 15,
  'artistically': 16,
  'as': 17,
  'at': 18,
  'authority': 19,
  'be': 20,
  'beat': 21,
  'because': 22,
  'been': 23,
  'behavior': 24,
  'being': 25,
  'benefit': 26,
  'better': 27,
  'bloom': 28,
  'boring': 29,
  'both': 30,
  'brashness': 31,
  'break': 32,
  'but': 33,
  'by': 34,
  'cabin': 35,
  'calls': 36,
  'camp': 37,
  'can': 38,
  'capsule': 39,
  'chaired': 40,
  'changed': 41,
  'claire': 42,
  'clerk': 43,
  'come': 44,
  'comedy': 45,
  'complete': 46,
  'course': 47,
  'creative': 48,
  'dead': 49,
  'deals': 50,
  'degenerates': 51,
  'delivered': 52,
  'desert': 53,
  'didn': 54,
  'dmv': 55,
  'do': 56,
  'docudrama': 57,
  'earlier': 58,
  'earnest': 59,
  'ends': 60,
  'energy': 61,
  'engaging': 62,
  'entire': 63,
  'even': 

### Building X matrix based on *Bag Of Words*:

In [12]:
import pandas as pd
import numpy as np

In [13]:
n = len(dictionary.token2id)

matrix = []
for j, doc in enumerate(corpus_list):  
    x = np.zeros(n)
    for i, nu in bow_corpus[j]:  
        x[i] = nu
    matrix.append(x)

In [14]:
matrix = pd.DataFrame(matrix)
matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38854,38855,38856,38857,38858,38859,38860,38861,38862,38863
0,6.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,0.0,0.0,0.0,0.0,2.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1996,2.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1998,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
df_bow = matrix.copy()

### Building X matrix based on *TF-IDF*:

In [16]:
from gensim import models
import numpy as np


# Create the Dictionary and Corpus
mydict = corpora.Dictionary([simple_preprocess(line) for line in corpus_list])
corpus = [mydict.doc2bow(simple_preprocess(line)) for line in corpus_list]


# Create the TF-IDF model
tfidf = models.TfidfModel(corpus, smartirs='ntc')

**X matrix**

In [17]:
n = len(mydict.token2id)

matrix = []
for j, doc in enumerate(corpus_list):  # для каждого документа определяем его порядковый номер
    x = np.zeros(n)
    for i, nu in tfidf[corpus][j]:  #  для каждого слова из документа есть его номер в словаре и к-т tf-idf
        x[i] = nu
    matrix.append(x)

In [18]:
n

38864

In [19]:
matrix = pd.DataFrame(matrix)
matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38854,38855,38856,38857,38858,38859,38860,38861,38862,38863
0,0.023527,0.0602,0.038529,0.007827,0.059444,0.008294,0.015247,0.007792,0.018826,0.032721,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.009033,0.0000,0.000000,0.000000,0.000000,0.010615,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.006488,0.0000,0.000000,0.000000,0.000000,0.004575,0.000000,0.000000,0.000000,0.027071,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.006800,0.0000,0.000000,0.000000,0.000000,0.004795,0.000000,0.027028,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.0000,0.000000,0.018197,0.000000,0.006427,0.000000,0.000000,0.021883,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.013670,0.0000,0.000000,0.000000,0.000000,0.004819,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1996,0.005301,0.0000,0.000000,0.005291,0.000000,0.005606,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1997,0.008368,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1998,0.004894,0.0000,0.000000,0.009769,0.000000,0.000000,0.019028,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [20]:
df_tfidf = matrix.copy()

preparing data for learning

In [21]:
df_bow['y'] = examples_df.sentiment

In [22]:
df_tfidf['y'] = examples_df.sentiment

In [23]:
df_bow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38855,38856,38857,38858,38859,38860,38861,38862,38863,y
0,6.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,3.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,2.0,0.0,0.0,0.0,0.0,2.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1996,2.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1997,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1998,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [24]:
df_tfidf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38855,38856,38857,38858,38859,38860,38861,38862,38863,y
0,0.023527,0.0602,0.038529,0.007827,0.059444,0.008294,0.015247,0.007792,0.018826,0.032721,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
1,0.009033,0.0000,0.000000,0.000000,0.000000,0.010615,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
2,0.006488,0.0000,0.000000,0.000000,0.000000,0.004575,0.000000,0.000000,0.000000,0.027071,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
3,0.006800,0.0000,0.000000,0.000000,0.000000,0.004795,0.000000,0.027028,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
4,0.000000,0.0000,0.000000,0.018197,0.000000,0.006427,0.000000,0.000000,0.021883,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.013670,0.0000,0.000000,0.000000,0.000000,0.004819,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1
1996,0.005301,0.0000,0.000000,0.005291,0.000000,0.005606,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1
1997,0.008368,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1
1998,0.004894,0.0000,0.000000,0.009769,0.000000,0.000000,0.019028,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1


### Logistic Regression

In [25]:
# for Bag Of Words
X = df_bow.drop(['y'], axis = 1)
y = df_bow['y']

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=20)

In [28]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(
    solver = 'liblinear',
    penalty = 'l2',
    multi_class = 'ovr'
    )

In [29]:
model.fit(X,y)

In [30]:
preds = model.predict(X)
#pd.DataFrame(preds)

In [31]:
from sklearn import metrics
conf_matrix = metrics.confusion_matrix(y, preds)
pd.DataFrame(conf_matrix)

Unnamed: 0,0,1
0,998,2
1,1,999


In [32]:
accuracy = sum(preds == y) / len(y)
accuracy

0.9985

In [33]:
# for tfidf
X = df_tfidf.drop(['y'], axis = 1)
y = df_tfidf['y']

In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=20)

In [36]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(
    solver = 'liblinear',
    penalty = 'l2',
    multi_class = 'ovr'
    )

In [37]:
model.fit(X,y)

In [38]:
preds = model.predict(X)
#pd.DataFrame(preds)

In [41]:
from sklearn import metrics
conf_matrix = metrics.confusion_matrix(y, preds)
pd.DataFrame(conf_matrix)

Unnamed: 0,0,1
0,966,34
1,17,983


In [40]:
accuracy = sum(preds == y) / len(y)
accuracy

0.9745