# Import

In [1]:
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz

--2023-04-04 19:41:33--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2023-04-04 19:41:38 (18.6 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [2]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
print(os.listdir("./aclImdb"))
print(os.listdir("./aclImdb/train"))
print(os.listdir("./aclImdb/test"))

['imdb.vocab', 'train', 'imdbEr.txt', 'test', 'README']
['unsup', 'urls_neg.txt', 'urls_pos.txt', 'labeledBow.feat', 'urls_unsup.txt', 'pos', 'unsupBow.feat', 'neg']
['urls_neg.txt', 'urls_pos.txt', 'labeledBow.feat', 'pos', 'neg']


In [3]:
def get_text_data(path, label = None):
    
    X = []
    for review in os.listdir(path):
        with open(os.path.join(path,review)) as f:
            rev = f.read()
            
        X.append(rev)
        
    y = [label]*len(X)
        
    return X, y

In [4]:
X,y = get_text_data("./aclImdb/train/pos", label = "pos")
X_neg, y_neg = get_text_data("./aclImdb/train/neg", label = "neg")

X += X_neg
y += y_neg

# Разделение данных
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.33, random_state = 42)

In [5]:
vocab = {}

for word in X[0].split():
    if word in vocab.keys():
        vocab[word] += 1
    else:
        vocab[word] = 0

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

vectorizer = CountVectorizer(stop_words = "english")
X_train_vectorizer = vectorizer.fit_transform(X_train)
X_train_vectorizer.shape


(16750, 63420)

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transf = TfidfTransformer()

X_train_tfidf = tfidf_transf.fit_transform(X_train_vectorizer)
X_train_tfidf.shape

(16750, 63420)

In [8]:
from sklearn.svm import LinearSVC

classifier = LinearSVC()
classifier.fit(X_train_tfidf,y_train)

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

text_classifier = Pipeline([
    ('tfidvectorizer',TfidfVectorizer()),
    ('LinearSVC',LinearSVC())
])

text_classifier.fit(X_train,y_train)

In [10]:
y_pred = text_classifier.predict(X_test) 
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.8883636363636364
