In [1]:
import os
import codecs
import numpy as np
import re
import collections
import torch

In [2]:
# Display the absolute path of the current working directory (where the notebook runs).
os.path.abspath("")

'/home/topkek/python_directory/DS_internship-master'

In [3]:
# Root folder of the dataset. The original absolute location stays the default;
# set the ACLIMDB_DIR environment variable to point the notebook elsewhere.
data_path = os.environ.get(
    'ACLIMDB_DIR',
    '/home/topkek/python_directory/DS_internship-master/aclImdb',
)


def _load_split(root, split):
    """Read every ``.txt`` review of one dataset split.

    Parameters
    ----------
    root : str
        Directory that contains the split folders.
    split : str
        Name of the split folder to read (``'train'`` or ``'test'``); it must
        contain a ``pos`` and a ``neg`` sub-folder.

    Returns
    -------
    (list of str, list of int)
        Review texts and their labels (1 for files under ``pos``, 0 for files
        under ``neg``). ``pos`` files come first; within a category the files
        are visited in sorted file-name order.

    Raises
    ------
    OSError
        If a category folder or a review file cannot be read.
    """
    review_texts = []
    review_labels = []
    for category in ['pos', 'neg']:
        category_dir = os.path.join(root, split, category)
        for fname in sorted(os.listdir(category_dir)):
            if not fname.endswith('.txt'):
                continue
            # 'utf_8_sig' strips a leading byte-order mark when one is present.
            with codecs.open(os.path.join(category_dir, fname), 'r', 'utf_8_sig') as f:
                review_texts.append(f.read())
            review_labels.append(0 if category == 'neg' else 1)
    return review_texts, review_labels


train_texts, train_labels = _load_split(data_path, 'train')
test_texts, test_labels = _load_split(data_path, 'test')

In [4]:
# Clean-up rules as (regex source, replacement) pairs. They are applied one
# after another in exactly this order.
_rules = [
    (r'\"', ''),             # double quotes are dropped
    (r'<br /><br />', ' '),  # doubled HTML line break becomes a space
    (r'\;', ' '),
    (r'\:', ' '),
    (r'\s+', ' '),           # any whitespace run collapses to one space
    (r'\(', ''),             # parentheses are dropped
    (r'\)', ''),
]

_patterns = [rule[0] for rule in _rules]
_replacements = [rule[1] for rule in _rules]

# Despite the name this is a list of (compiled regex, replacement) tuples.
_patterns_dict = [(re.compile(source), substitute) for source, substitute in _rules]


def normalize(line):
    """Lower-case ``line`` and run every clean-up rule over it, in order.

    Parameters
    ----------
    line : str
        Text to clean.

    Returns
    -------
    str
        The lower-cased text after all substitutions.
    """
    cleaned = line.lower()
    for regex, substitute in _patterns_dict:
        cleaned = regex.sub(substitute, cleaned)
    return cleaned

# A token is a run of lower-case latin letters, a number with a '.' or ','
# between digit groups, or a plain run of digits.
TOKEN_RE = re.compile(r'[a-z]+|\d+[.,]\d+|\d+')


def tokenize(txt, min_token_size = 3):
    """Split a text into tokens, discarding the short ones.

    The text is passed through ``normalize`` first, then scanned with
    ``TOKEN_RE``.

    Parameters
    ----------
    txt : str
        Raw text.
    min_token_size : int, optional
        Tokens with fewer characters than this are dropped (default 3).

    Returns
    -------
    list of str
        The surviving tokens, in order of appearance.
    """
    candidates = TOKEN_RE.findall(normalize(txt))
    long_enough = []
    for candidate in candidates:
        if len(candidate) < min_token_size:
            continue
        long_enough.append(candidate)
    return long_enough

def tokenize_corpus(texts, tokenizer=tokenize, **tokenizer_kwargs):
    """Run ``tokenizer`` over every document of a corpus.

    Parameters
    ----------
    texts : iterable of str
        Documents to tokenize.
    tokenizer : callable, optional
        Called as ``tokenizer(document, **tokenizer_kwargs)``; defaults to
        ``tokenize``.
    **tokenizer_kwargs
        Extra keyword arguments forwarded unchanged to ``tokenizer``.

    Returns
    -------
    list
        One tokenizer result per document, in input order.
    """
    tokenized = []
    for document in texts:
        tokenized.append(tokenizer(document, **tokenizer_kwargs))
    return tokenized

In [5]:
# Pool both splits into one corpus: training items first, test items after.
# Fresh lists are built, so the per-split lists stay untouched.
texts = list(train_texts)
texts.extend(test_texts)

labels = list(train_labels)
labels.extend(test_labels)

In [6]:
from string import punctuation

# Bundles normalisation and tokenisation of a whole corpus into one call.
def preprocess(text):
    """Tokenise a corpus and also return all of its tokens as one flat list.

    Parameters
    ----------
    text : iterable of str
        Raw documents (despite the singular name, a collection is expected).

    Returns
    -------
    all_reviews : list of list of str
        Tokens of each document, in input order.
    all_words : list of str
        Every token of the corpus: the per-document token lists concatenated
        in input order.
    """
    # ``tokenize`` (the default tokenizer of ``tokenize_corpus``) normalises
    # its input itself, so no separate normalisation pass is needed here.
    all_reviews = tokenize_corpus(text, min_token_size = 3)

    # Tokens never contain whitespace, so flattening the per-document tokens
    # gives the same words as joining all documents with spaces and
    # tokenising the result again, without building one corpus-sized string.
    all_words = [token for review in all_reviews for token in review]

    return all_reviews, all_words


# Tokenise the training reviews only; the test reviews are not passed in here.
all_reviews, all_words = preprocess(train_texts)

-----------
# FastText

In [12]:
import pyfasttext
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [101]:
# Pair every pooled review with its label.
df = pd.DataFrame(list(zip(texts, labels)), columns=['texts', 'labels'])

# Turn each row into one string "__label__<label> <review text>".
# From here on `df` is a Series of such strings, not a DataFrame.
# NOTE(review): assumes no review contains a line break, otherwise one review
# would spill over several lines of the exported file - confirm.
label_tags = '__label__' + df['labels'].apply(str)
df = label_tags + ' ' + df['texts']

# 95 % of the lines go to `train`, the rest to `test`; the seed is fixed so
# the split is reproducible.
train, test = train_test_split(df, random_state=0, train_size=0.95)

In [102]:
# Write one labelled review per line. fastText expects UTF-8 input, so the
# encoding is pinned instead of being left to numpy's default.
np.savetxt('train.txt', train, delimiter = ' ', fmt = '%s', encoding = 'utf-8')

## Training

In [107]:
# Fit a supervised fastText classifier on the exported training file.
# NOTE(review): output='model' presumably produces the 'model.bin' file that
# is loaded further down - confirm against the pyfasttext documentation.
model = pyfasttext.FastText()
model.supervised(
    input='train.txt',
    output='model',
    label='__label__',
    wordNgrams=2,
    epoch=50,
)

In [108]:
# Layout of one labelled line: "__label__<digit> <review text>".
_LABEL_POS = len('__label__')   # index 9: the single-digit class label
_TEXT_START = _LABEL_POS + 2    # index 11: skips the label digit and the space


def _error_rate(classifier, labelled_lines):
    """Share of lines whose predicted label differs from the stored label.

    Parameters
    ----------
    classifier
        Object whose ``predict`` takes a collection of texts and returns, for
        each text, a sequence whose first item is the predicted label.
    labelled_lines : pandas.Series of str
        Lines in the "__label__<digit> <review text>" layout.

    Returns
    -------
    float
        Fraction of misclassified lines (0.0 means every line was right).
    """
    # Positional index so gold labels line up with the freshly built
    # prediction Series, whatever index the split left on the lines.
    gold = labelled_lines.str[_LABEL_POS].astype(int).reset_index(drop=True)
    raw_texts = labelled_lines.str[_TEXT_START:]
    predicted = pd.Series(classifier.predict(raw_texts)).apply(lambda found: int(found[0]))
    return (predicted != gold).mean()


# NOTE: despite their names these two variables hold ERROR rates; the names
# are kept so existing references keep working. The accuracy is 1 - value.
test_accuracy = _error_rate(model, test)
train_accuracy = _error_rate(model, train)

print('train accuracy:', 1- train_accuracy)
print('test accuracy:', 1- test_accuracy)

train accuracy: 0.9999789473684211
test accuracy: 0.9164


---
model.supervised(input='train.txt', output='model', epoch=50, wordNgrams = 2, label = '__label__')    
train accuracy: 0.9999789473684211
test accuracy: 0.9164

---





In [115]:
# Spot check on a two-word input with one positive and one negative word.
# predict takes a list of texts; the recorded output shows one list of label
# strings per input text.
model.predict(['good terrible'])

[['0']]

# Loading model

In [8]:
from pyfasttext import FastText

In [9]:
# Load a saved classifier from disk and rebind `model` to it.
# NOTE(review): 'model.bin' is presumably the file written by the earlier
# model.supervised(..., output='model') call - confirm.
model = FastText('model.bin')

In [14]:
# Take the first label predicted for the only input text and convert it to int.
# NOTE(review): 'bas' looks like a typo for 'bad' - left untouched because
# changing the input could change the prediction; confirm.
int(model.predict(['this movie is bas'])[0][0])

0

In [15]:
import pyfasttext

In [16]:
# Show the installed pyfasttext version so the run can be reproduced.
pyfasttext.__version__

'0.4.6'