In [66]:
import os
import re

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Load data

In [8]:
# Read each split into a list of reviews, one stripped line per review.
# The `with` blocks guarantee the file handles are closed once reading is done
# (the bare `for line in open(...)` form leaves them open until garbage collection).
# NOTE(review): the file names say "imbd" -- presumably a typo for "imdb"; they are
# kept as-is because they must match the files on disk.
with open(os.getcwd() + '/data/imbd_train.txt', 'r', encoding='latin1') as train_file:
    train = [line.strip() for line in train_file]

with open(os.getcwd() + '/data/imbd_test.txt', 'r', encoding='latin1') as test_file:
    test = [line.strip() for line in test_file]

# Feature engineering

REMOVING NON-TEXT CHARACTERS

In [16]:
# Raw strings keep the regex escapes (\d, \s, \[) away from Python's own string
# escape processing, which warns about unknown escapes such as "\d" or "\.".
# Punctuation marks and runs of digits are deleted outright.
REPLACE_NO_SPACE = re.compile(r'[.;:!?,"()\[\]]|\d+')
# Doubled HTML line breaks, hyphens and slashes become a single space so the
# words on either side are not glued together.
REPLACE_WITH_SPACE = re.compile(r'<br\s*/><br\s*/>|[-/]')


def preprocess_reviews(reviews):
    """Lower-case each review and strip punctuation, digits and markup.

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts.

    Returns
    -------
    list of str
        One cleaned string per input review, in the same order. Characters
        matched by ``REPLACE_NO_SPACE`` are removed; matches of
        ``REPLACE_WITH_SPACE`` are replaced by a single space. Whitespace is
        not otherwise normalised, so leading/trailing or doubled spaces can
        remain.
    """
    # Deletion runs first, then the space substitution, on the lower-cased text.
    return [
        REPLACE_WITH_SPACE.sub(' ', REPLACE_NO_SPACE.sub('', line.lower()))
        for line in reviews
    ]

In [17]:
# Run the same cleaning step over both splits.
train_clean, test_clean = (preprocess_reviews(split) for split in (train, test))

In [23]:
# Side-by-side preview: the first 99 characters of the first three reviews,
# before and after cleaning.
example1 = pd.DataFrame(
    {
        'Before': [train[row][:99] for row in range(3)],
        'After': [train_clean[row][:99] for row in range(3)],
    },
    index=[0, 1, 2],
)
example1

Unnamed: 0,Before,After
0,Bromwell High is a cartoon comedy. It ran at t...,bromwell high is a cartoon comedy it ran at th...
1,Homelessness (or Houselessness as George Carli...,homelessness or houselessness as george carlin...
2,Brilliant over-acting by Lesley Ann Warren. Be...,brilliant over acting by lesley ann warren bes...


REMOVING STOP WORDS

In [26]:
# Fetch the NLTK 'stopwords' corpus; it must be available before the corpus is read below.
nltk.download('stopwords')

# English stop words from NLTK, used both by remove_stop_words and by the
# vectorizer search space further down.
stop_words = nltk.corpus.stopwords.words('english')
# Peek at the first ten entries as a sanity check.
stop_words[:10]

[nltk_data] Downloading package stopwords to /home/n0t10n/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [27]:
def remove_stop_words(corpus, stopwords=None):
    """Drop stop words from every review in ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Review texts; each one is split on whitespace.
    stopwords : iterable of str, optional
        Words to remove. When omitted, the notebook-level ``stop_words``
        list is used, which matches the original single-argument behaviour.

    Returns
    -------
    list of str
        One string per review with the remaining words joined by single
        spaces. Matching is exact (case-sensitive).
    """
    if stopwords is None:
        stopwords = stop_words

    # Build the set once: membership tests against a list are linear in the
    # number of stop words and would be repeated for every word of every review.
    stopword_set = set(stopwords)

    return [
        ' '.join(word for word in review.split() if word not in stopword_set)
        for review in corpus
    ]

In [28]:
# Strip stop words from both cleaned splits.
stw_train, stw_test = map(remove_stop_words, (train_clean, test_clean))

TARGET

In [65]:
# Labels for 25000 reviews: 1 for the first 12500 rows, 0 for the remaining 12500.
# NOTE(review): presumably the data files list positive reviews first -- confirm.
target = [1] * 12500 + [0] * 12500



12500

# Model (Random search)

In [None]:



def train_model(X_TRAIN, X_TEST):
    """Grid-search a logistic regression over ``C`` and print its test accuracy.

    Relies on ``LogisticRegression``, ``GridSearchCV`` and ``accuracy_score``
    being imported in the notebook's import cell, and on the notebook-level
    ``target`` label list.

    Parameters
    ----------
    X_TRAIN : array-like or sparse matrix
        Training features, fitted against ``target``.
    X_TEST : array-like or sparse matrix
        Test features, scored against the same ``target`` list.
        NOTE(review): this assumes the test split has the same size and label
        order as the training split -- confirm.

    Returns
    -------
    None
        The accuracy of the best estimator is printed, not returned.
    """
    lr = LogisticRegression()

    # Candidate inverse-regularisation strengths.
    params = {
        'C': [0.01, 0.05, 0.25, 0.5, 1]
    }

    # 5-fold cross-validated exhaustive search over `params`.
    grid = GridSearchCV(lr, params, cv=5)
    grid.fit(X_TRAIN, target)

    print("Final Accuracy: %s" % accuracy_score(target, grid.best_estimator_.predict(X_TEST)))

In [38]:
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [None]:
# Each candidate model is a two-step pipeline: bag-of-words features followed
# by a linear classifier.
logreg = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    # liblinear handles both the 'l1' and 'l2' penalties searched below;
    # the default lbfgs solver rejects 'l1'.
    ('logreg', LogisticRegression(solver='liblinear'))
])

# The keyword is `steps`; `step=` raises a TypeError.
svm = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    # dual=False is needed for penalty='l1' (with the default squared hinge
    # loss) and is also valid for 'l2'.
    ('svm', LinearSVC(dual=False))
])

# Every value in a search space must be a list of candidates. A bare tuple or
# word list would be sampled element by element (e.g. ngram_range=1 or
# stop_words='i'), so the fixed settings are wrapped in one-element lists.
logreg_param = {
    'vectorizer__binary': [True],
    'vectorizer__ngram_range': [(1, 3)],
    'vectorizer__stop_words': [stop_words],
    'logreg__C': [0.01, 0.05, 0.25, 0.5, 1],
    'logreg__penalty': ['l1', 'l2']
}

svm_param = {
    'vectorizer__binary': [True],
    'vectorizer__ngram_range': [(1, 3)],
    'vectorizer__stop_words': [stop_words],
    # Keys are '<step name>__<parameter>'; the step is called 'svm'.
    'svm__C': [0.01, 0.05, 0.25, 0.5, 1],
    'svm__penalty': ['l1', 'l2']
}

# Both spaces contain 5 (C) x 2 (penalty) = 10 combinations, so 10 iterations
# cover them completely; asking for more only triggers a warning.
# random_state makes the sampling order reproducible.
logreg_search = RandomizedSearchCV(
    logreg,
    logreg_param,
    n_iter = 10,
    scoring = 'accuracy',
    n_jobs = -1,
    cv = 5,
    random_state = 42
)

svm_search = RandomizedSearchCV(
    svm,
    svm_param,
    n_iter = 10,
    scoring = 'accuracy',
    n_jobs = -1,
    cv = 5,
    random_state = 42
)

# Searches keyed by a short name so they can be fitted in a loop.
grids = {
    'rs_log_reg': logreg_search,
    'rs_svm': svm_search
}