In [1]:
import io
import os
import pathlib
import requests
import tarfile

import numpy as np
import pandas as pd
from sklearn import feature_extraction, metrics, model_selection, pipeline, preprocessing
import xgboost as xgb


In [2]:
DATA_ARCHIVE = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
DATA_DIR = pathlib.Path("./sample_data")

# Marker file written only after a complete extraction, so an interrupted
# download is retried on the next run while a finished one is not repeated.
_EXTRACTED_MARKER = DATA_DIR / ".aclImdb_extracted"

if not _EXTRACTED_MARKER.exists():
    # timeout=(connect, read): a stalled connection raises instead of hanging the kernel.
    with requests.get(DATA_ARCHIVE, stream=True, timeout=(10, 60)) as response:
        # Fail loudly on HTTP errors rather than handing an error page to tarfile.
        response.raise_for_status()
        # "r|gz" reads the archive as a forward-only stream straight from the socket.
        with tarfile.open(fileobj=response.raw, mode="r|gz") as t:
            t.extractall(DATA_DIR)
    _EXTRACTED_MARKER.touch()


In [3]:
def _reviews_to_df(filepath, sentiment):
    """Read every ``*.txt`` review in a directory into a labelled DataFrame.

    Parameters
    ----------
    filepath : pathlib.Path
        Directory whose ``*.txt`` files each hold one review.
    sentiment : str
        ``"pos"`` labels every row ``"positive"``; any other value labels
        every row ``"negative"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``sentiment``, one row per file, ordered by
        sorted file path. Empty (but with both columns) when the directory
        holds no ``*.txt`` files.
    """
    label = "positive" if sentiment == "pos" else "negative"
    review_filepaths = sorted(filepath.glob("*.txt"))
    # Decode explicitly as UTF-8: relying on the platform default encoding
    # mis-decodes or fails on non-ASCII reviews under non-UTF-8 locales.
    texts = [path.read_text(encoding="utf-8") for path in review_filepaths]
    return pd.DataFrame({"text": texts, "sentiment": [label] * len(texts)})


def _combine_reviews():
    """Load the labelled reviews of both dataset splits into a single DataFrame.

    Directories are visited in the order train/pos, train/neg, test/pos,
    test/neg under ``DATA_DIR / "aclImdb"``, and the per-directory frames are
    concatenated with a fresh 0..n-1 index.
    """
    review_dirs = [
        (DATA_DIR / "aclImdb" / split / sentiment, sentiment)
        for split in ("train", "test")
        for sentiment in ("pos", "neg")
    ]
    frames = [_reviews_to_df(path, sentiment) for path, sentiment in review_dirs]
    return pd.concat(frames, ignore_index=True)


def partition_reviews(seed=42, test_size=0.2):
    """Split all reviews into stratified train, validation and test frames.

    Parameters
    ----------
    seed : int
        Seed for the ``numpy.random.RandomState`` that drives both splits.
    test_size : float
        Passed as ``test_size`` to the first split, i.e. the share of reviews
        held out of training; the held-out part is then halved into
        validation and test sets.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, val_df, test_df)``, each stratified on ``sentiment``.
    """
    reviews = _combine_reviews()
    # A single generator instance is shared, so the second split continues
    # from the state left behind by the first one.
    rng = np.random.RandomState(seed)

    train_df, holdout_df = model_selection.train_test_split(
        reviews,
        test_size=test_size,
        stratify=reviews["sentiment"],
        shuffle=True,
        random_state=rng,
    )

    # Divide the held-out reviews equally between validation and test.
    val_df, test_df = model_selection.train_test_split(
        holdout_df,
        test_size=0.5,
        stratify=holdout_df["sentiment"],
        shuffle=True,
        random_state=rng,
    )

    return train_df, val_df, test_df


In [4]:
# Build the three partitions with a fixed seed so the split is reproducible.
# With the default test_size=0.2, 20% of the reviews are held out of training
# and that held-out part is halved into the validation and test sets.
train_df, val_df, test_df = partition_reviews(seed=42)

In [None]:
# Parameter reference for the vectorizer configured below (the interactive
# `TfidfVectorizer?` help lookup was development-only and is not valid Python):
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [5]:
# TF-IDF featuriser whose vocabulary is pruned at both frequency extremes.
vectorizer = feature_extraction.text.TfidfVectorizer(
    dtype=np.float32,
    norm="l2",       # scale each review's row vector to unit Euclidean length
    lowercase=True,
    min_df=0.01,     # ignore words that show up in fewer than 1% of reviews
    max_df=0.99,     # ignore words that show up in more than 99% of reviews
)

# Learn the vocabulary and IDF weights from the training reviews only.
train_corpus = train_df["text"]
_ = vectorizer.fit(train_corpus)

In [6]:
# The vectorizer was already fitted on train_corpus in the previous cell, so
# only transform here; fit_transform would redo the whole vocabulary/IDF fit.
X_train = vectorizer.transform(train_corpus)

In [7]:
# Vectorise the held-out partitions with the already-fitted vectorizer
# (transform only, so nothing is refitted on validation or test text).
val_corpus = val_df["text"]
test_corpus = test_df["text"]

X_val = vectorizer.transform(val_corpus)
X_test = vectorizer.transform(test_corpus)