In [1]:
import io
import os
import pathlib
import requests
import tarfile

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn import feature_extraction, metrics, model_selection, pipeline, preprocessing
import xgboost as xgb


In [2]:
DATA_ARCHIVE = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
DATA_DIR = pathlib.Path("./sample_data")

# Only download when the extracted corpus is missing, so re-running the
# notebook does not fetch the archive again. Delete DATA_DIR / "aclImdb" to
# force a fresh download (e.g. after an interrupted extraction).
if not (DATA_DIR / "aclImdb").is_dir():
    # The context manager releases the HTTP connection even if extraction fails.
    with requests.get(DATA_ARCHIVE, stream=True) as response:
        # Fail loudly on an HTTP error instead of feeding an error page to tarfile.
        response.raise_for_status()
        # "r|gz" is tarfile's stream mode: the gzip data is read sequentially
        # straight from the response body.
        with tarfile.open(fileobj=response.raw, mode="r|gz") as t:
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; consider an extraction filter if the source is untrusted.
            t.extractall(DATA_DIR)


In [3]:
def _reviews_to_df(filepath, sentiment):
    d = {"text": [], "sentiment": []}
    review_filepaths = sorted(filepath.glob("*.txt"))
    for review_filepath in review_filepaths:
        with open(review_filepath, 'r') as f:
            review = f.read()
        d["text"].append(review)
        d["sentiment"].append("positive" if sentiment == "pos" else "negative")
    df = pd.DataFrame.from_dict(d)
    return df


def _combine_reviews():
    """Gather the reviews from every split/sentiment folder into one DataFrame.

    Folders are read in the order train/pos, train/neg, test/pos, test/neg and
    the result is re-indexed from zero.
    """
    corpus_root = DATA_DIR / "aclImdb"
    frames = [
        _reviews_to_df(corpus_root / split / sentiment, sentiment)
        for split in ("train", "test")
        for sentiment in ("pos", "neg")
    ]
    return pd.concat(frames, ignore_index=True)


def partition_reviews(seed=42, test_size=0.2):
    """Split all reviews into stratified train / validation / test frames.

    A ``test_size`` share of the combined reviews is held out and then divided
    evenly between the validation and test sets. Both splits are shuffled,
    stratified on the ``sentiment`` column, and draw from one ``RandomState``
    seeded with ``seed``.

    Returns a ``(train_df, val_df, test_df)`` tuple.
    """
    reviews = _combine_reviews()
    rng = np.random.RandomState(seed)

    train_df, holdout_df = model_selection.train_test_split(
        reviews,
        test_size=test_size,
        stratify=reviews["sentiment"],
        shuffle=True,
        random_state=rng,
    )

    # the hold-out portion is halved: one half validates, the other tests
    val_df, test_df = model_selection.train_test_split(
        holdout_df,
        test_size=0.5,
        stratify=holdout_df["sentiment"],
        shuffle=True,
        random_state=rng,
    )
    return train_df, val_df, test_df


In [4]:
# Build the stratified train / validation / test frames with a fixed seed.
train_df, val_df, test_df = partition_reviews(seed=42)

In [43]:
# Leftover interactive help lookup (IPython trailing-`?` syntax); it has no
# effect on later cells and can be removed before sharing the notebook.
feature_extraction.text.TfidfVectorizer?

In [5]:
# TF-IDF features over lowercased review text.
vectorizer = feature_extraction.text.TfidfVectorizer(
    dtype=np.float32,
    lowercase=True,
    min_df=0.01,  # drop terms that appear in fewer than 1% of reviews
    max_df=0.99,  # drop terms that appear in more than 99% of reviews
    norm="l2",  # scale each row by its L2 norm
)

# Learn the vocabulary and IDF weights from the training reviews only.
train_corpus = train_df["text"]
_ = vectorizer.fit(train_corpus)

In [6]:
# The vectorizer was already fitted on train_corpus in the previous cell, so
# transform only instead of learning the vocabulary and IDF weights again.
X_train = vectorizer.transform(train_corpus)

In [7]:
# Show the TF-IDF matrix; its repr reports shape, dtype and stored elements.
X_train

<40000x1792 sparse matrix of type '<class 'numpy.float32'>'
	with 3995100 stored elements in Compressed Sparse Row format>

In [12]:
vectorizer_hyperparams = dict(
    lowercase=True,
    max_df=0.9,  # drop terms that appear in more than 90% of reviews
    min_df=0.1,  # drop terms that appear in fewer than 10% of reviews
    norm="l2",  # scale each row by its L2 norm
    dtype=np.float32,
)

# Two steps: TF-IDF vectorization, then conversion of the sparse result to a
# dense array (the inverse wraps a dense array back into CSR form).
features_preprocessing = pipeline.make_pipeline(
    feature_extraction.text.TfidfVectorizer(**vectorizer_hyperparams),
    preprocessing.FunctionTransformer(
        func=lambda sparse_matrix: sparse_matrix.toarray(),
        inverse_func=lambda dense_array: sparse.csr_matrix(dense_array),
    ),
)

In [13]:
# Fit the preprocessing pipeline on the training text only, then apply the
# fitted pipeline unchanged to the validation and test text.
X_train = features_preprocessing.fit_transform(train_corpus)

val_corpus = val_df.loc[:, "text"]
X_val = features_preprocessing.transform(val_corpus)

test_corpus = test_df.loc[:, "text"]
X_test = features_preprocessing.transform(test_corpus)

In [14]:
# One row per training review, one column per term kept by the vectorizer.
X_train.shape

(40000, 208)

In [15]:
# Size of the fitted vocabulary; expected to equal X_train's column count.
len(features_preprocessing["tfidfvectorizer"].vocabulary_)

208

In [20]:
# vocabulary_ maps each term to its column index; ordering the terms by that
# index yields the column labels for X_train.
_vocabulary = features_preprocessing["tfidfvectorizer"].vocabulary_
_by_values = sorted(_vocabulary.items(), key=lambda term_and_index: term_and_index[1])
_columns = [term for term, _index in _by_values]

# Spearman correlation between every pair of term features plus the boolean
# "is positive" label, rendered as a diverging heatmap with a sticky index.
(
    pd.DataFrame(X_train, index=train_df.index, columns=_columns)
    .assign(sentiment=train_df["sentiment"].eq("positive"))
    .corr(method="spearman", numeric_only=True)
    .style
    .background_gradient("RdBu", vmin=-1, vmax=1)
    .set_sticky(axis="index")
)


Output hidden; open in https://colab.research.google.com to view.

In [21]:
# Rank the terms by their Spearman correlation with the positive label,
# most negative first.
(
    pd.DataFrame(X_train, index=train_df.index, columns=_columns)
    .assign(sentiment=train_df["sentiment"].eq("positive"))
    .corr(method="spearman", numeric_only=True)
    ["sentiment"]
    .sort_values()
)

bad         -0.291273
no          -0.158781
nothing     -0.146109
even        -0.143171
acting      -0.129765
               ...   
well         0.106275
love         0.131077
best         0.150646
great        0.213601
sentiment    1.000000
Name: sentiment, Length: 209, dtype: float64

In [24]:
# Summary statistics of the "bad" feature, split by sentiment label.
(
    pd.DataFrame(X_train, index=train_df.index, columns=_columns)
    .assign(sentiment=train_df["sentiment"])
    [["sentiment", "bad"]]
    .groupby("sentiment")
    .describe()
)

Unnamed: 0_level_0,bad,bad,bad,bad,bad,bad,bad,bad
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
negative,20000.0,0.050004,0.088824,0.0,0.0,0.0,0.085557,0.824588
positive,20000.0,0.011588,0.038542,0.0,0.0,0.0,0.0,0.82449


In [25]:
# Summary statistics of the "great" feature, split by sentiment label.
(
    pd.DataFrame(X_train, index=train_df.index, columns=_columns)
    .assign(sentiment=train_df["sentiment"])
    [["sentiment", "great"]]
    .groupby("sentiment")
    .describe()
)

Unnamed: 0_level_0,great,great,great,great,great,great,great,great
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
negative,20000.0,0.016458,0.043807,0.0,0.0,0.0,0.0,0.533142
positive,20000.0,0.045872,0.082885,0.0,0.0,0.0,0.078369,0.790528
