In [None]:
import re
import tarfile
import urllib
import urllib.request
from io import BytesIO

import pandas as pd

In [None]:
# Download the IMDB review archive and hold the whole body in memory as a
# file-like object for tarfile. The response is used as a context manager so
# the HTTP connection is closed as soon as the body has been read.
with urllib.request.urlopen("https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz") as response:
    data_zipped = BytesIO(response.read())

In [None]:
# Substring of the archive path -> value stored in the output record.
# Order matters: the first marker found in the path wins.
_DATASET_MARKERS = (("/train/", "Train"), ("/test/", "Test"))
_LABEL_MARKERS = (("/pos/", "Positive"), ("/neg/", "Negative"))

# Review files are named "<id>_<rating>.txt".
_REVIEW_NAME = re.compile(r"/(?P<id>\d+)_(?P<rating>[0-9]{1,2})\.txt")

# Incorrectly encoded characters (C1 control code points) mapped to plain
# ASCII replacements; applied in a single pass with str.translate.
_CHAR_FIXES = str.maketrans({
    "\x84": "\"",
    "\x91": "'",
    "\x95": " ",
    "\x96": "-",
    "\x97": "-",
})


def _first_match(path, markers):
    """Return the value paired with the first marker contained in `path`, or None."""
    for marker, value in markers:
        if marker in path:
            return value
    return None


def _describe_member(path):
    """Derive record metadata from an archive member path.

    Returns a dict with the keys "id", "dataset", "label" and "rating"
    (all strings, in that order), or None when the path is not a labelled
    train/test review file.
    """
    dataset = _first_match(path, _DATASET_MARKERS)
    label = _first_match(path, _LABEL_MARKERS)
    match = _REVIEW_NAME.search(path)
    if dataset is None or label is None or match is None:
        return None
    return {
        "id": match.group("id"),
        "dataset": dataset,
        "label": label,
        "rating": match.group("rating"),
    }


def _clean_review(text):
    """Replace HTML breaks and mis-encoded characters, then collapse whitespace."""
    text = text.replace("<br />", " ").translate(_CHAR_FIXES)
    return " ".join(text.split())


# Walk the archive once and collect one record per labelled review.
data = []
with tarfile.open(fileobj=data_zipped, mode="r:gz") as tar:
    for member in tar.getmembers():
        record = _describe_member(member.path)
        if record is None:
            continue

        # extractfile() yields None for members that are not regular files.
        extracted = tar.extractfile(member)
        if extracted is None:
            continue

        # Close the per-member reader as soon as its bytes have been decoded.
        with extracted:
            record["content"] = _clean_review(extracted.read().decode("utf8"))

        data.append(record)

In [None]:
# Pin the column order explicitly; this also keeps the columns present when
# `data` is empty, so downstream code still sees the expected schema.
df = pd.DataFrame(data, columns=["id", "dataset", "label", "rating", "content"])

In [None]:
# Bare last expression: the notebook renders the DataFrame as this cell's output.
df

In [None]:
# Write the cleaned reviews to a CSV in the current working directory,
# leaving out the positional row index.
df.to_csv(path_or_buf="imdb_cleansed.csv", index=False)