# Fetch data

Dataset created by Maas et al. (2011), available [here](https://ai.stanford.edu/~amaas/data/sentiment/).

In [None]:
import glob
import os
import re
import shutil
import tarfile

import pandas as pd
import requests
from tqdm import tqdm

In [None]:
# Create the output folder for the processed CSV files up front;
# exist_ok=True makes re-running this cell harmless.
os.makedirs("data/processed/", exist_ok=True)

In [None]:
# Download
def get_data():
    """Download the aclImdb v1 archive and unpack it into ``data/raw``.

    The archive is streamed to ``aclImdb_v1.tar.gz`` in the current working
    directory (with a progress bar), extracted, and the resulting ``aclImdb``
    folder is renamed to ``data/raw``, replacing any previous copy. The
    downloaded archive is deleted afterwards, including when a step fails.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On connection errors or timeouts.
    """
    url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    filename = "aclImdb_v1.tar.gz"

    try:
        print("Downloading file...")
        # The context manager releases the connection; the timeout keeps a
        # stalled server from blocking the notebook forever.
        with requests.get(url, stream=True, timeout=30) as response:
            # Fail here rather than saving an HTML error page as the archive.
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))

            with open(filename, 'wb') as file, tqdm(
                desc=filename,
                # Without a content-length header the total is unknown.
                total=total_size or None,
                unit='B',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for data in response.iter_content(chunk_size=64 * 1024):
                    bar.update(file.write(data))

        print("Extracting file...")
        # Drop leftovers of an earlier, interrupted extraction.
        if os.path.exists("aclImdb"):
            shutil.rmtree("aclImdb")
        with tarfile.open(filename) as tar:
            tar.extractall(filter='data')

        print("Renaming folder...")
        if os.path.exists("data/raw"):
            shutil.rmtree("data/raw")
        os.makedirs("data", exist_ok=True)
        os.rename("aclImdb", "data/raw")
    finally:
        print("Cleaning up...")
        # The archive may not exist if the download never started.
        if os.path.exists(filename):
            os.remove(filename)

    print("Done! The dataset is now available in the 'data/raw' folder.")

# Run the download only when this code executes as the main module
# (not when it is imported from elsewhere).
if __name__ == "__main__":
    get_data()

In [None]:
# Restructure
# Filename layouts: "<id>_<rating>.txt" for labelled reviews and
# "<id>_0.txt" for the unsupervised ones.
_LABELLED_NAME = re.compile(r'(\d+)_(\d+)\.txt')
_UNSUP_NAME = re.compile(r'(\d+)_0\.txt')

_LABELLED_COLUMNS = ['id', 'text', 'sentiment', 'rating']
_UNSUP_COLUMNS = ['id', 'text']


def _iter_reviews(directory, name_pattern):
    """Yield ``(match, text)`` for each ``*.txt`` file in *directory* whose name matches *name_pattern*."""
    # glob returns paths in arbitrary order; sorting makes the row order
    # (and anything derived from row position) reproducible across runs.
    for file_path in sorted(glob.glob(os.path.join(directory, '*.txt'))):
        match = name_pattern.match(os.path.basename(file_path))
        if match is None:
            continue
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            yield match, f.read().strip()


def process_data(base_dir: str) -> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]":
    """Load the review text files under *base_dir* into three DataFrames.

    Reads ``<base_dir>/{train,test}/{pos,neg}/*.txt`` and
    ``<base_dir>/train/unsup/*.txt``. Files whose names do not follow the
    expected ``<id>_<rating>.txt`` layout are skipped. A missing ``pos`` or
    ``neg`` folder prints a warning; a missing ``unsup`` folder simply
    yields no rows.

    Args:
        base_dir: Folder containing the ``train`` and ``test`` sub-folders.

    Returns:
        ``(train, test, unsup)``. ``train`` and ``test`` have the columns
        ``id``, ``text``, ``sentiment`` (``'positive'``/``'negative'``) and
        ``rating``; ``unsup`` has ``id`` and ``text``. Rows are ordered
        ``pos`` before ``neg`` and by sorted file path within each folder.
        The columns are present even when no file was found.
    """
    labelled = {'train': [], 'test': []}

    for split, rows in labelled.items():
        split_path = os.path.join(base_dir, split)

        for sentiment, sentiment_label in (('pos', 'positive'), ('neg', 'negative')):
            sentiment_path = os.path.join(split_path, sentiment)

            if not os.path.exists(sentiment_path):
                print(f"Warning: {sentiment_path} does not exist")
                continue

            for match, text in _iter_reviews(sentiment_path, _LABELLED_NAME):
                review_id, rating = match.groups()
                rows.append({
                    'id': int(review_id),
                    'text': text,
                    'sentiment': sentiment_label,
                    'rating': int(rating),
                })

    # Unlabelled reviews only exist in the train split.
    unsup_path = os.path.join(base_dir, 'train', 'unsup')
    unsup_data = [
        {'id': int(match.group(1)), 'text': text}
        for match, text in _iter_reviews(unsup_path, _UNSUP_NAME)
    ]

    # Passing the columns explicitly keeps the schema stable for empty input.
    train = pd.DataFrame(labelled['train'], columns=_LABELLED_COLUMNS)
    test = pd.DataFrame(labelled['test'], columns=_LABELLED_COLUMNS)
    unsup = pd.DataFrame(unsup_data, columns=_UNSUP_COLUMNS)

    return train, test, unsup

# Build the three DataFrames from the extracted dataset in data/raw/.
train, test, unsup = process_data("data/raw/")

In [None]:
# Sanity check: report the dimensions of each frame, then preview two rows of each.
print(
    f"Train data shape: {train.shape}",
    f"Test data shape: {test.shape}",
    f"Unsupervised data shape: {unsup.shape}",
    sep="\n",
)

previews = (
    ("\nTrain data:", train),
    ("\nTest data sample:", test),
    ("\nUnsupervised data sample:", unsup),
)
for position, (heading, frame) in enumerate(previews):
    if position:
        # Spacer between consecutive previews (not before the first one).
        print("\n")
    print(heading)
    print(frame.head(2))

In [None]:
# For both labelled frames: rename 'id' to 'movie_id', then add a 'review_id'
# column holding the 1-based row position (index + 1), intended for matching
# rows back to the original dataset.
# NOTE(review): the 'id' values come from the numeric prefix of each review's
# filename; confirm they identify movies rather than individual reviews
# before relying on the 'movie_id' name.
train = train.rename(columns={'id': 'movie_id'}).assign(
    review_id=lambda frame: frame.index + 1
)
test = test.rename(columns={'id': 'movie_id'}).assign(
    review_id=lambda frame: frame.index + 1
)

In [None]:
# Write each frame to its CSV under data/processed/, leaving out the index column.
for frame, destination in (
    (train, "data/processed/train.csv"),
    (test, "data/processed/test.csv"),
    (unsup, "data/processed/unsup.csv"),
):
    frame.to_csv(destination, index=False)