In [3]:
# Step 0: Setup
import pandas as pd
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Step 1: Load dataset
# The path is relative to the notebook's working directory (CSV sits one level up).
df = pd.read_csv('../imdb_movie_reviews.csv')
# Preview the first rows; the rendered output shows a `label` and a `review` column.
df.head()





Unnamed: 0,label,review
0,negative,"In the ten years since Wildside aired, nothing..."
1,positive,This is a better-than-average entry in the Sai...
2,negative,"""The Mayor Of Hell"" has the feel of an early D..."
3,positive,This is a really great short from Hal Roach. T...
4,positive,A rather charming depiction of European union ...


In [10]:
# Step 2: Remove punctuation
# Deletion table built once: every character of string.punctuation maps to None.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(review):
    """Return ``review`` with every ``string.punctuation`` character deleted.

    Characters are removed, not replaced by a space, so hyphenated or
    apostrophised words are fused ("better-than" -> "betterthan",
    "don't" -> "dont"). Only the ASCII punctuation listed in
    ``string.punctuation`` is affected; other symbols (e.g. an em dash)
    are kept.

    Args:
        review: The text to clean (a ``str``).

    Returns:
        A new ``str`` without ASCII punctuation.
    """
    # str.translate does the whole pass in one call instead of a Python-level
    # loop with a linear membership test per character.
    return review.translate(_PUNCTUATION_TABLE)

In [11]:
# Element-wise pass over the raw reviews; the result lands in a new column.
df['review_no_punct'] = df['review'].map(remove_punctuation)



In [12]:
# Step 3: Tokenize & lowercase
def tokenize(review):
    """Lowercase ``review`` and split it on runs of whitespace.

    Returns a list of tokens; an empty or whitespace-only string yields ``[]``.
    """
    lowered = review.lower()
    tokens = lowered.split()
    return tokens



In [13]:
# Tokenize the punctuation-free text; each cell of the new column holds a list of tokens.
df['review_tokens'] = df['review_no_punct'].map(tokenize)



In [14]:
# Analyze word counts
# One length per review, then a single print call emitting the header and the summary.
df['token_count'] = df['review_tokens'].map(len)
print("Token Count Stats:", df['token_count'].describe(), sep="\n")



Token Count Stats:
count    50000.000000
mean       231.996520
std        176.912645
min         10.000000
25%        124.000000
50%        171.000000
75%        282.000000
max       2469.000000
Name: token_count, dtype: float64


In [15]:
# Step 4: Remove stopwords
stop_words = stopwords.words('english')

# Hashed snapshot of the list above. Membership is tested once per token across
# the whole corpus, so a constant-time lookup replaces a scan of the full list.
# It is built once here: later edits to `stop_words` are NOT picked up unless
# this cell is re-run.
_stop_word_lookup = frozenset(stop_words)

# NOTE(review): the tokens fed to remove_stopwords come from punctuation-stripped
# text, so apostrophe-bearing stopwords (presumably entries such as "don't")
# cannot match their stripped form ("dont") — confirm whether that is intended.


def remove_stopwords(review_tokens):
    """Return the tokens of ``review_tokens`` that are not English stopwords.

    Args:
        review_tokens: Iterable of token strings (already lowercased upstream).

    Returns:
        A new list with the original order preserved and stopwords dropped.
    """
    return [word for word in review_tokens if word not in _stop_word_lookup]

# Filter each review's token list; the unfiltered tokens stay in 'review_tokens'.
df['review_tokens_nostop'] = df['review_tokens'].map(remove_stopwords)



In [16]:
# Compare before vs after
# Same summary as for 'token_count', computed on the stopword-free token lists.
df['token_count_nostop'] = df['review_tokens_nostop'].map(len)
print("\nToken Count After Stopword Removal:", df['token_count_nostop'].describe(), sep="\n")




Token Count After Stopword Removal:
count    50000.000000
mean       124.062000
std         96.185059
min          6.000000
25%         65.000000
50%         91.000000
75%        152.000000
max       1449.000000
Name: token_count_nostop, dtype: float64


In [17]:
# Step 5: Save report
# Writes every column currently on df to a CSV in the working directory, without the index.
# NOTE(review): the token columns hold Python lists; presumably they are written
# as their string form — confirm this is the intended on-disk format.
df.to_csv('cleaned_movie_reviews.csv', index=False)

In [18]:
import nltk
nltk.download('punkt')
from nltk.stem import PorterStemmer

# Initialize stemmer
# A single shared instance is reused for every review.
stemmer = PorterStemmer()

# Apply stemming
def stem_tokens(tokens):
    """Return a list holding ``stemmer.stem(token)`` for each token, in order."""
    return list(map(stemmer.stem, tokens))

# Stem the stopword-free token lists into a new column
df['stemmed_tokens'] = df['review_tokens_nostop'].map(stem_tokens)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harol\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [19]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# A single shared lemmatizer instance is reused for every review.
lemmatizer = WordNetLemmatizer()

# Lemmatize tokens; `pos` is forwarded unchanged to the lemmatizer ('n' unless the caller overrides it)
def lemmatize_tokens(tokens, pos='n'):
    """Return a list with ``lemmatizer.lemmatize(token, pos=pos)`` for each token.

    Args:
        tokens: Iterable of token strings.
        pos: Part-of-speech tag handed to the lemmatizer; defaults to ``'n'``.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token, pos=pos))
    return lemmas

# Lemmatize the same stopword-free token lists. The function's default POS ('n')
# is overridden here with 'v', so this column holds verb-based lemmas.
df['lemmatized_tokens'] = df['review_tokens_nostop'].apply(lemmatize_tokens, pos='v')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\harol\AppData\Roaming\nltk_data...


In [20]:
# Side-by-side preview (first 10 rows) of the stopword-free tokens and their
# stemmed and lemmatized counterparts.
df[['review_tokens_nostop', 'stemmed_tokens', 'lemmatized_tokens']].head(10)


Unnamed: 0,review_tokens_nostop,stemmed_tokens,lemmatized_tokens
0,"[ten, years, since, wildside, aired, nothing, ...","[ten, year, sinc, wildsid, air, noth, realli, ...","[ten, years, since, wildside, air, nothing, re..."
1,"[betterthanaverage, entry, saint, series, hold...","[betterthanaverag, entri, saint, seri, hold, i...","[betterthanaverage, entry, saint, series, hold..."
2,"[mayor, hell, feel, early, dead, end, kids, fi...","[mayor, hell, feel, earli, dead, end, kid, fil...","[mayor, hell, feel, early, dead, end, kid, fil..."
3,"[really, great, short, hal, roach, two, main, ...","[realli, great, short, hal, roach, two, main, ...","[really, great, short, hal, roach, two, main, ..."
4,"[rather, charming, depiction, european, union,...","[rather, charm, depict, european, union, begin...","[rather, charm, depiction, european, union, be..."
5,"[despite, title, unlike, stories, love, war, f...","[despit, titl, unlik, stori, love, war, film, ...","[despite, title, unlike, stories, love, war, f..."
6,"[found, one, video, store, rented, one, quirky...","[found, one, video, store, rent, one, quirki, ...","[find, one, video, store, rent, one, quirky, q..."
7,"[would, say, film, gives, insight, trauma, you...","[would, say, film, give, insight, trauma, youn...","[would, say, film, give, insight, trauma, youn..."
8,"[prequel, reimagined, battlestar, galactica, s...","[prequel, reimagin, battlestar, galactica, ser...","[prequel, reimagined, battlestar, galactica, s..."
9,"[watched, movie, bad, tense, moments, lot, lon...","[watch, movi, bad, tens, moment, lot, long, di...","[watch, movie, bad, tense, moments, lot, long,..."


In [2]:
!pip install datasets --quiet


In [2]:
from datasets import load_dataset

print("Loading IMDb dataset...")
# Loads the 'imdb' dataset; the summary shown further down lists the splits
# train, test and unsupervised.
ds = load_dataset('imdb')
print("Loaded successfully!")


Loading IMDb dataset...
Loaded successfully!


In [3]:
# Echo the DatasetDict summary (splits, feature names, row counts)
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [2]:
# First row
# A cell only echoes its last expression, so the row has to be displayed
# explicitly; otherwise its value is silently discarded.
display(ds['train'][0])

# Dataset features
ds['train'].features


{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None)}

In [8]:
import pandas as pd

# Convert the train / test splits to pandas DataFrames.
# Dataset.to_pandas() converts the split in one call instead of having
# pd.DataFrame(...) iterate over the dataset row by row in Python.
ds_train = ds['train'].to_pandas()
ds_test = ds['test'].to_pandas()

ds_test.head()


Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0


In [9]:
# Preview the first rows of the training split
ds_train.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [10]:
from datasets import Dataset, DatasetDict

# Convert the pandas DataFrames back to HF format.
# - features=...: reuse each split's original schema so `label` stays a
#   ClassLabel (neg/pos) instead of being re-inferred as a plain integer column.
#   This requires the frames to still have exactly the original columns.
# - preserve_index=False: never store the pandas index as an extra column,
#   even if the frames are filtered or re-indexed before this step.
train = Dataset.from_pandas(ds_train, features=ds['train'].features, preserve_index=False)
test = Dataset.from_pandas(ds_test, features=ds['test'].features, preserve_index=False)

# Combine into a DatasetDict (the 'unsupervised' split is intentionally not carried over here)
new_ds = DatasetDict({
    'train': train,
    'test': test,
})

# View the new dataset object
new_ds


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})