In [None]:
from google.colab import drive
import pandas as pd

# Mount Google Drive
drive.mount('/content/drive')

# Provide the correct file path (modify it based on your folder structure)
file_path = '/content/drive/My Drive/IMDB Dataset.csv'

# Load the dataset
df = pd.read_csv(file_path)

# Display the first few rows
print(df.head())

# Check for missing values
print(df.isnull().sum())

# Check data distribution
print(df['sentiment'].value_counts())


Mounted at /content/drive
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
review       0
sentiment    0
dtype: int64
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [None]:

# Summary of dataset
print("Dataset Summary:\n", df.describe())

# Check dataset information
print("\nDataset Info:")
print(df.info())

# Count of each sentiment category
print("\nSentiment Distribution:")
print(df['sentiment'].value_counts())


Dataset Summary:
                                                    review sentiment
count                                               50000     50000
unique                                              49582         2
top     Loved today's show!!! It was a variety and not...  positive
freq                                                    5     25000

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None

Sentiment Distribution:
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [None]:
import re
from bs4 import BeautifulSoup

def clean_text(text):
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # Convert to lowercase
    text = text.lower()

    return text

# Apply the cleaning function
df['cleaned_review'] = df['review'].apply(clean_text)

# Display first 5 cleaned reviews
print(df[['review', 'cleaned_review']].head())


                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review  
0  one of the other reviewers has mentioned that ...  
1  a wonderful little production  the filming tec...  
2  i thought this was a wonderful way to spend ti...  
3  basically there s a family where a little boy ...  
4  petter mattei s  love in the time of money  is...  


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download required NLTK data
nltk.download('punkt_tab')  # Updated tokenizer resource
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))

def remove_stopwords_and_tokenize(text):
    words = word_tokenize(text)  # Tokenize text
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return words

# Apply stopword removal and tokenization
df['tokenized_review'] = df['cleaned_review'].apply(remove_stopwords_and_tokenize)

# Display first 5 tokenized reviews
print(df[['cleaned_review', 'tokenized_review']].head())


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


                                      cleaned_review  \
0  one of the other reviewers has mentioned that ...   
1  a wonderful little production  the filming tec...   
2  i thought this was a wonderful way to spend ti...   
3  basically there s a family where a little boy ...   
4  petter mattei s  love in the time of money  is...   

                                    tokenized_review  
0  [one, reviewers, mentioned, watching, oz, epis...  
1  [wonderful, little, production, filming, techn...  
2  [thought, wonderful, way, spend, time, hot, su...  
3  [basically, family, little, boy, jake, thinks,...  
4  [petter, mattei, love, time, money, visually, ...  


In [None]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def stem_words(words):
    return [stemmer.stem(word) for word in words]

# Apply stemming
df['stemmed_review'] = df['tokenized_review'].apply(stem_words)

# Display first 5 stemmed reviews
print(df[['tokenized_review', 'stemmed_review']].head())


                                    tokenized_review  \
0  [one, reviewers, mentioned, watching, oz, epis...   
1  [wonderful, little, production, filming, techn...   
2  [thought, wonderful, way, spend, time, hot, su...   
3  [basically, family, little, boy, jake, thinks,...   
4  [petter, mattei, love, time, money, visually, ...   

                                      stemmed_review  
0  [one, review, mention, watch, oz, episod, hook...  
1  [wonder, littl, product, film, techniqu, unass...  
2  [thought, wonder, way, spend, time, hot, summe...  
3  [basic, famili, littl, boy, jake, think, zombi...  
4  [petter, mattei, love, time, money, visual, st...  


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert stemmed reviews back to text format
df['stemmed_review_text'] = df['stemmed_review'].apply(lambda x: ' '.join(x))

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # Limiting features to 5000 for efficiency

# Apply TF-IDF transformation
X_tfidf = tfidf.fit_transform(df['stemmed_review_text'])

# Display shape of TF-IDF matrix
print("TF-IDF Matrix Shape:", X_tfidf.shape)


TF-IDF Matrix Shape: (50000, 5000)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Initialize Label Encoder
label_encoder = LabelEncoder()

# Transform sentiment labels
df['sentiment_label'] = label_encoder.fit_transform(df['sentiment'])

# Display first 5 encoded sentiment labels
print(df[['sentiment', 'sentiment_label']].head())


  sentiment  sentiment_label
0  positive                1
1  positive                1
2  positive                1
3  negative                0
4  positive                1


In [None]:
from sklearn.model_selection import train_test_split

# Define features (X) and target labels (y)
X = X_tfidf
y = df['sentiment_label']

# Split into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Display dataset sizes
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)


Training set size: (40000, 5000)
Testing set size: (10000, 5000)


In [None]:
from sklearn.linear_model import LogisticRegression

# Initialize Logistic Regression model
log_reg = LogisticRegression(solver='liblinear', random_state=42)

# Train the model
log_reg.fit(X_train, y_train)

# Display training completion message
print("Model training complete! ✅")


Model training complete! ✅


In [None]:
# Predict on training data
train_predictions = log_reg.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)

# Predict on testing data
test_predictions = log_reg.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)

# Print results
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")


Training Accuracy: 0.9092
Testing Accuracy: 0.8884


In [None]:
import joblib

# Save the trained model
joblib.dump(log_reg, "sentiment_model.pkl")
# Save the trained TF-IDF vectorizer correctly
joblib.dump(tfidf, "tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']