In [9]:
# 1. Import Libraries
import pandas as pd

# Load the IMDb dataset (make sure the file name matches your uploaded file)
df = pd.read_csv('/content/IMDB Dataset.csv')

# Display the first few rows of the dataset
print(df.head())

# Check the distribution of sentiment classes
print(df['sentiment'].value_counts())

# 2. Preprocess the Text
import re
import string
import nltk
from nltk.corpus import stopwords

# Download the list of English stopwords from NLTK
nltk.download('stopwords')

# Create a set of English stopwords
stop_words = set(stopwords.words('english'))

# Define a function to clean and preprocess the review text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"<.*?>", "", text)  # Remove HTML tags
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove numbers
    words = text.split()  # Tokenize by whitespace
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return " ".join(words)  # Join the cleaned words back into one string

# Apply preprocessing to all reviews
df['review'] = df['review'].apply(preprocess_text)

# Convert sentiment labels to numeric values: 1 = positive, 0 = negative
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# 3. Train/Test Split
from sklearn.model_selection import train_test_split

# Define features (X) and labels (y)
X = df['review']
y = df['sentiment']

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Vectorize Text with TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer and limit to 5000 most important features
tfidf = TfidfVectorizer(max_features=5000)

# Fit on training data and transform both training and testing sets
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

# 5. Train the Model using Logistic Regression
from sklearn.linear_model import LogisticRegression

# Initialize and train the model
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# 6. Evaluate the Model
from sklearn.metrics import accuracy_score, f1_score

# Make predictions on the test set
y_pred = model.predict(X_test_vec)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

# 7. Save the Model and Vectorizer using Pickle
import pickle

# Save the trained model to a .pkl file
with open('sentiment_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save the TF-IDF vectorizer to a .pkl file
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# 8. Download the saved model and vectorizer files (for use in Flask app or elsewhere)
from google.colab import files

files.download('sentiment_model.pkl')  # Download model file
files.download('tfidf_vectorizer.pkl')  # Download vectorizer file


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.8874
F1 Score: 0.8897375636506071


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
from google.colab import files

# Download the model
files.download('sentiment_model.pkl')

# Download the vectorizer
files.download('tfidf_vectorizer.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>