<a href="https://colab.research.google.com/github/Gutierrezjk/MIAD_ML_NLP_2025/blob/main/Sentiment_Analysis_(NLP).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

# Seed NumPy's global random generator so repeated runs behave the same.
np.random.seed(42)

# IMDb movie-review dataset hosted on GitHub: each row holds a text review
# and its sentiment label.
url = (
    "https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/"
    "refs/heads/master/IMDB-Dataset.csv"
)
df = pd.read_csv(url)

# Sanity check: report (rows, columns), then show the leading five records.
print("Data shape:", df.shape)
print(df.head(5))

Data shape: (50000, 2)
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [None]:
import re

def preprocessor(text):
    """
    Clean a raw review string for bag-of-words vectorization.

    Steps:
    - Replace HTML markup with a space, so words separated only by a tag
      (e.g. "good<br />movie") are not fused into a single token.
    - Extract emoticons such as :), :-), ;( or =D before punctuation is removed.
    - Collapse every run of non-word characters into one space and lowercase
      the text.
    - Append the extracted emoticons (hyphen "nose" removed, original case
      kept) after the cleaned words.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    str
        Cleaned text with no leading or trailing whitespace. An input that
        contains neither word characters nor emoticons yields "".
    """
    # Substitute a space (not the empty string) for each tag: the tag itself
    # is often the only separator between two words.
    text = re.sub(r"<[^>]*>", " ", text)

    # Find emoticons: eyes (: ; =), optional nose (-), mouth ( ) ( D P ).
    emoticons = re.findall(r"(?::|;|=)(?:-)?(?:\)|\(|D|P)", text)

    # Lowercase, turn punctuation runs into single spaces, trim the edges.
    words = re.sub(r"\W+", " ", text.lower()).strip()

    # Normalize ":-)" to ":)" so both spellings map to the same token.
    faces = [emoticon.replace("-", "") for emoticon in emoticons]

    # Join with single spaces; the final strip covers the case where there
    # are emoticons but no words.
    return " ".join([words, *faces]).strip()

# Clean every review and store the result in a new column beside the raw text.
df["review_clean"] = df["review"].map(preprocessor)

# Show only the tail (final 100 characters) of the first cleaned review.
first_clean = df.loc[0, "review_clean"]
print("\nSample cleaned review:", first_clean[-100:])


Sample cleaned review: comfortable with what is uncomfortable viewing thats if you can get in touch with your darker side  


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate the CountVectorizer.
# Its default token_pattern, r"(?u)\b\w\w+\b", keeps only runs of two or more
# word characters, so the emoticons that preprocessor() appends (":)", ":(", ...)
# would be silently discarded. The pattern below keeps the default word rule and
# adds an alternative that matches those emoticons. CountVectorizer lowercases
# the text before tokenizing (lowercase=True by default), so ":D" / ":P" arrive
# as ":d" / ":p" -- hence the lowercase letters in the character class.
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w\w+\b|[:;=][()dp]")

# Learn the vocabulary from the cleaned reviews and turn them into a sparse
# document-term count matrix.
X = vectorizer.fit_transform(df["review_clean"])

# Inspect a small portion of the resulting vocabulary.
print("\nSample vocabulary mapping (word -> index):")
sample_vocab = dict(list(vectorizer.vocabulary_.items())[:10])
print(sample_vocab)

# Also print the bag-of-words array for the first 3 reviews.
# Only this 3-row slice is densified; the full matrix stays sparse.
print("\nBag-of-words representation for the first 3 reviews:")
print(X[:3].toarray())


Sample vocabulary mapping (word -> index):
{'one': 65523, 'of': 65112, 'the': 91976, 'other': 66197, 'reviewers': 77049, 'has': 41639, 'mentioned': 58782, 'that': 91940, 'after': 3161, 'watching': 100325}

Bag-of-words representation for the first 3 reviews:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Encode the text labels numerically: 1 for "positive", 0 for "negative".
label_to_int = {"positive": 1, "negative": 0}
df["sentiment_bin"] = df["sentiment"].map(label_to_int)

# Hold out one fifth of the rows for testing; the fixed random_state keeps
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df["sentiment_bin"],
    test_size=0.2,
    random_state=42,
)

# Build a logistic regression classifier with the 'liblinear' solver and fit
# it on the training split (fit() returns the estimator itself, so creation
# and training can be chained).
lr = LogisticRegression(solver="liblinear").fit(X_train, y_train)

# Predict the held-out reviews and report the fraction classified correctly.
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy: {accuracy:.3f}")



Test Accuracy: 0.889


In [None]:
# Retrieve feature names (words) from our vectorizer.
feature_names = np.array(vectorizer.get_feature_names_out())

# Flatten the array of coefficients from our logistic regression model.
coef = lr.coef_.ravel()

# Sort the coefficients once. argsort is ascending, so the most negative
# weights come first and the most positive weights come last.
sorted_indices = np.argsort(coef)

# Top 10 words that increase sentiment. The slice is reversed so the list
# starts with the strongest word, matching the strongest-first order of the
# negative list below.
top_pos_indices = sorted_indices[-10:][::-1]
print("\nTop 10 words likely to indicate positive sentiment:")
for idx in top_pos_indices:
    print(f"{feature_names[idx]}: {coef[idx]:.4f}")

# Top 10 words that decrease sentiment (most negative weight first).
top_neg_indices = sorted_indices[:10]
print("\nTop 10 words likely to indicate negative sentiment:")
for idx in top_neg_indices:
    print(f"{feature_names[idx]}: {coef[idx]:.4f}")



Top 10 words likely to indicate positive sentiment:
underrated: 1.2617
dismiss: 1.2719
notting: 1.2729
nevertheless: 1.2860
appreciated: 1.2981
pleased: 1.3016
apocalyptic: 1.3494
disappoint: 1.3961
hooked: 1.6563
refreshing: 1.7364

Top 10 words likely to indicate negative sentiment:
waste: -2.1675
worst: -2.0344
disappointment: -1.9288
forgettable: -1.7623
mst3k: -1.6629
uninteresting: -1.6447
awful: -1.5995
disappointing: -1.5612
lacks: -1.5602
mediocre: -1.5590
