#  Sentiment Analysis with NLP
# (Project 2)

# Objective
Perform sentiment analysis using TF-IDF vectorization and Logistic Regression on the IMDB dataset of 50,000 movie reviews.

# 1: Import Required Libraries

In [30]:
import re

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split


# 2: Load the Dataset

In [33]:
# Read the IMDB reviews CSV from the current working directory
df = pd.read_csv('IMDB Dataset.csv')

# Sanity check: report the frame's dimensions and preview the first rows
print(f"Dataset loaded. Shape: {df.shape}")
print("\nFirst 5 rows:", df.head(), sep="\n")


Dataset loaded. Shape: (50000, 2)

First 5 rows:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


# 3: Clean And Prepare the Data

In [36]:
# Compiled once and reused for every review. Matches the shortest run from
# '<' to the next '>'; '.' does not match a newline, so a tag that is split
# across lines is left untouched.
_HTML_TAG_PATTERN = re.compile(r'<.*?>')


def clean_text(text: str) -> str:
    """Strip HTML-style tags (e.g. ``<br />``) from a review string.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        ``text`` with every ``<...>`` span deleted. Nothing is inserted in
        place of a tag, so the text on either side of it is joined directly.
    """
    return _HTML_TAG_PATTERN.sub('', text)

# Strip HTML tags from every review.
df['review'] = df['review'].apply(clean_text)

# Encode sentiment labels as integers: positive -> 1, negative -> 0.
sentiment_codes = {'positive': 1, 'negative': 0}

# Map only while the column still holds the raw string labels. Re-running this
# cell on already-encoded 0/1 values would otherwise turn every label into NaN,
# because 0 and 1 are not keys of the mapping.
if not df['sentiment'].isin(list(sentiment_codes.values())).all():
    df['sentiment'] = df['sentiment'].map(sentiment_codes)

# Any label outside the mapping becomes NaN; fail here with a clear message
# instead of letting NaN targets reach the split/training cells.
if df['sentiment'].isna().any():
    raise ValueError(
        "Unexpected sentiment label(s); expected 'positive' or 'negative'."
    )

# Features (X) are the cleaned reviews; target (y) is the encoded sentiment.
X = df['review']
y = df['sentiment']

# 4: Split Data into Training and Testing Sets

In [39]:
# Hold out 25% of the reviews for testing. Stratifying on y keeps the class
# balance of the full dataset in both splits; the fixed seed makes the split
# repeatable.
split_options = dict(test_size=0.25, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("\nData split into training and testing sets.")
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")



Data split into training and testing sets.
Training set size: 37500
Testing set size: 12500


# 5: Vectorize text using TF-IDF

In [42]:
# Turn raw review text into TF-IDF feature vectors: English stop words are
# dropped and the vocabulary is capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# The vocabulary and IDF weights are learned from the training reviews only;
# the test reviews are then projected with that already-fitted vectorizer.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


# 6: Build and Train Logistic Regression Model

In [44]:
print("Training the Logistic Regression model...")
# liblinear solver, with random_state pinned for repeatability
lr_model = LogisticRegression(random_state=42, solver='liblinear')
lr_model.fit(X_train_tfidf, y_train)
print("Model training complete.")

Training the Logistic Regression model...
Model training complete.


# 7: Evaluate the Model

In [47]:
# Predict sentiment for the held-out reviews
y_pred = lr_model.predict(X_test_tfidf)

# Overall accuracy plus a per-class precision / recall / F1 table.
# The encoded labels 0 and 1 are displayed as 'Negative' and 'Positive'.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(
    y_test,
    y_pred,
    target_names=['Negative', 'Positive'],
)

print("\n--- Model Evaluation Results ---")
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:", report, sep="\n")


--- Model Evaluation Results ---
Accuracy: 89.01%

Classification Report:
              precision    recall  f1-score   support

    Negative       0.90      0.88      0.89      6250
    Positive       0.88      0.90      0.89      6250

    accuracy                           0.89     12500
   macro avg       0.89      0.89      0.89     12500
weighted avg       0.89      0.89      0.89     12500



# Key Notes:


This project demonstrates sentiment analysis on movie reviews using:
1) Text preprocessing
2) TF-IDF vectorization
3) Logistic Regression modeling
4) Evaluation using accuracy and classification report

   Link for the dataset - https://www.kaggle.com/code/nourhankarm/sentiment-analysis-of-movie-reviews-imdb-dataset/notebook
    