In [None]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Attach Google Drive to the Colab runtime's filesystem.
# NOTE(review): the dataset is later read from '/content/IMDB Dataset.csv',
# which is not under this mount point — confirm whether this mount is needed.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the IMDb reviews CSV into a DataFrame
dataset_path = '/content/IMDB Dataset.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Preview the first five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Convert sentiment to binary values (0 = negative, 1 = positive).
# The conversion only runs while the column still holds text labels, so
# re-running this cell is a no-op. An unconditional
# `1 if x == 'positive' else 0` would turn every already-encoded label
# into 0 on a second run, because 1 != 'positive'.
SENTIMENT_LABELS = {'negative': 0, 'positive': 1}

if not pd.api.types.is_numeric_dtype(df['sentiment']):
    encoded_sentiment = df['sentiment'].map(SENTIMENT_LABELS)
    # Labels missing from the mapping come back as NaN; fail loudly rather
    # than silently counting them as negative.
    unmapped = encoded_sentiment.isna()
    if unmapped.any():
        unexpected = list(df.loc[unmapped, 'sentiment'].unique())
        raise ValueError(f'Unexpected sentiment labels: {unexpected!r}')
    df['sentiment'] = encoded_sentiment.astype('int64')

In [None]:
# Text cleaning function
# The punctuation-deleting table is identical for every review, so build it once.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize a raw review string before vectorization.

    Steps, in order: lowercase; replace URLs, @mentions and HTML tags with a
    space; delete punctuation; delete digits; collapse whitespace runs into a
    single space and trim both ends.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text (may be empty).
    """
    text = text.lower()  # Convert to lowercase
    # Markup is replaced with a space rather than removed outright, so the
    # words on either side of e.g. "<br />" do not fuse into one token
    # ("movie.<br />Loved" must not become "movieloved").
    text = re.sub(r'http\S+', ' ', text)  # Remove URLs
    text = re.sub(r'@\w+', ' ', text)  # Remove mentions
    text = re.sub(r'<.*?>', ' ', text)  # Remove HTML tags
    text = text.translate(_PUNCTUATION_TABLE)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Collapse the gaps left by the removals and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
# Run every review through clean_text and store the result back in place
cleaned_reviews = df['review'].apply(clean_text)
df['review'] = cleaned_reviews

In [None]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible
reviews = df['review']
labels = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Turn the review text into count features with CountVectorizer.
# The vocabulary (capped at 5000 features) is learned from the training split
# only; the test split is transformed with that same fitted vocabulary.
vocabulary_size = 5000
vectorizer = CountVectorizer(max_features=vocabulary_size)

X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [None]:
# Train the logistic regression model.
# max_iter is raised well above scikit-learn's default (100): with the default
# budget the solver stopped before converging on these count features
# ("STOP: TOTAL NO. of ITERATIONS REACHED LIMIT"), so the fitted coefficients
# were not the optimizer's final solution.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vect, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Predict a sentiment label for every review in the held-out test split
y_pred = model.predict(X=X_test_vect)

In [None]:
# Score the predictions: fraction of test reviews whose label was predicted correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy_message = f'Accuracy: {accuracy:.2f}'
print(accuracy_message)

Accuracy: 0.88


An accuracy of 0.88 means the model correctly classified the sentiment of 88% of the reviews in the held-out test set. Because the test set is almost evenly balanced between negative (4,961) and positive (5,039) reviews, accuracy is a fair single-number summary of overall performance here.

In [None]:
# Display the confusion matrix (rows = actual class, columns = predicted class)
test_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion Matrix:')
print(test_confusion)

Confusion Matrix:
[[4325  636]
 [ 536 4503]]




In the matrix above, rows are the actual classes and columns are the predicted classes (0 = negative, 1 = positive):

1. True Negatives (TN): 4325 - the number of negative reviews that were correctly predicted as negative.
2. False Positives (FP): 636 - the number of negative reviews that were incorrectly predicted as positive.
3. False Negatives (FN): 536 - the number of positive reviews that were incorrectly predicted as negative.
4. True Positives (TP): 4503 - the number of positive reviews that were correctly predicted as positive.

In [None]:
# Per-class precision, recall, F1 and support for the test predictions
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print('Classification Report:')
print(report_text)

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      4961
           1       0.88      0.89      0.88      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



This report indicates that the model performs reasonably well, with an overall accuracy of 88%. Precision and recall are both high (0.87–0.89) for each class, suggesting that the model distinguishes positive from negative sentiment effectively and is not biased toward either one. The F1-scores of 0.88 for both classes indicate a good balance between precision and recall.

In [None]:
# Function to predict sentiment for a single line of text
def predict_sentiment(text):
    """Classify one piece of review text as "Positive" or "Negative".

    The text is passed through the notebook's ``clean_text``, then the fitted
    ``vectorizer``, then the trained ``model`` (all module-level names).

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        "Positive" when the model predicts label 1, otherwise "Negative".
    """
    # transform() takes a collection of documents, hence the one-item list
    features = vectorizer.transform([clean_text(text)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Positive"
    return "Negative"

# Example usage: classify a clearly positive review
input_text = "I absolutely loved this movie! It was fantastic."
sentiment = predict_sentiment(text=input_text)
print(f'The sentiment of the review is: {sentiment}')

The sentiment of the review is: Positive


In [None]:
# Second example: classify a negative review.
# predict_sentiment is already defined in an earlier cell, so it is reused
# here instead of being defined a second time under the same name.
input_text1 = "I movie was absolutely horrible."
sentiment = predict_sentiment(input_text1)
print(f'The sentiment of the review is: {sentiment}')

The sentiment of the review is: Negative
