In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
input_tree = os.walk('/kaggle/input')
for dirname, _, filenames in input_tree:
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Sentiment Analysis of Movie Reviews

**Objective**: Build a sentiment analysis model to classify IMDb movie reviews as positive or negative.

In [None]:
pip install nltk


There was an issue accessing the IMDb movie review dataset. We used an alternative method to load the dataset: the `nltk` library, which provides access to the IMDb movie reviews dataset.

## Data Preparation 

In [None]:
import nltk
from nltk.corpus import movie_reviews

# Fetch the movie_reviews corpus through NLTK's downloader
nltk.download('movie_reviews')

# Collect one {'text', 'label'} record per corpus file, category by category;
# the category name serves as the sentiment label
reviews = [
    {'text': movie_reviews.raw(fileid), 'label': category}
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# Build the DataFrame from the full list of records in a single call
imdb_df = pd.DataFrame(reviews)

# Preview the first rows
print(imdb_df.head())

# Show how many reviews carry each label
print(imdb_df['label'].value_counts())


## Text Preprocessing 

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

# Download the NLTK resources this cell depends on:
#   - 'stopwords': the English stop-word list loaded below
#   - 'punkt' / 'punkt_tab': tokenizer data needed by word_tokenize(); which of the
#     two is required depends on the installed NLTK version, so both are requested.
# NOTE(review): nltk.download() is expected to report (not raise) when a name is
# unknown to the installed version's index -- confirm on the target environment.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Initialize stemmer and set of stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text: str) -> str:
    """Normalize a raw review into a space-separated string of stemmed tokens.

    Steps: lowercase the text, tokenize it with ``word_tokenize``, drop
    punctuation-only tokens and English stop words, then stem every
    remaining token with the module-level ``stemmer``.

    Args:
        text: Raw review text.

    Returns:
        The surviving stemmed tokens joined by single spaces.
    """
    # Lowercase first so the stop-word lookup matches regardless of original casing
    tokens = word_tokenize(text.lower())

    # A token is treated as punctuation when *every* character is a punctuation
    # mark. A plain `token in string.punctuation` is a substring test against one
    # long string, so multi-character tokens such as '...', '--', '``' or "''"
    # would not be filtered out.
    punctuation_chars = set(string.punctuation)

    preprocessed_tokens = [
        stemmer.stem(token)
        for token in tokens
        if token not in stop_words
        and not all(char in punctuation_chars for char in token)
    ]

    # Join tokens back into text
    return ' '.join(preprocessed_tokens)

# Run every raw review through preprocess_text and store the result in a new column
raw_texts = imdb_df['text']
imdb_df['preprocessed_text'] = raw_texts.map(preprocess_text)

# Preview the cleaned text
print(imdb_df['preprocessed_text'].head())


## TF-IDF Feature Extraction 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the vectorizer with scikit-learn's default settings
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary/IDF weights and vectorize the preprocessed reviews in one step.
# NOTE(review): the vectorizer is fitted on the whole corpus, i.e. before any
# train/test split, so statistics from rows later used for testing influence the
# features. Consider fitting on the training rows only.
preprocessed_corpus = imdb_df['preprocessed_text']
X_tfidf = tfidf_vectorizer.fit_transform(preprocessed_corpus)

# Report the matrix dimensions
print("TF-IDF matrix shape:", X_tfidf.shape)

## Model Selection 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    imdb_df['label'],
    test_size=0.2,
    random_state=42,
)

# Create a logistic regression classifier with scikit-learn's default settings
logistic_regression_model = LogisticRegression()

# Fit the classifier on the training split
logistic_regression_model.fit(X_train, y_train)


## Model Evaluation 

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict labels for the held-out split
y_pred = logistic_regression_model.predict(X_test)

# Score the predictions; 'pos' is treated as the positive class for the binary metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='pos')
recall = recall_score(y_test, y_pred, pos_label='pos')
f1 = f1_score(y_test, y_pred, pos_label='pos')

# Report each metric on its own line
for metric_heading, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-Score:", f1),
):
    print(metric_heading, metric_value)


## Model Tuning (Optional)

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid as two sub-grids, because not every solver
# supports every penalty:
#   - 'l2' is searched with the estimator's own (default) solver settings;
#   - 'l1' is paired with the 'liblinear' solver, which supports it. Left on the
#     default 'lbfgs' solver, every 'l1' candidate fails to fit.
regularization_strengths = [0.001, 0.01, 0.1, 1, 10]
param_grid = [
    {'C': regularization_strengths, 'penalty': ['l2']},
    {'C': regularization_strengths, 'penalty': ['l1'], 'solver': ['liblinear']},
]

# Initialize the grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=logistic_regression_model, param_grid=param_grid, cv=5)

# Perform grid search on the training split only
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Evaluate the model with the best hyperparameters on the held-out test split
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
precision_best = precision_score(y_test, y_pred_best, pos_label='pos')
recall_best = recall_score(y_test, y_pred_best, pos_label='pos')
f1_best = f1_score(y_test, y_pred_best, pos_label='pos')

# Print the evaluation metrics for the best model
print("Accuracy (Best Model):", accuracy_best)
print("Precision (Best Model):", precision_best)
print("Recall (Best Model):", recall_best)
print("F1-Score (Best Model):", f1_best)


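In the recorded run, some fits failed during the grid search: the `l1` penalty is not supported by the default `lbfgs` solver of `LogisticRegression`, so the `l1` combinations could not be fitted on that solver. Despite these failures, the search still found the best hyperparameters for the logistic regression model among the remaining combinations.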

Here's the result of the hyperparameter tuning:

- **Best Hyperparameters:** `{'C': 10, 'penalty': 'l2'}`
- **Accuracy (Best Model):** 0.8375
- **Precision (Best Model):** 0.8366
- **Recall (Best Model):** 0.8408
- **F1-Score (Best Model):** 0.8387

## Prediction 

In [None]:
# Sample reviews to classify
new_reviews = [
    "This movie was amazing! I loved every moment of it.",
    "The acting was terrible and the plot was boring.",
    "I couldn't stop laughing throughout the entire film."
]

# Clean the samples with the same preprocessing used for the training text
preprocessed_new_reviews = list(map(preprocess_text, new_reviews))

# Vectorize with the already-fitted vectorizer (transform only, no re-fitting)
X_new_tfidf = tfidf_vectorizer.transform(preprocessed_new_reviews)

# Classify the samples with the logistic regression model trained above
predictions = logistic_regression_model.predict(X_new_tfidf)

# Show each review next to its predicted label, separated by a blank line
for review, prediction in zip(new_reviews, predictions):
    print("Review:", review)
    print("Predicted Sentiment:", prediction)
    print()


## Conclusion 

After completing the sentiment analysis of IMDb movie reviews using a logistic regression model, we achieved the following results:

- The tuned logistic regression model (`C=10`, `l2` penalty) achieved an accuracy of approximately 83.75% on the test data, indicating that it can effectively classify movie reviews as positive or negative.
- The precision, recall, and F1-score of the model were around 0.84, indicating a good balance between precision (ability to avoid false positives) and recall (ability to identify positive samples).

Insights from the analysis:

- The logistic regression model, trained on TF-IDF features, performed well in classifying movie reviews as positive or negative, demonstrating the effectiveness of text classification techniques in sentiment analysis tasks.
- The model's performance could potentially be further improved with additional preprocessing steps, feature engineering, or experimenting with different machine learning algorithms.

Overall, the sentiment analysis experiment provided valuable insights into the sentiment of IMDb movie reviews and demonstrated the effectiveness of machine learning models in analyzing text data.