# Sentiment analysis using the IMDb dataset

In [1]:
!pip install  pandas
!pip install datasets
!pip install nltk
!pip install sckit learn


Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[

**Load the dataset**

In [2]:
from datasets import load_dataset

# Fetch the IMDb review corpus from the Hugging Face Hub (cached after the
# first download).
dataset = load_dataset(path='imdb')

# Sanity check: show the first record of the training split.
print(dataset['train'][0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

**Data Preprocessing step**

In [8]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Download necessary NLTK resources
nltk.download('punkt')
# Recent NLTK releases (3.9+) make word_tokenize load the 'punkt_tab' data
# instead of the pickled 'punkt' models; without it a fresh, unpinned install
# raises LookupError at tokenization time. Downloading both keeps the notebook
# runnable on old and new NLTK versions.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
# Initialize the lemmatizer (shared by every preprocess_text_nltk call)
lemmatizer = WordNetLemmatizer()

from functools import lru_cache

# Translation table that deletes every ASCII punctuation character.
# Built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the English stop-word set, reading the NLTK corpus only once."""
    return frozenset(stopwords.words('english'))


def preprocess_text_nltk(text):
    """
    Preprocess the input text using NLTK.

    Steps, in order:
    - Convert to lowercase
    - Remove punctuation (every character in ``string.punctuation``)
    - Tokenize with ``word_tokenize``
    - Remove English stopwords
    - Lemmatize each remaining token with the module-level ``lemmatizer``

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    # Lowercase first so the stop-word comparison below is case-insensitive.
    normalized = text.lower().translate(_PUNCTUATION_TABLE)

    tokens = word_tokenize(normalized)

    # The stop-word set is cached: this function is applied to tens of
    # thousands of reviews, and re-reading the corpus on every call was pure
    # overhead.
    stop_words = _english_stop_words()

    # Filter and lemmatize in a single pass; the result is identical to
    # filtering first and lemmatizing afterwards.
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    return ' '.join(lemmas)
import pandas as pd

# Tiny two-row frame used to demonstrate the preprocessing function: build the
# raw 'text' column, then derive 'cleaned_text' from it.
df = pd.DataFrame(
    ["This is a sample sentence.", "Another example with some text!"],
    columns=['text'],
)
df['cleaned_text'] = df['text'].apply(preprocess_text_nltk)

# Show raw and cleaned text side by side.
print(df)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


                              text          cleaned_text
0       This is a sample sentence.       sample sentence
1  Another example with some text!  another example text


**Label Encoding**

In [9]:
from sklearn.preprocessing import LabelEncoder

# Materialize the Hugging Face train/test splits as pandas DataFrames.
# These frames were previously never defined anywhere in the notebook (the
# defining cell had been deleted), so a fresh "Restart & Run All" failed here
# with NameError. The code below reads their 'text' and 'label' columns.
df_train = dataset['train'].to_pandas()
df_test = dataset['test'].to_pandas()

# Encode sentiment labels: fit the encoder on the training labels only, then
# reuse the same mapping for the test labels.
label_encoder = LabelEncoder()
df_train['sentiment'] = label_encoder.fit_transform(df_train['label'])
df_test['sentiment'] = label_encoder.transform(df_test['label'])


**Split the data and extract features**

In [11]:
# Run every review in both splits through the NLTK cleaning function and keep
# the result alongside the raw text in a new 'cleaned_text' column.
df_train['cleaned_text'] = df_train['text'].map(preprocess_text_nltk)
df_test['cleaned_text'] = df_test['text'].map(preprocess_text_nltk)

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Inputs (cleaned review text) and targets (encoded sentiment) for each split.
X_train, y_train = df_train['cleaned_text'], df_train['sentiment']
X_test, y_test = df_test['cleaned_text'], df_test['sentiment']

# TF-IDF features: the vocabulary and IDF weights are learned from the
# training text only, then applied unchanged to the test text.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

**Training the classifier**

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Fit a multinomial Naive Bayes classifier on the TF-IDF training features
# (fit() returns the estimator itself, so construction and training chain).
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict sentiment for the held-out test features.
y_pred = model.predict(X_test_tfidf)

# Report overall accuracy and the per-class precision/recall/F1 breakdown.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8336
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.88      0.84     12500
           1       0.87      0.79      0.83     12500

    accuracy                           0.83     25000
   macro avg       0.84      0.83      0.83     25000
weighted avg       0.84      0.83      0.83     25000



**Saving and loading the model**

In [13]:
import joblib

# Persist both artifacts needed for inference: the trained classifier and the
# fitted TF-IDF vectorizer that produces its input features.
joblib.dump(value=model, filename='sentiment_model.pkl')
joblib.dump(value=tfidf_vectorizer, filename='tfidf_vectorizer.pkl')

print("Model and vectorizer saved successfully.")


Model and vectorizer saved successfully.


In [14]:
import joblib

# Restore the classifier and its matching TF-IDF vectorizer from disk.
# joblib files are pickle-based: only load files you created yourself.
model = joblib.load(filename='sentiment_model.pkl')
tfidf_vectorizer = joblib.load(filename='tfidf_vectorizer.pkl')

print("Model and vectorizer loaded successfully.")


Model and vectorizer loaded successfully.


**Testing**

In [15]:
# Example text for classification
example_text = "I love this product! It works wonderfully."

# Preprocess with the same function used to build the training features.
# (The function defined in this notebook is `preprocess_text_nltk`; the
# previously referenced `preprocess_text` does not exist on a fresh kernel.)
cleaned_text = preprocess_text_nltk(example_text)
text_tfidf = tfidf_vectorizer.transform([cleaned_text])
prediction = model.predict(text_tfidf)

# Map the prediction to sentiment. The classifier was trained on two classes
# only (0 and 1), so there is no 'Neutral' class to map; anything unexpected
# falls back to 'Unknown'.
sentiment_map = {0: 'Negative', 1: 'Positive'}
sentiment = sentiment_map.get(int(prediction[0]), 'Unknown')

print(f"The sentiment of the example text is {sentiment}.")


The sentiment of the example text is Positive.
