In [24]:
import pandas as pd
import nltk
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load dataset
data = pd.read_csv('/content/drive/My Drive/dataset_kcb/IMDB_Dataset.csv')
data

# Tokenization
from nltk.tokenize import word_tokenize

# Stemming
from nltk.stem import PorterStemmer

# Sentiment Analysis
from nltk.sentiment import SentimentIntensityAnalyzer

# Download necessary NLTK datasets
nltk.download('punkt')
nltk.download('vader_lexicon')

# Initialize stemmer and sentiment analyzer
stemmer = PorterStemmer()
sia = SentimentIntensityAnalyzer()

# Function to preprocess text: tokenization and stemming
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Stem the tokens
    stems = [stemmer.stem(token) for token in tokens]
    return ' '.join(stems)

# Preprocess the review texts
data['processed_review'] = data['review'].apply(preprocess_text)

# Perform sentiment analysis on the processed reviews
data['sentiment_score'] = data['processed_review'].apply(lambda x: sia.polarity_scores(x)['compound'])

# Determine sentiment category based on sentiment score
def determine_sentiment(score):
    if score >= 0.05:
        return 'positive'
    elif score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

data['predicted_sentiment'] = data['sentiment_score'].apply(determine_sentiment)

# Display the first few rows of the processed data
print(data.head())

# Compare predicted sentiment with actual sentiment
accuracy = (data['sentiment'] == data['predicted_sentiment']).mean()
print(f"Accuracy: {accuracy}")

# Save the processed data to a new CSV file
data.to_csv('/content/drive/My Drive/dataset_kcb/IMDB_Dataset_Processed.csv', index=False)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. <br /><br />The...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   

                                    processed_review  sentiment_score  \
0  one of the other review ha mention that after ...          -0.9710   
1  a wonder littl product . < br / > < br / > the...           0.9439   
2  i thought thi wa a wonder way to spend time on...           0.9345   
3  basic there 's a famili where a littl boy ( ja...          -0.7639   
4  petter mattei 's `` love in the time of money ...           0.9714   

  predicted_sentiment  
0            negative  
1            positive  
2            positive  
3            negative  
4            positive  
Accuracy: 0.62626262

In [23]:
data = pd.read_csv('/content/drive/My Drive/dataset_kcb/IMDB_Dataset_Processed.csv')

data

Unnamed: 0,review,sentiment,processed_review,sentiment_score,predicted_sentiment
0,One of the other reviewers has mentioned that ...,positive,one of the other review ha mention that after ...,-0.9710,negative
1,A wonderful little production. <br /><br />The...,positive,a wonder littl product . < br / > < br / > the...,0.9439,positive
2,I thought this was a wonderful way to spend ti...,positive,i thought thi wa a wonder way to spend time on...,0.9345,positive
3,Basically there's a family where a little boy ...,negative,basic there 's a famili where a littl boy ( ja...,-0.7639,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei 's `` love in the time of money ...,0.9714,positive
...,...,...,...,...,...
94,"I watched this series out of curiosity,wanting...",negative,"i watch thi seri out of curios , want to see i...",0.7964,positive
95,Daniel Day-Lewis is the most versatile actor a...,positive,daniel day-lewi is the most versatil actor ali...,0.8550,positive
96,My guess would be this was originally going to...,negative,my guess would be thi wa origin go to be at le...,-0.9636,negative
97,"Well, I like to watch bad horror B-Movies, cau...",negative,"well , i like to watch bad horror b-movi , cau...",-0.9866,negative
