In [16]:
import kagglehub
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
nltk.download('all')


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/nafisgh/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/nafisgh/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/nafisgh/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /Users/nafisgh/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/nafisgh/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]  

In [11]:
# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

df = pd.read_csv(path + "/IMDB Dataset.csv")
df['sentiment'] = df['sentiment'].map({'negative': 0, 'positive': 1})
df

Path to dataset files: /Users/nafisgh/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [19]:
# create preprocess_text function
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    # Join the tokens back into a string
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text
tqdm.pandas()
# clean/preprocess the review text
df['review_cleaned'] = df['review'].progress_apply(preprocess_text)
df

100%|██████████| 50000/50000 [08:20<00:00, 99.99it/s] 


Unnamed: 0,review,sentiment,review_cleaned
0,One of the other reviewers has mentioned that ...,1,one reviewer mentioned watching 1 oz episode '...
1,A wonderful little production. <br /><br />The...,1,wonderful little production . < br / > < br / ...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,basically 's family little boy ( jake ) think ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter mattei 's `` love time money '' visuall...
...,...,...,...
49995,I thought this movie did a down right good job...,1,thought movie right good job . n't creative or...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0,"bad plot , bad dialogue , bad acting , idiotic..."
49997,I am a Catholic taught in parochial elementary...,0,catholic taught parochial elementary school nu...
49998,I'm going to have to disagree with the previou...,0,'m going disagree previous comment side maltin...


In [None]:

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

vectorizer = TfidfVectorizer(max_features=5000)
classifier = LogisticRegression(max_iter=1000)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'],
    df[label_column],
    test_size=0.2,
    random_state=42
)

# Vectorize text
print("Vectorizing text...")
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)
    

def train(X_train, y_train):
    """Train the sentiment analysis model"""
    print("Training model...")
    classifier.fit(X_train, y_train)

def evaluate(X_test, y_test):
    """Evaluate the model"""
    print("\nModel Evaluation:")
    y_pred = classifier.predict(X_test)
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    return y_pred

def predict(text, vectorizer, classifier):
    """Predict sentiment for new text"""
    processed_text = preprocess_text(text)
    vectorized_text = vectorizer.transform([processed_text])
    prediction = classifier.predict(vectorized_text)
    probability = classifier.predict_proba(vectorized_text)
    
    return {
        'sentiment': 'positive' if prediction[0] == 1 else 'negative',
        'confidence': max(probability[0])
    }

def main():
    # Example usage with a sample DataFrame
    # Assuming your DataFrame has 'text' and 'sentiment' columns
    # where sentiment is binary (0 for negative, 1 for positive)
    
    # Sample data loading (replace with your actual data)
    df = pd.DataFrame({
        'text': ["This is amazing!", "I hate this product"],
        'sentiment': [1, 0]
    })
    
    # Initialize analyzer
    analyzer = SentimentAnalyzer()
    
    # Prepare and train model
    X_train_vec, X_test_vec, y_train, y_test = analyzer.prepare_data(
        df, 'text', 'sentiment'
    )
    analyzer.train(X_train_vec, y_train)
    
    # Evaluate model
    predictions = analyzer.evaluate(X_test_vec, y_test)
    
    # Example prediction
    sample_text = "This product is fantastic!"
    result = analyzer.predict(sample_text)
    print(f"\nSample prediction for '{sample_text}':")
    print(f"Sentiment: {result['sentiment']}")
    print(f"Confidence: {result['confidence']:.2f}")

if __name__ == "__main__":
    main()