In [5]:
!pip install nltk
import nltk
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [7]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fetch the NLTK resources used below: the tokenizer models, the stop-word
# lists, and the WordNet database used by the lemmatizer.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)


# Read the reviews into `df`. When the CSV is missing, report the problem and
# continue with an empty frame that still has the two columns the rest of the
# notebook reads, so later steps can detect "no data" instead of crashing here.
fallback_columns = ['Review Text', 'Sentiment']
try:
    df = pd.read_csv("reviews.csv")
except FileNotFoundError:
    df = pd.DataFrame(columns=fallback_columns)
    print("Error: 'reviews.csv' not found. Please ensure the file is present in the current directory.")

# Preprocessing resources shared by preprocess_text:
#   stop_words - English stop words held in a set for fast membership tests
#   lemmatizer - WordNet-based lemmatizer applied to every kept token
english_stop_word_list = stopwords.words('english')
stop_words = set(english_stop_word_list)
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one review into a space-separated string of lemmas.

    The text is lower-cased and tokenised; tokens that are not purely
    alphanumeric or that appear in ``stop_words`` are discarded, and the
    remaining tokens are lemmatised and joined with single spaces.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example
            a NaN from a missing CSV cell) is treated as "no text".

    Returns:
        The cleaned review, or an empty string for non-string input.
    """
    # Guard clause: missing values arrive as floats (NaN), not strings.
    if not isinstance(text, str):
        return ""

    kept_lemmas = []
    for token in word_tokenize(text.lower()):
        if token.isalnum() and token not in stop_words:
            kept_lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(kept_lemmas)


# Clean every review; non-string entries come back as empty strings.
df['Processed_Review'] = df['Review Text'].apply(preprocess_text)

# Skip training when there is no usable text. This covers both a frame with no
# rows (e.g. the CSV was missing) and a frame whose reviews were all reduced to
# empty strings, which would otherwise make TfidfVectorizer fail with an
# empty-vocabulary error. (`.all()` on an empty Series is True.)
if df['Processed_Review'].eq("").all():
    print("Error: The 'Processed_Review' column is empty after preprocessing. Check your preprocessing steps or provide data in reviews.csv.")
else:
    y = df['Sentiment']

    # Split the text BEFORE vectorizing so the TF-IDF vocabulary and IDF
    # weights are learned from the training fold only; fitting on the full
    # corpus would leak test-set statistics into training.
    text_train, text_test, y_train, y_test = train_test_split(
        df['Processed_Review'], y, test_size=0.2, random_state=42
    )

    # Text Vectorization
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(text_train)
    X_test = vectorizer.transform(text_test)  # transform only: no refit on test data

    # Model Training
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Model Evaluation
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # Assuming 'positive' is the positive class for precision, recall, and F1-score
    # If your dataset has different labels, update 'pos_label' accordingly
    precision = precision_score(y_test, y_pred, pos_label='positive')
    recall = recall_score(y_test, y_pred, pos_label='positive')
    f1 = f1_score(y_test, y_pred, pos_label='positive')

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")

    # Insights (Example - You can expand this)
    # y_test.index holds index *labels* of the held-out rows, so select them
    # with .loc (label-based); .iloc would treat them as positions and pick the
    # wrong rows whenever df does not have a default 0..n-1 index.
    test_rows = df.loc[y_test.index]
    # y_pred is a plain array in the same order as the test rows, so compare
    # against the underlying values to avoid any index alignment.
    is_correct = test_rows['Sentiment'].to_numpy() == y_pred
    correct_predictions = test_rows[is_correct]
    incorrect_predictions = test_rows[~is_correct]
    print("\nCorrect Predictions:")
    print(correct_predictions.head())
    print("\nIncorrect Predictions:")
    print(incorrect_predictions.head())

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-Score: 1.0

Correct Predictions:
                                   Review Text Sentiment  \
8  Great value for the price. Works perfectly!  positive   
1       Terrible quality. Broke within a week.  negative   

                     Processed_Review  
8    great value price work perfectly  
1  terrible quality broke within week  

Incorrect Predictions:
Empty DataFrame
Columns: [Review Text, Sentiment, Processed_Review]
Index: []


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
