# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Download NLTK resources (uncomment the following line if you haven't downloaded them yet)
# nltk.download('stopwords')

# Sample dataset: a handful of hand-written reviews kept in memory so the
# example runs without any external files.
# Each record is (review text, label). Label 1 marks a positive review;
# label 0 is shared by the negative and the neutral reviews.
_labelled_reviews = [
    ("I love this product! It's amazing.", 1),         # Positive
    ("Worst experience ever. I hate it.", 0),          # Negative
    ("Just okay, nothing special.", 0),                # Neutral
    ("Absolutely fantastic! Highly recommend.", 1),    # Positive
    ("Not worth the money.", 0),                       # Negative
    ("Will buy again, very satisfied!", 1),            # Positive
    ("Terrible service, I am very disappointed.", 0),  # Negative
    ("I'm happy with my purchase!", 1),                # Positive
    ("It was a decent experience.", 0),                # Neutral
    ("I'm not a fan of this product.", 0),             # Negative
]

# Column-oriented view of the records above.
data = {
    'text': [review_text for review_text, _ in _labelled_reviews],
    'sentiment': [label for _, label in _labelled_reviews],
}

# Tabular form used by the rest of the script.
df = pd.DataFrame(data)

# Show what we are working with.
print("Sample Dataset:")
print(df)

# Data Preprocessing
# Patterns are compiled once at import time instead of on every call.
# URLs must start at a word boundary and look like "http://", "https://" or
# "www."; an unanchored "www\S+" would also eat ordinary words that merely
# contain three w's in a row (e.g. "awwww").
_URL_RE = re.compile(r'\b(?:https?://|www\.)\S+')
# "@name" mentions are dropped entirely; for hashtags only the "#" sign is
# dropped, so the tag word itself is kept as a normal token.
_MENTION_OR_HASH_RE = re.compile(r'@\w+|#')
_DIGITS_RE = re.compile(r'\d+')
# Anything that is neither a word character nor whitespace. Note that "\w"
# includes the underscore, so underscores are kept.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')


def preprocess_text(text):
    """Normalize a raw review string before vectorization.

    Steps, in order: lowercase, strip URLs, strip @mentions and "#" signs,
    strip digit runs, strip punctuation. Whitespace is left untouched, so
    removed tokens may leave consecutive spaces behind.

    Args:
        text: The raw text to clean (must be a ``str``).

    Returns:
        The cleaned, lowercased text.
    """
    text = text.lower()
    for pattern in (_URL_RE, _MENTION_OR_HASH_RE, _DIGITS_RE, _PUNCTUATION_RE):
        text = pattern.sub('', text)
    return text

# Apply preprocessing
df['cleaned_text'] = df['text'].apply(preprocess_text)

# Split the dataset into training and testing sets.
# stratify=y keeps the class proportions in both splits. With so few rows an
# unstratified split can easily put a single class in the held-out set, which
# makes the metrics printed below degenerate.
X = df['cleaned_text']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature Extraction using TF-IDF.
# The vectorizer is fitted on the training split only and then reused to
# transform the test split.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Build the model (Logistic Regression)
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate the model.
# Passing the classes the model was trained on as `labels` keeps the matrix
# and the report the same shape even if a class is missing from y_test or
# y_pred; zero_division=0 reports 0 instead of emitting a warning when a
# class has no predicted samples.
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=model.classes_))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=model.classes_, zero_division=0))

accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.2f}")

# Function for predicting sentiment of new texts
def predict_sentiment(text):
    """Classify one raw review string and return a human-readable label.

    The text is cleaned with ``preprocess_text``, vectorized with the
    module-level fitted ``vectorizer`` and classified with the module-level
    fitted ``model``.

    Args:
        text: Raw review text.

    Returns:
        "Positive" if the predicted class is 1, "Negative" if it is 0, and
        "Neutral" for any other class value. The sample data in this file
        only uses labels 0 and 1, so the fallback is not expected to be hit
        with it.
    """
    cleaned_text = preprocess_text(text)
    text_tfidf = vectorizer.transform([cleaned_text])
    # predict() returns one entry per input row; take the single scalar
    # explicitly rather than relying on the truth value of a 1-element array.
    predicted_class = model.predict(text_tfidf)[0]
    if predicted_class == 1:
        return "Positive"
    if predicted_class == 0:
        return "Negative"
    return "Neutral"

# Demo: run the trained classifier on a few reviews it has not seen.
new_reviews = [
    "I absolutely love this!",
    "This is the worst thing I've ever bought.",
    "It's okay, not great but not bad either.",
]

print("\nSentiment Predictions for New Reviews:")
for review in new_reviews:
    predicted_label = predict_sentiment(review)
    print(f"Review: '{review}' - Sentiment: {predicted_label}")
