<a href="https://colab.research.google.com/github/Nachi2006/Sentiment_Analysis/blob/main/Sentiment_analyser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import re

def preprocess_text(text):
    """
    Normalise raw text into space-separated lowercase alphabetic tokens.

    The input is coerced to ``str`` and lowercased; every maximal run of
    the letters a-z is kept as a token and everything else (digits,
    punctuation, whitespace, non-ASCII characters) acts as a separator.
    Tokens are joined by single spaces, so the result has no leading,
    trailing or repeated whitespace.
    """
    lowered = str(text).lower()
    # Extracting the a-z runs directly is equivalent to blanking out all
    # other characters and then collapsing the whitespace.
    tokens = re.findall(r'[a-z]+', lowered)
    return ' '.join(tokens)

def create_lexicon():
    """
    Build the hand-written sentiment word lists.

    Returns a ``(positive, negative, neutral)`` tuple of sets containing
    lowercase words.
    """
    positive = set(
        'good great awesome excellent happy love wonderful fantastic best amazing'.split()
    )
    negative = set(
        'bad terrible awful horrible hate worst poor disappointing disappointed'.split()
    )
    neutral = set('okay ok fine average neutral fair moderate'.split())
    return positive, negative, neutral

def add_lexicon_features(X, texts):
    """
    Append lexicon-based sentiment features to a feature matrix.

    For every text three extra columns are appended to the right of X:
    the number of *distinct* positive, negative and neutral lexicon words
    found in that text (a word repeated within one text is counted once,
    because the text's tokens are compared as a set).

    Parameters:
        X: feature matrix with one row per text. Either a sparse matrix
           exposing ``toarray()`` (such as CountVectorizer output) or
           anything ``np.asarray`` can turn into a 2-D array.
        texts: iterable of whitespace-tokenisable strings, in the same
           order as the rows of X.

    Returns:
        Dense ``numpy.ndarray`` with the columns of X followed by the
        positive, negative and neutral count columns.

    Raises:
        ValueError: if the number of texts differs from the number of
        rows in X.
    """
    positive_words, negative_words, neutral_words = create_lexicon()

    # Sparse matrices are densified; dense inputs are accepted as they are
    # instead of failing on a missing ``toarray`` attribute.
    dense_X = X.toarray() if hasattr(X, "toarray") else np.asarray(X)

    # Count sentiment words in each text
    positive_counts = []
    negative_counts = []
    neutral_counts = []

    for text in texts:
        words = set(text.split())
        positive_counts.append(len(words & positive_words))
        negative_counts.append(len(words & negative_words))
        neutral_counts.append(len(words & neutral_words))

    # Checked after the loop so that ``texts`` may be any iterable,
    # including one without a length.
    if len(positive_counts) != dense_X.shape[0]:
        raise ValueError(
            f"X has {dense_X.shape[0]} rows but {len(positive_counts)} texts were given"
        )

    # Add these as new features
    X_with_lexicon = np.c_[
        dense_X,
        np.array(positive_counts).reshape(-1, 1),
        np.array(negative_counts).reshape(-1, 1),
        np.array(neutral_counts).reshape(-1, 1)
    ]

    return X_with_lexicon

# Load your dataset (expects 'Sentence' and 'Sentiment' columns, see below)
data = pd.read_csv('/content/data.csv')

# Print dataset information
print("Dataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() emitted a stray "None".
data.info()
print("\nFirst few rows of the dataset:")
print(data.head())
print("\nColumn names:", data.columns.tolist())


# Preprocess the text data
print("\nPreprocessing text data...")
X_text = data['Sentence'].apply(preprocess_text)
y = data['Sentiment']

# Split the data: 20% held out for testing, stratified on the label with a
# fixed seed so the split is reproducible
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

# Create and fit the vectorizer (unigrams and bigrams, capped vocabulary)
print("\nVectorizing text data...")
vectorizer = CountVectorizer(
    max_features=3000,
    ngram_range=(1, 2),
    min_df=2
)

# Transform text data; the vocabulary is learned from the training split
# only and then reused unchanged on the test split
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Add lexicon features
X_train_with_lexicon = add_lexicon_features(X_train, X_train_text)
X_test_with_lexicon = add_lexicon_features(X_test, X_test_text)

# Train the model
print("\nTraining the model...")
model = MultinomialNB(alpha=0.1)  # Slightly reduced smoothing
model.fit(X_train_with_lexicon, y_train)

# Make predictions
y_pred = model.predict(X_test_with_lexicon)

# Print model performance
print("\nModel Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred))

def predict_sentiment(text):
    """
    Classify a single piece of text with the trained model.

    The text goes through the same pipeline as the training data:
    preprocessing, vectorisation with the module-level ``vectorizer``,
    and the extra lexicon count columns.

    Returns a ``(label, confidence)`` pair, where ``confidence`` is the
    largest class probability reported by the module-level ``model``.
    """
    cleaned = preprocess_text(text)
    batch = [cleaned]  # the pipeline works on collections of documents

    features = add_lexicon_features(vectorizer.transform(batch), batch)

    label = model.predict(features)[0]
    class_probabilities = model.predict_proba(features)[0]

    return label, max(class_probabilities)

# Demo: classify a few hand-written sentences with the trained model
if __name__ == "__main__":
    test_sentences = [
        "I love this product, it's amazing!",
        "The movie was okay, nothing special.",
        "The customer service was the worst and I hated it.",
    ]

    print("\nTesting with example sentences:")
    for sentence in test_sentences:
        sentiment, confidence = predict_sentiment(sentence)
        # One call, three lines; the trailing "\n" in the last field leaves
        # a blank line between consecutive results.
        print(
            f"Sentence: {sentence}",
            f"Predicted Sentiment: {sentiment}",
            f"Confidence: {confidence:.2f}\n",
            sep="\n",
        )


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5842 entries, 0 to 5841
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentence   5842 non-null   object
 1   Sentiment  5842 non-null   object
dtypes: object(2)
memory usage: 91.4+ KB
None

First few rows of the dataset:
                                            Sentence Sentiment
0  The GeoSolutions technology will leverage Bene...  positive
1  $ESI on lows, down $1.50 to $2.50 BK a real po...  negative
2  For the last quarter of 2010 , Componenta 's n...  positive
3  According to the Finnish-Russian Chamber of Co...   neutral
4  The Swedish buyout firm has sold its remaining...   neutral

Column names: ['Sentence', 'Sentiment']

Preprocessing text data...

Vectorizing text data...

Training the model...

Model Performance:
Accuracy: 0.6852010265183918

Detailed Classification Report:
              precision    recall  f1-score   support

    negative 