In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score


In [2]:
# Make sure the NLTK English stopword corpus is available, then cache it as a set.
# The corpus is only downloaded when it is missing, so re-running the notebook
# does not depend on network access once the data has been installed.
try:
    stopwords.words('english')
except LookupError:
    # Raised by NLTK when the corpus cannot be found in any nltk_data directory.
    nltk.download('stopwords')

# A set gives constant-time membership checks during stopword filtering.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [5]:
# Load Sentiment140 dataset
def load_data(filepath, sample_size=10000):
    """Load a labelled random sample of tweets from a header-less six-column CSV.

    The file is read with latin-1 encoding and the column names
    target, id, date, flag, user, text are assigned positionally; only
    ``target`` and ``text`` are kept.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file.
    sample_size : int or None, default 10000
        Number of rows to draw at random (fixed seed 42). If the file holds
        fewer rows, every row is returned instead of raising. ``None``
        disables sampling and returns all rows in file order.

    Returns
    -------
    pandas.DataFrame
        Columns ``target`` and ``text``. Target values 0 and 4 are replaced
        by "negative" and "positive"; any other value is left unchanged.
    """
    column_names = ["target", "id", "date", "flag", "user", "text"]

    # Parse only the two columns that are needed. This avoids loading the
    # unused columns and yields a fresh frame, so the assignment below does
    # not write into a slice of another DataFrame.
    df = pd.read_csv(
        filepath,
        encoding='latin-1',
        names=column_names,
        usecols=['target', 'text'],
    )

    # Convert sentiment labels (0 = negative, 4 = positive)
    df['target'] = df['target'].replace({0: "negative", 4: "positive"})

    # Sample a smaller dataset for training; never request more rows than exist.
    if sample_size is not None:
        df = df.sample(min(sample_size, len(df)), random_state=42)

    return df

In [6]:
# Preprocess text (remove special characters, stopwords, etc.)
def preprocess_text(text, stopword_set=None):
    """Normalise a tweet into a space-separated string of lowercase words.

    Steps, in order: lowercase, strip URLs (``http...`` / ``www...``), strip
    ``@mentions``, replace every non-alphabetic, non-whitespace character with
    a space, then drop stopwords.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. ``None`` or NaN) yields
        an empty string.
    stopword_set : collection of str, optional
        Words to remove. Defaults to the module-level ``stop_words``.

    Returns
    -------
    str
        The remaining words joined by single spaces (possibly empty).
    """
    if not isinstance(text, str):
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    text = text.lower()  # Lowercase text

    # Removed spans are replaced by a space rather than deleted so that the
    # words on either side stay separate tokens ("good,bad" -> "good bad").
    # This also splits contractions ("don't" -> "don t") into pieces that a
    # stopword list can match.
    text = re.sub(r'http\S+|www\S+', ' ', text)  # Remove URLs
    text = re.sub(r'@\w+', ' ', text)  # Remove mentions
    text = re.sub(r'[^a-z\s]', ' ', text)  # Remove non-alphabetic characters (text is already lowercase)

    # split() with no argument collapses the runs of whitespace introduced above.
    return ' '.join(word for word in text.split() if word not in stopword_set)  # Remove stopwords

In [7]:
# Location of the Sentiment140 CSV file.
data_path = "training.1600000.processed.noemoticon.csv"  # Change this to your dataset path

# Read the labelled sample and clean every tweet's text.
df = load_data(data_path)
df['text'] = df['text'].apply(preprocess_text)

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['target'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features (vocabulary capped at 5000 terms) feeding a multinomial Naive Bayes classifier.
model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('classifier', MultinomialNB()),
])

# Fit on the training split only.
model.fit(X_train, y_train)

# Score the held-out split and report accuracy to two decimals.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.72


In [8]:
# Function to predict sentiment for new input text
def predict_sentiment(text):
    """Return the label the trained ``model`` predicts for one raw text.

    The text is passed through ``preprocess_text`` first, then wrapped in a
    one-element list because ``model.predict`` is called on a batch; the
    first (only) prediction is returned.
    """
    cleaned = preprocess_text(text)
    return model.predict([cleaned])[0]

# Test with user input: keep prompting until the user types 'exit'.
while True:
    try:
        # Strip surrounding whitespace so that " exit " still quits and
        # whitespace-only input is recognised as empty.
        user_text = input("\nEnter a tweet (or 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the prompt was interrupted: stop cleanly
        # instead of ending the cell with a traceback.
        break
    if user_text.lower() == 'exit':
        break
    if not user_text:
        # Nothing to classify; ask again rather than predicting on empty text.
        print("Please enter some text.")
        continue
    sentiment = predict_sentiment(user_text)
    print(f"Predicted Sentiment: {sentiment}")

Predicted Sentiment: negative
Predicted Sentiment: positive
Predicted Sentiment: negative
