In [24]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import nltk
import warnings

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Download only essential NLTK data
# NOTE(review): newer NLTK releases may load the tokenizer from the
# 'punkt_tab' resource instead of 'punkt' - confirm against the installed version.
print("Downloading NLTK data...")
try:
    # nltk.download() normally reports a failed download by returning False
    # instead of raising, so the return value must be inspected; otherwise the
    # warning below is never shown when a resource is missing.
    failed_packages = [
        package
        for package in ('punkt', 'stopwords')
        if not nltk.download(package, quiet=True)
    ]
    if failed_packages:
        raise RuntimeError(f"could not download: {', '.join(failed_packages)}")
except Exception as e:
    print(f"Warning: NLTK download failed - {str(e)}")
    print("Continuing with basic preprocessing...")

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


Downloading NLTK data...


In [25]:
def load_data(file_path):
    """Read the raw tweet CSV and return its text with binary labels.

    The file is parsed as latin-1 CSV whose six columns are named
    target, id, date, flag, user and text. Values in ``target`` are
    remapped 0 -> 0 and 4 -> 1; any other value becomes NaN.

    Returns a DataFrame holding only the ``text`` and ``target`` columns,
    in that order.
    """
    label_map = {0: 0, 4: 1}
    raw = pd.read_csv(
        file_path,
        names=['target', 'id', 'date', 'flag', 'user', 'text'],
        encoding='latin-1',
    )
    raw['target'] = raw['target'].map(label_map)
    return raw[['text', 'target']]


In [26]:
# Stop-word sets keyed by language, filled lazily on first successful load so
# the NLTK corpus is not re-read for every tweet.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop-word set for ``language``, loading it only once."""
    cached = _STOP_WORDS_CACHE.get(language)
    if cached is None:
        # A failed lookup raises before anything is stored, so a later call
        # retries instead of caching the failure.
        cached = frozenset(stopwords.words(language))
        _STOP_WORDS_CACHE[language] = cached
    return cached


def preprocess_text(text):
    """Clean and preprocess text data without lemmatization.

    Steps, in order: convert to ``str``, lowercase and strip; remove URLs,
    ``@mentions`` and ``#hashtags`` (the whole tag, not just the symbol);
    remove punctuation and digits; collapse whitespace; tokenize with
    ``word_tokenize``; drop English stop words.

    Args:
        text: The raw text. Any object is accepted; it is passed through
            ``str()`` first.

    Returns:
        The remaining tokens joined by single spaces. If any step raises,
        the error is printed and ``text`` as cleaned up to the failing step
        is returned instead.
    """
    try:
        # Basic cleaning
        text = str(text).lower().strip()

        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

        # Remove user mentions and hashtags
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'#\w+', '', text)

        # Remove special characters and digits (\w keeps underscores)
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d+', '', text)

        # Remove extra whitespace
        text = ' '.join(text.split())

        # Tokenization
        tokens = word_tokenize(text)

        # Remove stopwords; the set is built once and reused across calls
        stop_words = _get_stop_words('english')
        tokens = [token for token in tokens if token not in stop_words]

        return ' '.join(tokens)
    except Exception as e:
        print(f"Error in preprocessing: {str(e)}")
        return text


In [27]:
def create_pipeline(file_path, sample_size=None, batch_size=10000):
    """Create and train the sentiment analysis pipeline.

    Loads the dataset with ``load_data``, optionally samples it, cleans every
    tweet with ``preprocess_text``, splits 80/20 into train and test sets,
    fits a TF-IDF vectorizer and a logistic-regression classifier on the
    training part, and prints a classification report for the test part.

    Args:
        file_path: Location of the CSV file passed to ``load_data``.
        sample_size: If truthy, number of rows to sample (seeded with 42)
            before processing; otherwise every row is used.
        batch_size: Number of tweets preprocessed between progress messages.

    Returns:
        A ``(vectorizer, model)`` tuple with the fitted ``TfidfVectorizer``
        and ``LogisticRegression`` instances.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    print("Loading data...")
    df = load_data(file_path)

    if sample_size:
        df = df.sample(n=sample_size, random_state=42)

    print(f"Processing {len(df)} tweets...")

    # Ceiling division: an exact multiple of batch_size must not count an
    # extra, empty batch (100000 rows / 10000 is 10 batches, not 11).
    total_batches = (len(df) + batch_size - 1) // batch_size
    processed_texts = []

    # Process in batches so progress can be reported
    for batch_num, start in enumerate(range(0, len(df), batch_size), start=1):
        batch = df['text'].iloc[start:start + batch_size]
        processed_texts.extend(preprocess_text(text) for text in batch)
        print(f"Processed batch {batch_num}/{total_batches}")

    # Build a new frame rather than adding a column in place to the frame
    # returned by load_data / sample.
    df = df.assign(processed_text=processed_texts)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        df['processed_text'],
        df['target'],
        test_size=0.2,
        random_state=42
    )

    print("Vectorizing texts...")
    vectorizer = TfidfVectorizer(
        max_features=10000,
        ngram_range=(1, 2),
        min_df=5
    )
    # Fit the vocabulary on the training split only, then reuse it for test
    X_train_vectors = vectorizer.fit_transform(X_train)
    X_test_vectors = vectorizer.transform(X_test)

    print("Training model...")
    model = LogisticRegression(max_iter=1000, C=1.0, class_weight='balanced')
    model.fit(X_train_vectors, y_train)

    # Evaluate
    y_pred = model.predict(X_test_vectors)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    return vectorizer, model


In [28]:
def predict_sentiment(text, vectorizer, model):
    """Classify a single piece of text with a fitted vectorizer and model.

    The text is cleaned with ``preprocess_text``, vectorized, and passed to
    the model for both a label and class probabilities.

    Returns a dict with:
        sentiment: "Positive" when the predicted label equals 1, otherwise
            "Negative".
        confidence: The probability at index 1 for a positive prediction,
            or at index 0 otherwise.
        processed_text: The cleaned text that was fed to the vectorizer.
    """
    cleaned = preprocess_text(text)
    features = vectorizer.transform([cleaned])
    label = model.predict(features)[0]
    class_probs = model.predict_proba(features)[0]

    if label == 1:
        sentiment, confidence = "Positive", class_probs[1]
    else:
        sentiment, confidence = "Negative", class_probs[0]

    return dict(
        sentiment=sentiment,
        confidence=confidence,
        processed_text=cleaned,
    )


In [29]:
if __name__ == "__main__":
    # Training CSV at its Kaggle input path.
    file_path = "/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv"

    # Train on a 100,000-row sample rather than the whole file.
    vectorizer, model = create_pipeline(file_path, sample_size=100000)

    # Hand-written sentences used as a quick check of the fitted model.
    test_texts = [
        "I love this product! It's amazing!",
        "This is the worst experience ever.",
        "The weather is nice today.",
    ]

    print("\nExample Predictions:")
    for text in test_texts:
        result = predict_sentiment(text, vectorizer, model)
        # One print call, one field per line; the leading blank line
        # separates consecutive examples.
        print(
            f"\nText: {text}",
            f"Sentiment: {result['sentiment']}",
            f"Confidence: {result['confidence']:.2f}",
            sep="\n",
        )


Loading data...
Processing 100000 tweets...
Processed batch 1/11
Processed batch 2/11
Processed batch 3/11
Processed batch 4/11
Processed batch 5/11
Processed batch 6/11
Processed batch 7/11
Processed batch 8/11
Processed batch 9/11
Processed batch 10/11
Vectorizing texts...
Training model...

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.75      0.76      9995
           1       0.76      0.78      0.77     10005

    accuracy                           0.77     20000
   macro avg       0.77      0.77      0.77     20000
weighted avg       0.77      0.77      0.77     20000


Example Predictions:

Text: I love this product! It's amazing!
Sentiment: Positive
Confidence: 0.94

Text: This is the worst experience ever.
Sentiment: Negative
Confidence: 0.75

Text: The weather is nice today.
Sentiment: Positive
Confidence: 0.61
