In [1]:
import time
import warnings

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

warnings.filterwarnings('ignore')

In [2]:
# Load and preprocess data
def load_data(filepath):
    """Read the tweet CSV and return its text and integer-encoded sentiment.

    Parameters
    ----------
    filepath : str or path-like
        Location of a CSV file containing at least the columns ``text`` and
        ``airline_sentiment``.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``(text, sentiment)`` where ``sentiment`` holds the integer codes
        produced by ``LabelEncoder`` for the ``airline_sentiment`` column.

    Notes
    -----
    The fitted encoder is local to this function and is not returned, so the
    mapping from integer code back to the original label is not available to
    the caller.
    """
    frame = pd.read_csv(filepath)

    # Swap the sentiment labels for their integer codes inside the frame.
    encoded_labels = LabelEncoder().fit_transform(frame['airline_sentiment'])
    frame['airline_sentiment'] = encoded_labels

    texts = frame['text']
    labels = frame['airline_sentiment']
    return texts, labels

In [3]:
# Create different TF-IDF configurations
def create_tfidf_configs():
    """Build the TF-IDF vectorizer variants compared in the experiment.

    Returns
    -------
    list of dict
        One dict per variant, in a fixed order, with the keys ``'name'``
        (label used in the results table) and ``'vectorizer'`` (a new,
        unfitted ``TfidfVectorizer``). Every call creates fresh vectorizer
        instances.
    """
    # (label, TfidfVectorizer keyword arguments) for each variant.
    variants = (
        ('min_df_5', {'min_df': 5}),
        ('2500_features', {'max_features': 2500}),
        ('500_features', {'max_features': 500}),
    )
    return [
        {'name': label, 'vectorizer': TfidfVectorizer(**options)}
        for label, options in variants
    ]

In [4]:
# Create classifiers
def create_classifiers():
    """Build the classifiers compared in the experiment.

    Returns
    -------
    list of dict
        One dict per classifier, in a fixed order, with the keys ``'name'``
        (label used in the results table) and ``'model'`` (a new, unfitted
        scikit-learn estimator). All estimators share ``random_state=42``;
        the iterative ones are capped at 1000 iterations.
    """
    seed = 42
    iteration_cap = 1000

    named_estimators = (
        (
            'Logistic Regression',
            LogisticRegression(max_iter=iteration_cap, random_state=seed),
        ),
        (
            'LinearSVC',
            LinearSVC(max_iter=iteration_cap, random_state=seed),
        ),
        (
            'Random Forest',
            RandomForestClassifier(n_estimators=100, random_state=seed),
        ),
        (
            'Neural Network',
            MLPClassifier(
                hidden_layer_sizes=(100,),
                max_iter=iteration_cap,
                random_state=seed,
            ),
        ),
    )
    return [
        {'name': label, 'model': estimator}
        for label, estimator in named_estimators
    ]

In [5]:
# Evaluate models using cross-validation
def evaluate_model(model, X, y):
    """Cross-validate ``model`` on ``(X, y)`` and summarize the fold results.

    Runs 5-fold ``cross_validate`` with accuracy and macro-averaged F1 as
    metrics, using all available cores (``n_jobs=-1``).

    Parameters
    ----------
    model : estimator
        Estimator passed straight to ``cross_validate``.
    X : features accepted by ``model``
    y : target labels

    Returns
    -------
    dict
        ``'accuracy'``, ``'f1_score'`` and ``'fit_time'``, each the mean of
        the corresponding per-fold values.
    """
    fold_results = cross_validate(
        model,
        X,
        y,
        scoring={'accuracy': 'accuracy', 'f1_macro': 'f1_macro'},
        cv=5,
        n_jobs=-1,
        return_train_score=False,
    )

    # Output key -> key under which cross_validate reports the per-fold array.
    reported_as = {
        'accuracy': 'test_accuracy',
        'f1_score': 'test_f1_macro',
        'fit_time': 'fit_time',
    }
    return {
        out_key: fold_results[cv_key].mean()
        for out_key, cv_key in reported_as.items()
    }

In [None]:
# Load the dataset: X_text is the 'text' column, y the integer-encoded
# 'airline_sentiment' column. The CSV path is relative to the working directory.
X_text, y = load_data('Twitter_US_Airline_Sentiment.csv')

In [None]:
# Get configurations: the experiment grid is every TF-IDF variant crossed
# with every classifier. Both helpers return lists of dicts keyed by 'name'
# plus 'vectorizer' / 'model' respectively.
tfidf_configs = create_tfidf_configs()
classifiers = create_classifiers()

In [None]:
# Store results: one dict per (TF-IDF config, classifier) pair is appended
# by the experiment loop; the list is later turned into a DataFrame.
results = []

In [None]:
# Run experiments: every TF-IDF configuration is paired with every classifier.
#
# The vectorizer and the classifier are chained into one pipeline and the RAW
# text is handed to cross-validation, so the vocabulary and IDF weights are
# learned from the training part of each fold only. Fitting the vectorizer on
# the whole corpus before splitting would let held-out tweets shape the
# features and bias the reported scores.
results = []  # reset here so re-running this cell never duplicates rows

for tfidf_config in tfidf_configs:
    print(f"\nProcessing {tfidf_config['name']} configuration...")

    for clf in classifiers:
        print(f"Evaluating {clf['name']}...")

        # cross_validate works on clones of the estimator it receives, so the
        # shared vectorizer/classifier objects stay unfitted and can be reused
        # in the next combination.
        pipeline = make_pipeline(tfidf_config['vectorizer'], clf['model'])

        # Evaluate the full text -> features -> prediction pipeline.
        # Note: 'Fit Time' therefore covers vectorization + classifier training.
        scores = evaluate_model(pipeline, X_text, y)

        # Store results
        results.append({
            'TF-IDF Config': tfidf_config['name'],
            'Classifier': clf['name'],
            'Accuracy': scores['accuracy'],
            'F1-Score': scores['f1_score'],
            'Fit Time': scores['fit_time']
        })

In [None]:
# Create results DataFrame: one row per result dict, with the columns
# 'TF-IDF Config', 'Classifier', 'Accuracy', 'F1-Score' and 'Fit Time'.
results_df = pd.DataFrame(results)

In [None]:
# Display results as a plain-text table; index=False leaves out the
# DataFrame's row index.
print("\nResults:")
print(results_df.to_string(index=False))

In [None]:
# Save results to CSV (path relative to the working directory), without the
# row index, then confirm the file name to the user.
results_df.to_csv('sentiment_analysis_results.csv', index=False)
print("\nResults saved to 'sentiment_analysis_results.csv'")