In [2]:
nltk.download('punkt', download_dir=r'C:\nltk_data')
nltk.download('stopwords', download_dir=r'C:\nltk_data')
nltk.download('wordnet', download_dir=r'C:\nltk_data')
nltk.download('averaged_perceptron_tagger', download_dir=r'C:\nltk_data')

import nltk
nltk.data.path.append(r'C:\nltk_data')


[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to C:\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [4]:
# ===============================
# Portfolio-Ready NLP Sentiment Analysis Pipeline
# ===============================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, ngrams
import re
from collections import Counter
import os
import joblib

# -------------------------------
# 0. Setup
# -------------------------------
np.random.seed(42)
os.makedirs('outputs', exist_ok=True)

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# -------------------------------
# 1. Load Data
# -------------------------------
def load_data(file_path):
    df = pd.read_csv('Tweets.csv')
    print("Dataset loaded. Shape:", df.shape)
    return df

# -------------------------------
# 2. Clean Data
# -------------------------------
def clean_data(df):
    df = df[['text', 'airline_sentiment']].dropna()
    sentiment_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    df['airline_sentiment'] = df['airline_sentiment'].map(sentiment_map)
    print("Data cleaned. Shape:", df.shape)
    return df

# -------------------------------
# 3. Preprocess Text
# -------------------------------
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'@\w+|#\w+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF]', '', text)
    
    tokens = word_tokenize(text)
    custom_stopwords = set(stopwords.words('english')) - {'not', 'no', 'never', 'bad', 'good', 'great'}
    tokens = [t for t in tokens if t not in custom_stopwords]
    
    pos_tags = pos_tag(tokens)
    tokens = [w for w, pos in pos_tags if pos.startswith(('NN','VB','JJ','RB'))]
    
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    
    return ' '.join(tokens)

# -------------------------------
# 4. EDA
# -------------------------------
def perform_eda(df):
    # Sentiment distribution
    plt.figure(figsize=(8,6))
    sns.countplot(x='airline_sentiment', data=df)
    plt.xticks([0,1,2], ['Negative','Neutral','Positive'])
    plt.title('Sentiment Distribution')
    plt.savefig('outputs/sentiment_distribution.png', dpi=300)
    plt.close()
    
    # Tweet length
    df['text_length'] = df['text'].str.len()
    plt.figure(figsize=(8,6))
    sns.histplot(df, x='text_length', hue='airline_sentiment', multiple='stack', bins=30)
    plt.title('Tweet Length by Sentiment')
    plt.savefig('outputs/tweet_length_distribution.png', dpi=300)
    plt.close()
    
    # Word clouds
    for label, num in zip(['Negative','Neutral','Positive'], [0,1,2]):
        text = ' '.join(df[df['airline_sentiment']==num]['text'])
        wc = WordCloud(width=800, height=400, background_color='white', max_words=100).generate(text)
        plt.figure(figsize=(10,5))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.title(f'WordCloud: {label}')
        plt.savefig(f'outputs/wordcloud_{label.lower()}.png', dpi=300)
        plt.close()
    
    # Top 10 bigrams for negative sentiment
    negative_text = ' '.join(df[df['airline_sentiment']==0]['text'])
    tokens = word_tokenize(negative_text)
    bigrams = list(ngrams(tokens,2))
    bigram_freq = Counter(bigrams).most_common(10)
    labels = [' '.join(bg) for bg,_ in bigram_freq]
    counts = [count for _,count in bigram_freq]
    
    plt.figure(figsize=(10,6))
    sns.barplot(x=counts, y=labels)
    plt.title('Top 10 Bigrams (Negative)')
    plt.savefig('outputs/top_bigrams_negative.png', dpi=300)
    plt.close()

# -------------------------------
# 5. Feature Extraction
# -------------------------------
def extract_features(X_train, X_test):
    tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,3), min_df=2, max_df=0.95)
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)
    print("TF-IDF features extracted:", X_train_tfidf.shape)
    return X_train_tfidf, X_test_tfidf, tfidf

# -------------------------------
# 6. Train & Evaluate
# -------------------------------
def train_and_evaluate(model, name, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=['Negative','Neutral','Positive'])
    
    # Save report
    with open(f'outputs/{name.lower().replace(" ","_")}_report.txt','w') as f:
        f.write(f"{name} Accuracy: {acc:.4f}\n\n{report}")
    
    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=['Neg','Neu','Pos'], yticklabels=['Neg','Neu','Pos'], cmap='Blues')
    plt.title(f'Confusion Matrix ({name})')
    plt.savefig(f'outputs/confusion_matrix_{name.lower().replace(" ","_")}.png', dpi=300)
    plt.close()
    
    # Cross-validation
    cv = cross_val_score(model, X_train, y_train, cv=5)
    print(f"{name} Accuracy: {acc:.4f} | CV Mean: {cv.mean():.4f}")
    
    return model, acc

# -------------------------------
# 7. Feature Importance
# -------------------------------
def feature_importance(model, tfidf, name):
    features = tfidf.get_feature_names_out()
    
    if name == "Logistic Regression":
        coefs = model.coef_
        for i,label in enumerate(['Negative','Neutral','Positive']):
            top_idx = np.argsort(coefs[i])[-10:]
            top_feat = [features[idx] for idx in top_idx]
            top_coef = coefs[i][top_idx]
            plt.figure(figsize=(10,6))
            sns.barplot(x=top_coef, y=top_feat)
            plt.title(f'Top 10 Features ({label})')
            plt.savefig(f'outputs/feature_importance_lr_{label.lower()}.png', dpi=300)
            plt.close()
    elif name=="Random Forest":
        imp = model.feature_importances_
        top_idx = np.argsort(imp)[-10:]
        top_feat = [features[idx] for idx in top_idx]
        top_imp = imp[top_idx]
        plt.figure(figsize=(10,6))
        sns.barplot(x=top_imp, y=top_feat)
        plt.title('Top 10 Features (Random Forest)')
        plt.savefig('outputs/feature_importance_rf.png', dpi=300)
        plt.close()

# -------------------------------
# 8. Main Pipeline
# -------------------------------
def main(file_path):
    df = load_data(file_path)
    df = clean_data(df)
    df['cleaned_text'] = df['text'].apply(preprocess_text)
    perform_eda(df)
    
    X = df['cleaned_text']
    y = df['airline_sentiment']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    X_train_tfidf, X_test_tfidf, tfidf = extract_features(X_train, X_test)
    
    models = {
        "Logistic Regression": {
            "model": LogisticRegression(max_iter=1000, multi_class='multinomial'),
            "params": {'C':[0.1,1,10],'solver':['lbfgs','liblinear']}
        },
        "Random Forest": {
            "model": RandomForestClassifier(random_state=42),
            "params": {'n_estimators':[100,200],'max_depth':[10,20,None]}
        },
        "SVM": {
            "model": SVC(),
            "params": {'C':[0.1,1,10],'kernel':['linear','rbf']}
        }
    }
    
    best_acc = 0
    best_model = None
    best_name = ""
    
    for name, cfg in models.items():
        print(f"\nTraining {name}...")
        gs = GridSearchCV(cfg['model'], cfg['params'], cv=5, n_jobs=-1)
        gs.fit(X_train_tfidf, y_train)
        print(f"Best Params: {gs.best_params_}")
        
        model, acc = train_and_evaluate(gs.best_estimator_, name, X_train_tfidf, X_test_tfidf, y_train, y_test)
        
        if name in ["Logistic Regression","Random Forest"]:
            feature_importance(model, tfidf, name)
        
        if acc > best_acc:
            best_acc = acc
            best_model = model
            best_name = name
    
    print(f"\nBest Model: {best_name} | Accuracy: {best_acc:.4f}")
    
    # Save best model
    joblib.dump(best_model, 'outputs/best_model.pkl')
    print("Best model saved to outputs/best_model.pkl")

# -------------------------------
if __name__ == "__main__":
    main('airline_sentiment_tweets.csv')  # Replace with your CSV path


[nltk_data] Downloading package stopwords to C:\Users\HARSHIT
[nltk_data]     SHARMA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\HARSHIT
[nltk_data]     SHARMA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\HARSHIT
[nltk_data]     SHARMA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HARSHIT SHARMA\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Dataset loaded. Shape: (14640, 15)
Data cleaned. Shape: (14640, 2)


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\HARSHIT SHARMA/nltk_data'
    - 'c:\\Python312\\nltk_data'
    - 'c:\\Python312\\share\\nltk_data'
    - 'c:\\Python312\\lib\\nltk_data'
    - 'C:\\Users\\HARSHIT SHARMA\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - 'C:\\nltk_data'
**********************************************************************
