In [None]:
# Social Media Monitoring - NLP + Machine Learning
# Executable in Google Colab.
# Updated version: Allows uploading dataset directly in Colab.

# ------------------------------
# 1. Install required packages (run in Colab)
# ------------------------------
!pip install --quiet scikit-learn pandas matplotlib seaborn nltk wordcloud imbalanced-learn

# Optional (transformer model) - uncomment if you have GPU and want a transformer baseline
# !pip install --quiet transformers torch sentencepiece

# ------------------------------
# 2. Imports
# ------------------------------
import os
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# NLP helpers
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# For dealing with class imbalance (if present)
from imblearn.over_sampling import SMOTE

# ------------------------------
# 3. NLTK downloads (first run)
# ------------------------------
# Fetch every NLTK resource in one pass. Within this script the tokenizer
# data backs nltk.word_tokenize, 'stopwords' backs stopwords.words('english'),
# and the WordNet data backs WordNetLemmatizer.
# NOTE(review): 'vader_lexicon' is not used by any code visible here --
# presumably needed by a later cell; confirm before removing.
for _resource in (
    'punkt',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'vader_lexicon',
    'punkt_tab',
):
    nltk.download(_resource)

# ------------------------------
# 4. Upload dataset directly in Colab
# ------------------------------
from google.colab import files

print('Please upload your dataset CSV file...')
# files.upload() shows Colab's file picker; the result is indexed by the
# uploaded file names.
uploaded = files.upload()
if not uploaded:
    # A cancelled or empty upload would otherwise surface as a bare
    # IndexError on the filename lookup below.
    raise SystemExit('No file was uploaded. Re-run this cell and choose a CSV file.')

# Only the first uploaded file is used; any additional files are ignored.
filename = next(iter(uploaded))
print(f"\nUploaded file: {filename}")
df = pd.read_csv(filename)

print('\nDataset shape:', df.shape)
print('\nPreview:')
print(df.head())

# ------------------------------
# 5. Expected columns and quick fixes
# ------------------------------
# Accepted (lower-case) header names for the text and label columns.
possible_text_cols = ['text','tweet','content','message','post']
possible_label_cols = ['label','sentiment','target','class']


def _find_column(columns, candidates):
    """Return the first column whose normalized name is in *candidates*.

    Names are compared after str() conversion, whitespace stripping and
    lower-casing, so headers such as ' Label ' still match 'label'.
    Returns None when no column matches. The original (un-normalized)
    column name is returned so it can be used to index the DataFrame.
    """
    for col in columns:
        if str(col).strip().lower() in candidates:
            return col
    return None


text_col = _find_column(df.columns, possible_text_cols)
label_col = _find_column(df.columns, possible_label_cols)

if text_col is None or label_col is None:
    print('\nCould not auto-detect text/label columns. Detected columns:')
    print(list(df.columns))
    raise SystemExit('Please rename your text column to one of: ' + str(possible_text_cols) + '\nand label column to one of: ' + str(possible_label_cols))

# Keep only the two relevant columns under canonical names, then drop rows
# where either value is missing.
df = df[[text_col, label_col]].rename(columns={text_col: 'text', label_col: 'label'})
df = df.dropna(subset=['text','label']).reset_index(drop=True)

print('\nLabel distribution:')
print(df['label'].value_counts())

# ------------------------------
# 6. Basic EDA
# ------------------------------
print('\nNumber of examples:', len(df))
print('\nSample texts:')
# Series.sample(5) raises ValueError when fewer than 5 rows are available,
# so cap the sample size at the dataset size.
print(df['text'].sample(min(5, len(df))).values)

# Bar chart of how many examples each label has.
plt.figure(figsize=(6,4))
sns.countplot(x='label', data=df)
plt.title('Label distribution')
plt.show()

# ------------------------------
# 7. Preprocessing utilities
# ------------------------------
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Regex substitutions applied, in this order, by clean_text:
# (compiled pattern, replacement).
_CLEANING_STEPS = (
    (re.compile(r'https?://\S+|www\.\S+'), ' '),  # URLs -> space
    (re.compile(r'@\w+'), ' '),                   # @mentions -> space
    (re.compile(r'#(\w+)'), r'\1'),               # '#tag' -> 'tag'
    (re.compile(r'[^a-zA-Z0-9\s]'), ' '),         # remaining symbols -> space
)


def clean_text(text):
    """Normalize one raw post into a space-joined string of lemmatized tokens.

    The input is stringified and lower-cased, run through the substitutions
    in _CLEANING_STEPS, and tokenized with nltk.word_tokenize. Tokens that
    are in stop_words or are a single character long are dropped; the rest
    are lemmatized and joined with single spaces.
    """
    normalized = str(text).lower()
    for pattern, replacement in _CLEANING_STEPS:
        normalized = pattern.sub(replacement, normalized)

    kept_tokens = (
        token
        for token in nltk.word_tokenize(normalized)
        if len(token) > 1 and token not in stop_words
    )
    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)


print('\nCleaning texts (this may take a minute)...')
# Store the cleaned version alongside the raw text in a new column.
df['clean_text'] = df['text'].apply(clean_text)
print('Done.')

# ------------------------------
# 8. Train / Test split
# ------------------------------
# Features are the cleaned texts, targets are the labels. These must be
# defined here: nothing earlier in the script creates X or y, so the split
# below would otherwise fail with NameError on a fresh run.
X = df['clean_text']
y = df['label']

label_counts = y.value_counts()

# A stratified split needs at least two samples of every class; fall back
# to a plain random split when any class is too small.
if (label_counts < 2).any():
    print("⚠️ Some classes have fewer than 2 samples. Using non-stratified split.")
    stratify_option = None
else:
    stratify_option = y

# Hold out 20% for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=stratify_option, random_state=42
)

print('\nTrain size:', len(X_train), 'Test size:', len(X_test))
print('Label distribution in train set:')
print(y_train.value_counts())
print('\nLabel distribution in test set:')
print(y_test.value_counts())


# ------------------------------
# 9. Train and Evaluate Models
# ------------------------------
# Template vectorizer: unigrams + bigrams, vocabulary capped at 20000 terms.
# train_and_evaluate clones it so each pipeline owns its own fitted copy.
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2))

def train_and_evaluate(model_name, model):
    """Fit a TF-IDF + classifier pipeline and report its test-set metrics.

    Args:
        model_name: Display name used in the printed report and plot title.
        model: Unfitted scikit-learn classifier used as the final step.

    Returns:
        The fitted Pipeline with steps 'tfidf' and 'clf'.

    Reads the module-level X_train, y_train, X_test and y_test. Prints the
    accuracy and classification report, and shows a confusion-matrix
    heatmap.
    """
    # Local imports keep this block self-contained.
    from sklearn.base import clone
    from sklearn.utils.multiclass import unique_labels

    pipe = Pipeline([
        # Clone the template: passing the shared `tfidf` object itself would
        # make every returned pipeline alias one vectorizer that each later
        # fit() call re-fits in place.
        ('tfidf', clone(tfidf)),
        ('clf', model),
    ])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)

    print(f'\nModel: {model_name}')
    print('Accuracy:', accuracy_score(y_test, preds))
    print('Classification Report:\n', classification_report(y_test, preds))

    # Pin the label order explicitly so the heatmap tick labels line up with
    # the matrix rows/columns (same sorted order confusion_matrix defaults to).
    class_names = unique_labels(y_test, preds)
    cm = confusion_matrix(y_test, preds, labels=class_names)
    plt.figure(figsize=(6,5))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()
    return pipe

# Logistic Regression
# Each call fits one classifier on the training split, prints its metrics
# and returns the fitted pipeline.
lr_model = train_and_evaluate('Logistic Regression', LogisticRegression(max_iter=1000))

# Linear SVM (seeded so repeated runs give the same model, matching the
# fixed seed used for the train/test split)
svm_model = train_and_evaluate('Linear SVM', LinearSVC(max_iter=10000, random_state=42))

# Random Forest (seeded for the same reason; tree construction is randomized)
rf_model = train_and_evaluate('Random Forest', RandomForestClassifier(n_estimators=200, random_state=42))