In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

# Define BASE_FOLDER globally
BASE_FOLDER = "data"

def make_dir():
    """Ensure the raw-data directory tree exists under ``BASE_FOLDER``.

    Creates ``raw``, ``raw/train`` and ``raw/test``. Directories that already
    exist are left untouched, so the function is safe to call repeatedly.
    """
    for relative_path in ("raw", "raw/train", "raw/test"):
        target = os.path.join(BASE_FOLDER, relative_path)
        os.makedirs(target, exist_ok=True)

def load_data(url):
    """Read a CSV source into a DataFrame.

    Args:
        url: Location (or file-like object) passed straight to
            ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(url)
    return frame

def processing(df):
    """Reduce the raw tweets to a shuffled binary-sentiment frame.

    Steps, in order:
      1. keep only rows whose ``sentiment`` is "happiness" or "sadness";
      2. drop the ``tweet_id`` column when it is present;
      3. encode the label as 1 (happiness) / 0 (sadness);
      4. discard rows containing missing values;
      5. shuffle the rows with a fixed seed.

    Args:
        df: DataFrame with at least a ``sentiment`` column.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    wanted_labels = ["happiness", "sadness"]
    label_codes = {"happiness": 1, "sadness": 0}

    subset = df.loc[df["sentiment"].isin(wanted_labels)]
    # errors="ignore" keeps this step a no-op when the id column is absent.
    subset = subset.drop(columns=["tweet_id"], errors="ignore")
    subset = subset.assign(sentiment=subset["sentiment"].map(label_codes))
    subset = subset.dropna()
    # frac=1 returns every row, in a seeded random order.
    return subset.sample(frac=1, random_state=42)

def save_data(final_df):
    """Split the frame 80/20 and write both parts as CSV under ``BASE_FOLDER``.

    The training part goes to ``raw/train/train.csv`` and the test part to
    ``raw/test/test.csv``. The split is seeded, and row indices are not
    written to the files.

    Args:
        final_df: The processed DataFrame to split and persist.
    """
    train_df, test_df = train_test_split(final_df, test_size=0.2, random_state=42)

    outputs = (
        (train_df, "raw/train", "train.csv"),
        (test_df, "raw/test", "test.csv"),
    )
    for frame, sub_dir, file_name in outputs:
        frame.to_csv(os.path.join(BASE_FOLDER, sub_dir, file_name), index=False)

def main():
    """Run the ingestion stage: create folders, download, process, save."""
    source_url = "https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv"

    make_dir()
    raw_df = load_data(source_url)
    save_data(processing(raw_df))

# Run the ingestion pipeline only when this code executes as the main module.
if __name__ == "__main__":
    main()

In [3]:
import re
# NLTK for natural language processing
import nltk
from nltk.corpus import stopwords    # For stopwords
from nltk.tokenize import word_tokenize # For tokenization
from nltk.stem import PorterStemmer, WordNetLemmatizer # For stemming and lemmatization

# Download the NLTK resources requested by this notebook.
nltk.download('stopwords')   # Stopword lists, read by remove_stopwords()
nltk.download('punkt')       # Tokenizer data
nltk.download('wordnet')     # WordNet data, needed by WordNetLemmatizer in lemmatization()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/santoshkumar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/santoshkumar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/santoshkumar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
def data_cleaning(text_series):
    """Lower-case the text and strip URLs, e-mail addresses, digits and punctuation.

    Each removed span is replaced by a single space so neighbouring words do
    not fuse together. The text is then stripped and runs of whitespace are
    collapsed to one space.

    Args:
        text_series: ``pandas.Series`` of raw text. Values are cast to ``str``
            first, so non-string values are cleaned as their string form.

    Returns:
        A ``pandas.Series`` of cleaned strings with the same index.
    """
    # Plain ``\d+`` removes every digit run. The previous look-around pattern
    # required a neighbouring non-digit and therefore left strings made only
    # of digits (e.g. "2023") untouched.
    number_pattern = r"\d+"
    url_pattern = r"https?://\S+|www\.\S+"
    email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    punctuation_pattern = r"[^\w\s]"

    return (
        text_series.astype(str)  # Ensure text is string
        .str.lower()
        # URLs and e-mails go first, while their punctuation is still intact.
        .str.replace(url_pattern, " ", regex=True)
        .str.replace(email_pattern, " ", regex=True)
        .str.replace(number_pattern, " ", regex=True)
        .str.replace(punctuation_pattern, " ", regex=True)
        .str.strip()
        .str.replace(r"\s+", " ", regex=True)  # Normalize spaces
    )

def remove_short_words(text_series, min_length=3):
    """Drop every whitespace-separated token shorter than ``min_length``.

    Args:
        text_series: ``pandas.Series`` of strings.
        min_length: Minimum number of characters a token needs to be kept.

    Returns:
        A ``pandas.Series`` whose strings contain only the surviving tokens,
        joined by single spaces.
    """
    def _keep_long_tokens(text):
        return " ".join(token for token in text.split() if len(token) >= min_length)

    return text_series.apply(_keep_long_tokens)

def lemmatization(text_series):
    """Lemmatize each whitespace-separated token as a verb.

    Args:
        text_series: ``pandas.Series`` of strings.

    Returns:
        A ``pandas.Series`` in which every token has been passed through
        ``WordNetLemmatizer.lemmatize(token, pos="v")`` and re-joined with
        single spaces.
    """
    wordnet = WordNetLemmatizer()

    def _lemmatize_text(text):
        lemmas = []
        for token in text.split():
            lemmas.append(wordnet.lemmatize(token, pos="v"))
        return " ".join(lemmas)

    return text_series.apply(_lemmatize_text)

def remove_stopwords(text_series):
    """Remove NLTK English stopwords from each string.

    Args:
        text_series: ``pandas.Series`` of strings.

    Returns:
        A ``pandas.Series`` whose strings keep only tokens that are not in
        NLTK's English stopword list, joined by single spaces.
    """
    # Built once per call; a frozenset gives constant-time membership tests.
    english_stopwords = frozenset(stopwords.words("english"))

    def _drop_stopwords(text):
        kept = filter(lambda token: token not in english_stopwords, text.split())
        return " ".join(kept)

    return text_series.apply(_drop_stopwords)

def normalize(df):
    """Run the full text-preprocessing pipeline on the ``content`` column.

    The steps run in this order: ``data_cleaning`` -> ``remove_short_words``
    -> ``lemmatization`` -> ``remove_stopwords``.

    Args:
        df: DataFrame with a ``content`` column of text.

    Returns:
        A new DataFrame with ``content`` replaced by the processed text. The
        input frame is left unmodified, so the raw data stays available to the
        caller and the cell can be re-run safely.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    normalized = df.copy()

    text = normalized["content"]
    for step in (data_cleaning, remove_short_words, lemmatization, remove_stopwords):
        text = step(text)

    normalized["content"] = text
    return normalized

def main():
    """Preprocess the raw train/test CSVs and store them under ``interim``.

    Reads ``raw/train/train.csv`` and ``raw/test/test.csv`` below
    ``BASE_FOLDER`` (the locations the ingestion stage writes to), applies
    ``normalize`` to each, and writes ``train_processed.csv`` and
    ``test_processed.csv`` to ``BASE_FOLDER/interim``.
    """
    # Paths are built relative to BASE_FOLDER instead of a user-specific
    # absolute directory, so input and output share the same data root.
    raw_dir = os.path.join(BASE_FOLDER, "raw")
    train_data = pd.read_csv(os.path.join(raw_dir, "train", "train.csv"))
    test_data = pd.read_csv(os.path.join(raw_dir, "test", "test.csv"))

    # Transform the data
    train_processed_data = normalize(train_data)
    test_processed_data = normalize(test_data)

    # Store the data inside <BASE_FOLDER>/interim
    data_path = os.path.join(BASE_FOLDER, "interim")
    os.makedirs(data_path, exist_ok=True)

    train_processed_data.to_csv(os.path.join(data_path, "train_processed.csv"), index=False)
    test_processed_data.to_csv(os.path.join(data_path, "test_processed.csv"), index=False)


# Run the preprocessing stage only when this code executes as the main module.
if __name__ == "__main__":
    main()


In [5]:
# Load the preprocessed splits from <BASE_FOLDER>/interim, the folder the
# preprocessing stage writes to, instead of a user-specific absolute path.
# Rows with missing values are dropped once, on the whole frame, so features
# and labels stay aligned. (Texts that end up empty after cleaning are
# presumably read back from CSV as NaN -- verify against the interim files.)
x = pd.read_csv(os.path.join(BASE_FOLDER, "interim", "train_processed.csv")).dropna()
y = pd.read_csv(os.path.join(BASE_FOLDER, "interim", "test_processed.csv")).dropna()

# Note: `x` is the train frame and `y` the test frame (not features/labels).
x_train = x["content"]
y_train = x["sentiment"]

x_test = y["content"]
y_test = y["sentiment"]

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure the TF-IDF vectorizer: unigrams and bigrams, English stop words
# removed, vocabulary capped at 1000 features, with document-frequency bounds.
vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words="english",
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.8,
)

# Learn the vocabulary/IDF weights from the training text only ...
x_train_tfidf = vectorizer.fit_transform(x_train)

# ... and reuse them unchanged for the test text.
x_test_tfidf = vectorizer.transform(x_test)


In [7]:
# Dense view of the training TF-IDF matrix, one column per vocabulary term.
tfidf_df = pd.DataFrame(
    data=x_train_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
tfidf_df

Unnamed: 0,able,absolutely,accept,access,account,act,actually,add,adorable,afternoon,...,year,year old,years,yep,yes,yesterday,youtube,yum,yummy,yup
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8293,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
8294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
8295,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
8296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.403969,0.380663,0.0


In [73]:
# Dense view of the test TF-IDF matrix, using the vocabulary fitted on train.
tfidf_test_df = pd.DataFrame(
    data=x_test_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
tfidf_test_df

Unnamed: 0,able,absolutely,accept,access,account,act,actually,add,adorable,afternoon,...,year,year old,years,yep,yes,yesterday,youtube,yum,yummy,yup
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2069,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2070,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2071,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2072,0.0,0.0,0.0,0.0,0.0,0.0,0.728505,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Import Libraries
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, TensorDataset

In [9]:
# Features: densify the sparse TF-IDF matrices, then copy them into float32 tensors.
X_train_tensor = torch.tensor(
    x_train_tfidf.toarray(),
    dtype=torch.float32,
)
X_test_tensor = torch.tensor(
    x_test_tfidf.toarray(),
    dtype=torch.float32,
)

# Labels: float32 as well, which is the target dtype the BCE losses used later expect.
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32)

In [None]:
# Pair each feature row with its label.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Mini-batches of 32: training batches are reshuffled every epoch, while the
# test order is kept fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)


In [12]:
# Sanity check: dimensions of the dense training feature tensor (rows, TF-IDF features).
X_train_tensor.shape

torch.Size([8298, 1000])

In [80]:
import torch
import torch.nn as nn
from torchinfo import summary

class LogisticRegression(nn.Module):
    """Single linear layer that emits raw logits (no sigmoid is applied).

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of output units of the linear layer.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return the logits for ``x`` with dimension 1 squeezed out.

        For ``output_dim == 1`` an input of shape ``(batch, input_dim)``
        yields a tensor of shape ``(batch,)``.
        """
        logits = self.linear(x)
        return logits.squeeze(1)

# Seed PyTorch's global RNG so the weight initialisation below (and the
# DataLoader shuffling that follows it) is reproducible on Restart & Run All.
torch.manual_seed(42)

# Model Parameters
input_dim = X_train_tensor.shape[1]  # one input per TF-IDF feature
output_dim = 1  # single logit for the binary label

# Instantiate Model
model = LogisticRegression(input_dim, output_dim)

# Print Model Summary (use batch_size=32 for better display)
summary(model, input_size=(32, input_dim))


Layer (type:depth-idx)                   Output Shape              Param #
LogisticRegression                       [32]                      --
├─Linear: 1-1                            [32, 1]                   1,001
Total params: 1,001
Trainable params: 1,001
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.03
Input size (MB): 0.13
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.13

In [None]:
# Define Loss Function: binary cross-entropy on raw logits
# (BCEWithLogitsLoss applies the sigmoid internally, so the model must not).
criterion = nn.BCEWithLogitsLoss()  # More stable than BCELoss

# Define Optimizer (SGD with weight decay)
learning_rate = 0.005
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=1e-4)


In [85]:
num_epochs = 500  # Number of epochs
log_every = 100   # Print the loss once per this many epochs

for epoch in range(num_epochs):
    # Make sure the model is in training mode: evaluate_model() switches it to
    # eval mode, which would otherwise persist if this cell is re-run.
    model.train()
    total_loss = 0.0  # sum of the per-batch losses over this epoch

    for X_batch, y_batch in train_loader:
        # Forward pass
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Print progress every `log_every` epochs
    if (epoch + 1) % log_every == 0:
        print(f'Epoch: {epoch+1}  Loss: {total_loss:.4f}')


Epoch: 100  Loss: 213.0949
Epoch: 200  Loss: 212.6650
Epoch: 300  Loss: 212.2767
Epoch: 400  Loss: 212.0544
Epoch: 500  Loss: 211.9871


In [86]:
# Evaluation Function
def evaluate_model(model, data_loader):
    """Evaluate a logit-producing binary classifier and print key metrics.

    The model output is treated as raw logits: a sigmoid turns it into
    probabilities, which are thresholded at 0.5 to obtain class labels.
    Accuracy, precision, recall, F1 and ROC-AUC are printed.

    Args:
        model: ``torch.nn.Module`` returning one logit per sample.
        data_loader: Iterable yielding ``(X_batch, y_batch)`` tensor pairs.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having already imported these names.
    from sklearn.metrics import (
        accuracy_score,
        f1_score,
        precision_score,
        recall_score,
        roc_auc_score,
    )

    model.eval()
    all_y_true, all_y_pred, all_y_scores = [], [], []

    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            # Flatten labels and outputs to 1-D so truth, predictions and
            # scores all share the same shape regardless of batch size.
            y_true = y_batch.float().reshape(-1)
            y_probs = torch.sigmoid(model(X_batch)).reshape(-1)
            y_pred_labels = (y_probs >= 0.5).float()

            all_y_true.extend(y_true.cpu().tolist())
            all_y_pred.extend(y_pred_labels.cpu().tolist())
            all_y_scores.extend(y_probs.cpu().tolist())

    # Compute Metrics
    accuracy = accuracy_score(all_y_true, all_y_pred)
    precision = precision_score(all_y_true, all_y_pred, zero_division=0)
    recall = recall_score(all_y_true, all_y_pred, zero_division=0)
    f1 = f1_score(all_y_true, all_y_pred, zero_division=0)
    roc_auc = roc_auc_score(all_y_true, all_y_scores)

    print(f"  - Accuracy: {accuracy:.4f}")
    print(f"  - Precision: {precision:.4f}")
    print(f"  - Recall: {recall:.4f}")
    print(f"  - F1 Score: {f1:.4f}")
    print(f"  - ROC-AUC Score: {roc_auc:.4f}\n")

# Run the final evaluation of the trained model on the held-out test loader
print("\nFinal Model Evaluation:")
evaluate_model(model, test_loader)


Final Model Evaluation:
  - Accuracy: 0.7739
  - Precision: 0.7863
  - Recall: 0.7616
  - F1 Score: 0.7738
  - ROC-AUC Score: 0.8607



In [91]:
# Import under an alias: a plain `LogisticRegression` import would shadow the
# PyTorch `LogisticRegression` class defined earlier in this notebook and
# break any re-run of the cell that instantiates it.
from sklearn.linear_model import LogisticRegression as SklearnLogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Train the scikit-learn baseline on the same sparse TF-IDF features
sklearn_model = SklearnLogisticRegression(max_iter=1000)
sklearn_model.fit(x_train_tfidf, y_train)  # Train on TF-IDF features

# Predict on Test Data
y_pred = sklearn_model.predict(x_test_tfidf)
y_probs = sklearn_model.predict_proba(x_test_tfidf)[:, 1]  # Probabilities for ROC-AUC

# Calculate Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_probs)

# Print Metrics
print("🔹 Sklearn Logistic Regression Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")


🔹 Sklearn Logistic Regression Results:
Accuracy: 0.7739
Precision: 0.7835
Recall: 0.7664
F1 Score: 0.7748
ROC-AUC Score: 0.8647


In [168]:
import torch
import torch.nn as nn
import torch.optim as optim

class ANN(nn.Module):
    """One-hidden-layer network that outputs a probability per sample.

    Architecture: Linear(input_dim, 32) -> ReLU -> Dropout(0.2) ->
    Linear(32, 1) -> Sigmoid.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 32)
        self.fc2 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(.2)

    def forward(self, x):
        """Map ``(batch, input_dim)`` inputs to ``(batch, 1)`` probabilities."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        # Sigmoid is applied here, so pair this model with BCELoss (not
        # BCEWithLogitsLoss).
        return torch.sigmoid(self.fc2(hidden))

# Seed PyTorch's global RNG so weight initialisation, dropout masks and
# DataLoader shuffling are reproducible on Restart & Run All.
torch.manual_seed(42)

# Model Parameters
input_dim = X_train_tensor.shape[1]  # one input per TF-IDF feature
model = ANN(input_dim)

# Loss Function and Optimizer
# ANN.forward already applies the sigmoid, hence plain BCELoss on probabilities.
criterion = nn.BCELoss()  # Use BCEWithLogitsLoss() if no sigmoid in forward()
optimizer = optim.SGD(model.parameters(), lr=0.005, weight_decay=1e-4)

# Print Model Summary
from torchinfo import summary
summary(model, input_size=(32, input_dim))  # batch_size = 32


Layer (type:depth-idx)                   Output Shape              Param #
ANN                                      [32, 1]                   --
├─Linear: 1-1                            [32, 32]                  32,032
├─ReLU: 1-2                              [32, 32]                  --
├─Dropout: 1-3                           [32, 32]                  --
├─Linear: 1-4                            [32, 1]                   33
Total params: 32,065
Trainable params: 32,065
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 1.03
Input size (MB): 0.13
Forward/backward pass size (MB): 0.01
Params size (MB): 0.13
Estimated Total Size (MB): 0.26

In [169]:
# Training Parameters
num_epochs = 200  # You can increase this
verbose_step = 10  # Print loss every 10 epochs

# `device` was previously used without ever being defined. Derive it from the
# model itself so batches always end up wherever the parameters live.
device = next(model.parameters()).device

# Training Loop
for epoch in range(num_epochs):
    total_loss = 0.0  # sum of the per-batch losses over this epoch
    model.train()  # Set model to training mode (enables dropout)

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        # Forward pass
        # squeeze(1) drops only the feature dimension: (batch, 1) -> (batch,).
        # A bare squeeze() would also collapse a size-1 batch to a 0-d tensor,
        # which no longer matches the shape of y_batch.
        y_pred = model(X_batch).squeeze(1)
        loss = criterion(y_pred, y_batch)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Print progress every `verbose_step` epochs
    if (epoch + 1) % verbose_step == 0:
        print(f"Epoch {epoch+1}/{num_epochs} | Loss: {total_loss:.4f}")


Epoch 10/200 | Loss: 358.2644
Epoch 20/200 | Loss: 353.7908
Epoch 30/200 | Loss: 341.8072
Epoch 40/200 | Loss: 317.9062
Epoch 50/200 | Loss: 290.0093
Epoch 60/200 | Loss: 266.1908
Epoch 70/200 | Loss: 250.8556
Epoch 80/200 | Loss: 239.6591
Epoch 90/200 | Loss: 229.5809
Epoch 100/200 | Loss: 224.1549
Epoch 110/200 | Loss: 219.4491
Epoch 120/200 | Loss: 215.3842
Epoch 130/200 | Loss: 211.7769
Epoch 140/200 | Loss: 210.2859
Epoch 150/200 | Loss: 206.1332
Epoch 160/200 | Loss: 205.0118
Epoch 170/200 | Loss: 201.7789
Epoch 180/200 | Loss: 199.9354
Epoch 190/200 | Loss: 199.0067
Epoch 200/200 | Loss: 196.9233


In [170]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model(model, data_loader):
    """Evaluate a probability-producing binary classifier and print metrics.

    The model output is used directly as the positive-class probability (no
    sigmoid is applied here) and thresholded at 0.5 to obtain class labels.
    Accuracy, precision, recall, F1 and ROC-AUC are printed.

    Args:
        model: ``torch.nn.Module`` returning one probability per sample.
        data_loader: Iterable yielding ``(X_batch, y_batch)`` tensor pairs.
    """
    model.eval()  # Set to evaluation mode (disables dropout)
    all_y_true, all_y_pred, all_y_scores = [], [], []

    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            # reshape(-1) always yields a 1-D result. A bare squeeze() turns a
            # size-1 batch into a 0-d array, which list.extend cannot iterate.
            y_probs = model(X_batch).reshape(-1).cpu().numpy()
            y_pred_labels = (y_probs >= 0.5).astype(int)  # Apply threshold

            all_y_true.extend(y_batch.reshape(-1).cpu().numpy())
            all_y_pred.extend(y_pred_labels)
            all_y_scores.extend(y_probs)

    # Compute Metrics
    accuracy = accuracy_score(all_y_true, all_y_pred)
    precision = precision_score(all_y_true, all_y_pred, zero_division=0)
    recall = recall_score(all_y_true, all_y_pred, zero_division=0)
    f1 = f1_score(all_y_true, all_y_pred, zero_division=0)
    roc_auc = roc_auc_score(all_y_true, all_y_scores)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC-AUC Score: {roc_auc:.4f}")

# Evaluate the trained ANN on the held-out test loader
print("Final Evaluation:")
evaluate_model(model, test_loader)


Final Evaluation:
Accuracy: 0.7748
Precision: 0.7867
Recall: 0.7635
F1 Score: 0.7749
ROC-AUC Score: 0.8628
