<h3 style="text-align:center">Image and text classification using CNN</h3>

Image classification

In [None]:
import ast

import matplotlib.pyplot as plt
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.models import resnet50, ResNet50_Weights

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

In [None]:
# Build ResNet-50 with the IMAGENET1K_V2 weights and place it on `device`.
model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2).to(device)
# Switch to evaluation mode; kept as the last expression so the cell still
# displays the module summary.
model.eval()

Inference

In [None]:
# The label file is read as a Python literal (class index -> label name).
# ast.literal_eval only accepts literals, so unlike eval() it cannot execute
# code that might be embedded in the file.
with open("imagenet1000_clsidx_to_labels.txt", encoding="utf-8") as f:
    labels = ast.literal_eval(f.read())

In [None]:
# Use the preprocessing pipeline that ships with the IMAGENET1K_V2 weights
# instead of a hand-rolled one, so inputs match what the checkpoint expects
# (see the torchvision weights documentation for the exact steps).
preprocessor = ResNet50_Weights.IMAGENET1K_V2.transforms()
# Backward-compatible alias: earlier versions of this cell exposed the
# preprocessing pipeline under the name `transforms`.
transforms = preprocessor
test_images = ["cat.png", "plane.png", "tractor.png"]

for img_path in test_images:
    # Open inside a context manager so the file handle is released promptly;
    # convert() returns an independent, fully loaded RGB copy.
    with Image.open(img_path) as raw_image:
        test_image = raw_image.convert("RGB")

    # Add the batch dimension and move the tensor to the model's device.
    input_image = preprocessor(test_image).unsqueeze(0).to(device)

    with torch.no_grad():
        output = model(input_image)
        # Turn the logits of the single image in the batch into probabilities.
        predicted_probabilities = torch.nn.functional.softmax(output[0], dim=0)

    # Keep the five most likely classes, best first.
    predicted_probabilities, predicted_classes = torch.topk(predicted_probabilities, 5)

    label = labels[predicted_classes[0].item()]
    probability = predicted_probabilities[0].item()

    plt.imshow(test_image)
    plt.axis('off')
    plt.title(f"Predicted class: {label}, Probability: {probability:.4f}")
    plt.show()

Text classification

In [8]:
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from collections import Counter, OrderedDict

In [3]:
# Download the NLTK resources this notebook relies on: the stop-word corpus
# and the punkt_tab tokenizer data.
for resource in ('stopwords', 'punkt_tab'):
    nltk.download(resource)

stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hrishikesh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/hrishikesh/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
# Load the IMDB reviews and preview the first rows.
df = pd.read_csv(filepath_or_buffer="IMDB Dataset.csv")
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Creating a CNN for text classification

In [None]:
class TextClassifier(nn.Module):
    """Stacked 1-D CNN over token embeddings with sigmoid outputs.

    Three Conv1d(kernel=3) + ReLU + MaxPool1d(2) stages are followed by a
    global adaptive max pool, which reduces the time axis to length 1, and a
    two-layer classifier head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        num_classes: Number of output units.

    Input:
        Integer tensor of token ids with shape (batch_size, seq_len). With
        three unpadded kernel-3 convolutions, each followed by a pool of 2,
        seq_len must be at least 22.

    Returns:
        Tensor of shape (batch_size, num_classes) with values in [0, 1]
        (sigmoid is applied in forward).
    """

    def __init__(self, vocab_size, embed_dim, num_classes):
        super(TextClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)

        self.conv1 = nn.Conv1d(embed_dim, 128, 3)
        self.conv2 = nn.Conv1d(128, 64, 3)
        self.conv3 = nn.Conv1d(64, 32, 3)

        self.pool = nn.MaxPool1d(2)
        self.adaptive_pool = nn.AdaptiveMaxPool1d(1)
        # adaptive_pool leaves one value per conv3 channel, so the flattened
        # feature vector has 32 entries. The previous in_features of 32 * 24
        # made every forward pass fail with a shape mismatch.
        self.fc1 = nn.Linear(32, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(0.5)
        self.dropout2 = nn.Dropout(0.3)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of token ids to per-class sigmoid scores."""
        x = self.embedding(x)
        # Conv1d expects (batch, channels, length): move embed_dim to axis 1.
        x = x.permute(0, 2, 1)

        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x)

        x = self.conv2(x)
        x = self.relu(x)
        x = self.pool(x)

        x = self.conv3(x)
        x = self.relu(x)
        x = self.pool(x)

        # Global max over the remaining time steps -> (batch, 32, 1).
        x = self.adaptive_pool(x)

        # Flatten to (batch, 32) for the linear head.
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout2(x)

        x = self.fc2(x)
        x = self.sigmoid(x)

        return x


In [None]:
class TextCNN(nn.Module):
    """Text CNN with parallel convolution branches and max-over-time pooling.

    Each branch applies a Conv1d with a different kernel size (3, 4 and 5
    tokens wide) to the embedded sequence, followed by ReLU and a global max
    over the time axis. The branch outputs are concatenated and passed through
    a two-layer classifier head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        num_classes: Number of output logits.
        padding_idx: Optional index of the padding token, forwarded to
            nn.Embedding.

    Input:
        Integer tensor of token ids with shape (batch_size, seq_len). The
        convolutions are unpadded, so seq_len must be at least 5 (the largest
        kernel size).

    Returns:
        Raw logits of shape (batch_size, num_classes); no sigmoid or softmax is
        applied, so pair the model with a loss that accepts logits such as
        nn.BCEWithLogitsLoss or nn.CrossEntropyLoss.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, padding_idx=None):
        super(TextCNN, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)

        # One convolution branch per n-gram width, each with its own number of
        # output channels.
        filter_sizes = [3, 4, 5]
        out_channels_list = [128, 64, 32]

        # ModuleList registers every branch as a submodule so its parameters
        # are tracked by the optimizer and by .to(device).
        self.convs = nn.ModuleList([
            nn.Conv1d(
                in_channels=embed_dim,
                out_channels=out,
                kernel_size=ks
            )
            for out, ks in zip(out_channels_list, filter_sizes)
        ])

        # The classifier consumes the concatenation of all branch outputs:
        # 128 + 64 + 32 = 224 features.
        total_out_channels = sum(out_channels_list)

        self.fc1 = nn.Linear(total_out_channels, 128)
        self.fc2 = nn.Linear(128, num_classes)

        self.dropout = nn.Dropout(0.5)
        self.dropout2 = nn.Dropout(0.3)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a batch of token ids (batch_size, seq_len) to class logits."""
        # (batch_size, seq_len) -> (batch_size, seq_len, embed_dim)
        x = self.embedding(x)

        # Conv1d expects (batch_size, channels, length).
        x = x.permute(0, 2, 1)

        pooled_outputs = []
        for conv in self.convs:
            # (batch_size, out_channels, seq_len - kernel_size + 1)
            conved = self.relu(conv(x))

            # Max-over-time pooling: a kernel spanning the whole remaining
            # sequence gives a fixed-size output regardless of seq_len.
            # Referenced through `nn.functional` because this notebook never
            # imports the `F` alias the previous version relied on.
            pooled = nn.functional.max_pool1d(conved, kernel_size=conved.shape[2])

            # (batch_size, out_channels, 1) -> (batch_size, out_channels)
            pooled_outputs.append(pooled.squeeze(2))

        # (batch_size, 128 + 64 + 32)
        x = torch.cat(pooled_outputs, dim=1)

        x = self.dropout(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout2(x)

        # Raw logits; the loss function is expected to apply sigmoid/softmax.
        return self.fc2(x)

In [5]:
# Built once when this cell runs. Rebuilding the stop-word set and recompiling
# the pattern inside the function repeats identical work for every review.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')


def preprocess_text(text):
    """Normalize a raw review into a space-separated string of content words.

    Steps: lowercase the text, delete every character that is not an ASCII
    letter or whitespace, tokenize with NLTK, then drop tokens of one or two
    characters and English stop words.

    Args:
        text: Raw review text.

    Returns:
        The remaining tokens joined by single spaces (may be an empty string).
    """
    cleaned = _NON_LETTER_PATTERN.sub('', text.lower())
    tokens = nltk.word_tokenize(cleaned)

    kept = [
        word for word in tokens
        if len(word) > 2 and word not in _ENGLISH_STOP_WORDS
    ]
    return ' '.join(kept)

In [6]:
# Clean the review text and encode the sentiment labels as integers.
# NOTE: this overwrites both columns in place; running the cell again on the
# already-encoded sentiment column would map every label to NaN, so reload
# `df` before re-running.
sentiment_to_label = {"positive": 1, "negative": 0}
df["review"] = df["review"].map(preprocess_text)
df["sentiment"] = df["sentiment"].map(sentiment_to_label)
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching episode youll...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically theres family little boy jake thinks...,0
4,petter matteis love time money visually stunni...,1


In [7]:
# Split every cleaned review into a list of word tokens.
df["tokens"] = df["review"].map(word_tokenize)

In [9]:
# Reserved vocabulary entries: unknown word and padding.
special_tokens = ['<unk>', '<pad>']

# Corpus-wide frequency of every token across all reviews.
token_counts = Counter(token for tokens in df["tokens"] for token in tokens)

In [10]:
# Word -> index mapping: special tokens take the first indices, then corpus
# words follow in order of decreasing frequency.
vocab_wtoi = OrderedDict()
for position, special in enumerate(special_tokens):
    vocab_wtoi[special] = position

current_index = len(special_tokens)
for word, _count in token_counts.most_common():
    if word in vocab_wtoi:
        # Already registered (e.g. collides with a special token).
        continue
    vocab_wtoi[word] = current_index
    current_index += 1

# Inverse mapping: index -> word.
vocab_itow = dict(zip(vocab_wtoi.values(), vocab_wtoi.keys()))

In [11]:
class SimpleVocab:
    """Bidirectional token/index lookup with an unknown-token fallback.

    Args:
        stoi: Mapping from token string to integer index.
        itos: Mapping from integer index to token string.
    """

    def __init__(self, stoi, itos):
        self._stoi, self._itos = stoi, itos
        # Index used for out-of-vocabulary tokens; 0 when "<unk>" is absent.
        self.unk_index = stoi.get("<unk>", 0)

    def __len__(self):
        """Number of tokens in the token -> index mapping."""
        return len(self._stoi)

    def stoi(self, token):
        """Return the index of `token`, or the unknown index if it is missing."""
        fallback = self.unk_index
        return self._stoi.get(token, fallback)

    def itos(self, index):
        """Return the token stored at `index`, or "<unk>" if it is missing."""
        return self._itos.get(index, "<unk>")

    def __call__(self, tokens):
        """Convert an iterable of tokens into a list of indices."""
        return list(map(self.stoi, tokens))

In [12]:
# Wrap the two lookup tables in a single vocabulary object.
vocab = SimpleVocab(stoi=vocab_wtoi, itos=vocab_itow)

In [15]:
import gensim.downloader as api
# Pretrained GloVe word vectors, fetched through gensim's downloader.
glove_vectors = api.load(name='glove-wiki-gigaword-100')



In [16]:
# Embedding table dimensions: one row per vocabulary entry, one column per
# component of the pretrained vectors.
vocab_size, embed_dim = len(vocab), glove_vectors.vector_size

In [19]:
# Build the initial embedding table, one row per vocabulary index:
#   - words present in GloVe get their pretrained vector,
#   - other words get small Gaussian noise (std 0.1),
#   - the '<pad>' row stays all-zero so padding contributes no signal.
# A dedicated, seeded generator makes the random rows reproducible without
# touching the global RNG state.
EMBEDDING_INIT_SEED = 42
init_generator = torch.Generator().manual_seed(EMBEDDING_INIT_SEED)
pad_index = vocab_wtoi.get('<pad>')

embedding_matrix = torch.zeros(vocab_size, embed_dim)

for word, idx in vocab_wtoi.items():
    if idx == pad_index:
        # Keep the zero row created by torch.zeros above.
        continue
    if word in glove_vectors:
        embedding_matrix[idx] = torch.tensor(glove_vectors[word])
    else:
        embedding_matrix[idx] = torch.randn(embed_dim, generator=init_generator) * 0.1


In [20]:
# Sanity check: expect (vocab_size, embed_dim).
embedding_matrix.size()

torch.Size([175220, 100])