NLP

Question: 1

1. Data Set Link: (Take Any Dataset but avoid using benchmark data)
Perform Below Listed Preprocessing Task in proper order as per your dataset using torch or tensorflow.

Emoji Removal 

Lemmatization 

Stemming 

Word Tokenization 

Grammar Correction

Http Links Removal 

Stop Words Removal

Sentence Tokenization 

Lower casing

 Remove white spaces

Text Normalization

Part of speech tagging

In [None]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import spacy

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load English language model for spaCy
nlp = spacy.load('en_core_web_sm')

# Load your CSV dataset
csv_path = 'path/to/your/dataset.csv'
df = pd.read_csv(csv_path)

def preprocess_text(text):
    # Emoji removal
    text = re.sub(r'\W+', ' ', text)  # Removing non-alphanumeric characters
    text = re.sub('[\U00010000-\U0010ffff]', '', text)  # Removing emojis

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(text)])

    # Stemming
    stemmer = PorterStemmer()
    text = ' '.join([stemmer.stem(word) for word in word_tokenize(text)])

    # Word Tokenization
    words = word_tokenize(text)

    # Grammar Correction (not easy to implement programmatically, may require external tools)

    # HTTP Links Removal
    text = re.sub(r'http\S+', '', text)

    # Stop Words Removal
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words]

    # Sentence Tokenization
    sentences = sent_tokenize(text)

    # Lowercasing
    words = [word.lower() for word in words]

    # Remove white spaces
    text = ' '.join(words)

    # Text Normalization (consider additional normalization steps based on your requirements)

    # Part of speech tagging
    pos_tags = [(token.text, token.pos_) for token in nlp(text)]

    return {
        'original_text': text,
        'words': words,
        'sentences': sentences,
        'pos_tags': pos_tags
    }

# Apply the preprocessing function to the entire dataset
df['preprocessed_data'] = df['text_column'].apply(preprocess_text)

# Save the preprocessed dataset
df.to_csv('path/to/your/preprocessed_dataset.csv', index=False)


In [3]:
pip install spacy


Collecting spacy
  Downloading spacy-3.7.2-cp311-cp311-win_amd64.whl.metadata (26 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Using cached murmurhash-1.0.10-cp311-cp311-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Using cached cymem-2.0.8-cp311-cp311-win_amd64.whl.metadata (8.6 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp311-cp311-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,>=8.1.8 (from spacy)
  Downloading thinc-8.2.2-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.2-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.4.8-cp311-cp311-win_amd64.whl.meta

In [6]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 445.2 kB/s eta 0:00:29
     --------------------------------------- 0.1/12.8 MB 751.6 kB/s eta 0:00:17
     --------------------------------------- 0.1/12.8 MB 853.3 kB/s eta 0:00:15
      -------------------------------------- 0.2/12.8 MB 985.7 kB/s eta 0:00:13
      --------------------------------------- 0.3/12.8 MB 1.2 MB/s eta 0:00:11
     - -------------------------------------- 0.3/12.8 MB 1.2 MB/s eta 0:00:11
     - -------------------------------------- 0.4/12.8 MB 1.3 MB/s eta 0:00:10
     - -------------------------------------- 0.5/12.8 MB 1.4 MB/s eta 0:00:09
     - -------------------------------------

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer

# Set random seed for reproducibility
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Load your CSV dataset (replace 'your_dataset.csv' with the actual path)
csv_path = r'C:\Users\geeth\Downloads\instagram_reach (1).csv'
df = pd.read_csv(csv_path)

# Split the dataset into training, validation, and test sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=SEED)
train_data, valid_data = train_test_split(train_data, test_size=0.1, random_state=SEED)

# Define a simple dataset class
class CustomDataset(Dataset):
    def __init__(self, dataframe, text_field, label_field):
        self.data = dataframe
        self.text_field = text_field
        self.label_field = label_field

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['text']
        label = float(self.data.iloc[idx]['label'])
        return {'text': text, 'label': label}

# Tokenizer function
tokenizer = get_tokenizer('spacy')

# Apply tokenization to the text column
train_data['text'] = train_data['text'].apply(lambda x: tokenizer(x))
valid_data['text'] = valid_data['text'].apply(lambda x: tokenizer(x))
test_data['text'] = test_data['text'].apply(lambda x: tokenizer(x))


# Create custom datasets
train_dataset = CustomDataset(train_data, 'text', 'label')
valid_dataset = CustomDataset(valid_data, 'text', 'label')
test_dataset = CustomDataset(test_data, 'text', 'label')

# Example of how to use DataLoader
BATCH_SIZE = 64
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Continue with the rest of your code...




KeyError: 'text'