# Task 2: Natural Language Processing (NLP) - Text Classification

### Description: Classify text data into categories (e.g., spam vs. non-spam, sentiment analysis).

- Preprocess text data (tokenization, removing stopwords, stemming/lemmatization).
- Convert text into a numerical representation using TF-IDF or Word2Vec.
- Train a classification model (e.g., Naive Bayes, Logistic Regression) on the processed text.
- Evaluate the model using precision, recall, and F1-score.

In [1]:
from functools import lru_cache

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Fetch the NLTK data packages this notebook relies on: the stopword list,
# WordNet (for lemmatization) and the punkt tokenizer models.
download_status = [
    nltk.download(resource)
    for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab')
]
# Show the result of the final download, as the cell's output.
download_status[-1]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Location of the raw CSV, relative to the notebook's working directory.
DATA_PATH = 'data/Sentiment dataset.csv'

# The first CSV column is used as the DataFrame index.
df = pd.read_csv(DATA_PATH, index_col=[0])

# Preview the first two rows.
df.head(2)

Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8


# Text Preprocessing


In [3]:
@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stopword list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _shared_lemmatizer():
    """Return a single WordNetLemmatizer instance, created only once."""
    return WordNetLemmatizer()


def preprocess_text(text):
    """Clean one piece of text for vectorization.

    Steps: strip surrounding whitespace, lowercase, tokenize with
    ``nltk.word_tokenize``, drop tokens that are not purely alphabetic or
    that are English stopwords, lemmatize the remaining tokens and join
    them back together with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or a ``NaN`` coming
        from an empty CSV cell) is treated as missing.

    Returns
    -------
    str
        The space-joined lemmatized tokens. An empty string is returned for
        missing or blank input, or when no token survives filtering.
    """
    # Missing values arrive as None / float NaN when applied over a pandas
    # column; calling .strip() on them would raise AttributeError.
    if not isinstance(text, str):
        return ""

    text = text.strip()
    if not text:
        # Nothing to tokenize; the result would be empty anyway.
        return ""

    # The stopword set and lemmatizer do not depend on the input, so they are
    # fetched from cached helpers instead of being rebuilt on every call.
    stop_words = _english_stop_words()
    lemmatize = _shared_lemmatizer().lemmatize

    # Tokenization & lowercasing, then stopword / non-alphabetic filtering,
    # then lemmatization (reducing words to their dictionary root).
    tokens = [
        lemmatize(word)
        for word in nltk.word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words
    ]
    return " ".join(tokens)

In [4]:
# Run every post through the cleaning routine and keep the result in a new column.
cleaned_posts = df["Text"].map(preprocess_text)
df["clean_text"] = cleaned_posts

In [5]:
# Normalise the label column with the same cleaning routine used for the posts;
# the original "Sentiment" values are overwritten by the cleaned ones.
normalized_labels = df["Sentiment"].map(preprocess_text)
df["Sentiment"] = normalized_labels

# Feature Engineering (TF-IDF)

In [6]:
# Inputs (cleaned documents) and targets (sentiment labels)
clean_corpus = df['clean_text']
y = df['Sentiment']

# TF-IDF vectorizer with the vocabulary capped at 5000 terms
tfidf = TfidfVectorizer(max_features=5000)

# Learn vocabulary and IDF weights from the cleaned documents and encode them.
# NOTE(review): the vectorizer is fit on every document, before the train/test
# split performed later in the notebook, so held-out rows contribute to the
# IDF weights — confirm this is intended.
X = tfidf.fit_transform(clean_corpus)

# Model Training & Evaluation


In [7]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a logistic regression with default settings on the training portion
model = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Per-class precision / recall / F1; zero_division=0 reports 0 (without a
# warning) for classes that were never predicted
report = classification_report(y_test, y_pred, zero_division=0)
print(report)


                     precision    recall  f1-score   support

         acceptance       1.00      0.33      0.50         3
         admiration       0.00      0.00      0.00         3
          affection       0.00      0.00      0.00         1
        ambivalence       0.00      0.00      0.00         2
          amusement       0.00      0.00      0.00         1
              anger       0.00      0.00      0.00         1
       anticipation       0.00      0.00      0.00         1
       apprehensive       0.00      0.00      0.00         1
            arousal       0.00      0.00      0.00         4
                awe       0.00      0.00      0.00         3
                bad       0.00      0.00      0.00         2
           betrayal       0.00      0.00      0.00         3
             bitter       0.00      0.00      0.00         1
         bitterness       0.00      0.00      0.00         3
        bittersweet       0.00      0.00      0.00         1
            boredom    