# Natural Language Processing
## Text Classification
### Task 1: Data Exploration
#### Importing Libraries


In [18]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report

#### Loading and displaying the first 5 rows of the dataset

In [4]:

# Location of the labelled-text CSV. NOTE: this is an absolute path on the
# original author's machine; point it at your own copy before running.
file_path = r'E:\Online_Course\NLP\Projects\text_class.csv'
# Read the file into a DataFrame and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer=file_path)
print(dataset.head(n=5))

                                                text     label
0                 I loved the product, it's amazing!  positive
1    Terrible service, I will never shop here again.  negative
2    The quality is good, but the delivery was late.   neutral
3  Absolutely wonderful experience, highly recomm...  positive
4  Product was damaged when it arrived, very disa...  negative


#### Printing the total number of rows and the number of unique labels in the dataset

In [5]:
# Row count is taken from the frame's shape; label cardinality from the
# 'label' column (nunique ignores missing values by default).
total_rows = dataset.shape[0]
unique_labels = dataset['label'].nunique()
print(
    f"Total number of rows: {total_rows}",
    f"Number of unique labels: {unique_labels}",
    sep="\n",
)

Total number of rows: 8
Number of unique labels: 3


#### Checking for missing values and dropping any rows that contain them

In [6]:
# Report the per-column count of missing entries before any cleanup.
print("Checking for missing values...")
missing_before = dataset.isna().sum()
print(missing_before)

# Discard every row with at least one missing field, then re-check the counts.
dataset = dataset.dropna(axis=0, how='any')
print("After handling missing values:")
missing_after = dataset.isna().sum()
print(missing_after)

Checking for missing values...
text     0
label    0
dtype: int64
After handling missing values:
text     0
label    0
dtype: int64


### Task 2: Preprocessing Text Data
#### Converting all text to lower case

In [7]:
# Normalise case so that differently-capitalised spellings of a word are
# treated as the same token later on.
dataset['text'] = list(map(str.lower, dataset['text']))
print(dataset)

                                                text     label
0                 i loved the product, it's amazing!  positive
1    terrible service, i will never shop here again.  negative
2    the quality is good, but the delivery was late.   neutral
3  absolutely wonderful experience, highly recomm...  positive
4  product was damaged when it arrived, very disa...  negative
5  the customer support was very helpful and polite.  positive
6                     worst purchase i've ever made.  negative
7  satisfied with the product but the price was t...   neutral


#### Removing all punctuation and special characters

In [8]:
# Any character that is neither a word character nor whitespace is removed.
non_word_chars = re.compile(r'[^\w\s]')
cleaned_text = [non_word_chars.sub('', text) for text in dataset['text']]
dataset['text'] = cleaned_text
print(dataset)

                                                text     label
0                    i loved the product its amazing  positive
1      terrible service i will never shop here again  negative
2      the quality is good but the delivery was late   neutral
3   absolutely wonderful experience highly recommend  positive
4  product was damaged when it arrived very disap...  negative
5   the customer support was very helpful and polite  positive
6                       worst purchase ive ever made  negative
7  satisfied with the product but the price was t...   neutral


#### Tokenizing the text, removing stop words and displaying the first 5 rows


In [None]:
# Split each row of dataset['text'] on whitespace to obtain its tokens.
tokenized_text = [text.split() for text in dataset['text']]

# Load NLTK's English stop-word list. On a fresh environment the corpus is not
# installed and stopwords.words() raises LookupError, so fetch it once and retry.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Drop stop words from every token list (set membership keeps this cheap).
processed_text = [
    [word for word in tokens if word not in stop_words]
    for tokens in tokenized_text
]

# Re-join the surviving tokens into one space-separated string per row.
joined_text = [' '.join(tokens) for tokens in processed_text]

dataset['processed_text'] = joined_text

print("Processed Text (first 5 rows):")
print(dataset[['text', 'processed_text']].head())

Processed Text (first 5 rows):
                                                text  \
0                    i loved the product its amazing   
1      terrible service i will never shop here again   
2      the quality is good but the delivery was late   
3   absolutely wonderful experience highly recommend   
4  product was damaged when it arrived very disap...   

                                     processed_text  
0                             loved product amazing  
1                       terrible service never shop  
2                        quality good delivery late  
3  absolutely wonderful experience highly recommend  
4              product damaged arrived disappointed  


### Task 3: Training a Classifier
#### Splitting data into training and testing sets and vectorizing the text

In [None]:
# Features are the stop-word-filtered strings; targets are the label column.
X, y = dataset['processed_text'], dataset['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Bag-of-words counts: the vocabulary is learned from the training rows only
# and then reused unchanged to encode the test rows.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

#### Training the model, predicting and analysing the accuracy of the model

In [16]:
# Fit a logistic-regression classifier on the vectorized training rows.
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(random_state=42).fit(X_train_vectorized, y_train)

# Predict the held-out rows and compare the predictions with the true labels.
y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy score: {accuracy:.2f}")


Accuracy score: 0.50


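- The accuracy score is 0.50, i.e. one of the two test samples was classified correctly; with a test set this small the score is only a rough indication of model quality.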

### Task 4: Model Evaluation 

In [19]:
# Confusion matrix: rows are true labels, columns are predicted labels, in
# sorted order of the labels that occur in y_test or y_pred.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Detailed evaluation using a classification report.
# zero_division=0 keeps the reported value (0.0) for a class that is never
# predicted, but suppresses the repeated undefined-metric warnings that the
# default setting emits for it.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

Confusion Matrix:
[[0 1]
 [0 1]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         1
    positive       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Results of evaluation
- The model achieved an accuracy score of 50%, which is marginal and suggests unreliable classification.
- The confusion matrix reveals that the model correctly predicted the positive class but completely failed for the negative class, demonstrating imbalanced performance and bias.

### How the confusion matrix helps:
- Errors like false positives and false negatives are clearly broken down in the confusion matrix.
- It draws attention to the model's poor performance in accurately predicting the negative class, which informs future developments such as class balance or model refinement.