In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords

# Load the dataset
df = pd.read_csv(r"D:\DS\NLP and  Naive Bayes\blogs.csv")

# Basic exploration
print(df.head())
print(df.info())
print(df['Labels'].value_counts())

# Text cleaning function
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.strip()  # Remove whitespace
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    return text

# Apply the cleaning function to the Data column
df['Data'] = df['Data'].apply(clean_text)

# Tokenization and stopwords removal
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df['Data'] = df['Data'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['Data']).toarray()
y = df['Labels']


                                                Data       Labels
0  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism
1  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  alt.atheism
2  Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...  alt.atheism
3  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism
4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...  alt.atheism
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB
None
Labels
alt.atheism                 100
comp.graphics               100
talk.politics.misc          100
talk.politics.mideast       100
talk.politics.guns          100
soc.religion.christian      100
sci.space                   100
sci.med                     100
sci.electronics             100
sci.crypt    

[nltk_data] Downloading package stopwords to C:\Users\GAURI
[nltk_data]     DUBEY\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the Naive Bayes model
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Make predictions on the test set
y_pred = nb.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print(classification_report(y_test, y_pred))


Accuracy: 0.8317
Precision: 0.8474
Recall: 0.8317
F1 Score: 0.8212
                          precision    recall  f1-score   support

             alt.atheism       0.53      0.96      0.69        24
           comp.graphics       0.73      0.86      0.79        28
 comp.os.ms-windows.misc       0.94      0.88      0.91        33
comp.sys.ibm.pc.hardware       0.80      0.67      0.73        36
   comp.sys.mac.hardware       0.94      0.89      0.91        36
          comp.windows.x       0.97      0.81      0.88        36
            misc.forsale       0.69      0.83      0.75        24
               rec.autos       0.96      0.87      0.92        31
         rec.motorcycles       0.66      0.95      0.78        22
      rec.sport.baseball       0.94      0.94      0.94        32
        rec.sport.hockey       0.96      1.00      0.98        25
               sci.crypt       0.80      1.00      0.89        24
         sci.electronics       0.72      0.78      0.75        27
        

In [5]:
from textblob import TextBlob

# Function to get sentiment
def get_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'Positive'
    elif analysis.sentiment.polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

# Apply sentiment analysis to the Data column
df['Sentiment'] = df['Data'].apply(get_sentiment)

# Analyze the sentiment distribution
sentiment_distribution = df['Sentiment'].value_counts()
print(sentiment_distribution)

# Sentiment distribution across categories
sentiment_category_distribution = df.groupby('Labels')['Sentiment'].value_counts(normalize=True).unstack()
print(sentiment_category_distribution)


Sentiment
Positive    1453
Negative     544
Neutral        3
Name: count, dtype: int64
Sentiment                 Negative  Neutral  Positive
Labels                                               
alt.atheism                   0.35      NaN      0.65
comp.graphics                 0.27      NaN      0.73
comp.os.ms-windows.misc       0.23      NaN      0.77
comp.sys.ibm.pc.hardware      0.19      NaN      0.81
comp.sys.mac.hardware         0.26      NaN      0.74
comp.windows.x                0.20     0.02      0.78
misc.forsale                  0.21      NaN      0.79
rec.autos                     0.24      NaN      0.76
rec.motorcycles               0.28      NaN      0.72
rec.sport.baseball            0.35      NaN      0.65
rec.sport.hockey              0.40      NaN      0.60
sci.crypt                     0.19      NaN      0.81
sci.electronics               0.25      NaN      0.75
sci.med                       0.34      NaN      0.66
sci.space                     0.28      NaN      

***The Naive Bayes model achieved 83.17%, correctly categorizing most blog posts***
Precision: 84.74% (few false positives)
Recall: 83.17% (mostly accurate with some missed categories)
F1-Score: 82.12% (good balance of precision and recall)