In [15]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
import nltk

# Download stopwords if necessary
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load dataset
df = pd.read_csv("blogs.csv")

In [5]:
# Function to extract main content, excluding metadata
def extract_content(text):
    # Split text at first empty line (assumes body follows metadata)
    content = re.split(r'\n\s*\n', text, maxsplit=1)
    return content[1] if len(content) > 1 else content[0]

# Apply the extraction function
df['Cleaned_Data'] = df['Data'].apply(extract_content)

In [7]:
# Data Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenize
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(tokens)

# Apply text cleaning
df['Cleaned_Data'] = df['Cleaned_Data'].apply(preprocess_text)

In [8]:
# Feature Extraction using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['Cleaned_Data']).toarray()
y = df['Labels']

In [9]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Naive Bayes Model for Text Classification
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

In [11]:
# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.655
Precision: 0.6824175258136331
Recall: 0.655
F1 Score: 0.652850417573629

Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.34      0.61      0.44        18
           comp.graphics       0.58      0.61      0.59        18
 comp.os.ms-windows.misc       0.68      0.77      0.72        22
comp.sys.ibm.pc.hardware       0.56      0.56      0.56        25
   comp.sys.mac.hardware       0.55      0.52      0.54        21
          comp.windows.x       0.82      0.56      0.67        25
            misc.forsale       0.89      0.44      0.59        18
               rec.autos       0.82      0.78      0.80        18
         rec.motorcycles       0.74      0.88      0.80        16
      rec.sport.baseball       0.67      0.78      0.72        18
        rec.sport.hockey       0.71      1.00      0.83        15
               sci.crypt       0.73      0.84      0.78        19
         sci.electronics    

In [12]:
# Sentiment Analysis
def get_sentiment(text):
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    if polarity > 0:
        return 'Positive'
    elif polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

df['Sentiment'] = df['Cleaned_Data'].apply(get_sentiment)

In [13]:
# Sentiment Distribution Analysis
sentiment_counts = df.groupby('Labels')['Sentiment'].value_counts(normalize=True).unstack()
print("\nSentiment Distribution Across Categories:\n", sentiment_counts)


Sentiment Distribution Across Categories:
 Sentiment                 Negative  Neutral  Positive
Labels                                               
alt.atheism                   0.30     0.03      0.67
comp.graphics                 0.11     0.08      0.81
comp.os.ms-windows.misc       0.12     0.08      0.80
comp.sys.ibm.pc.hardware      0.12     0.02      0.86
comp.sys.mac.hardware         0.15     0.05      0.80
comp.windows.x                0.12     0.05      0.83
misc.forsale                  0.12     0.07      0.81
rec.autos                     0.17     0.06      0.77
rec.motorcycles               0.21     0.04      0.75
rec.sport.baseball            0.32     0.05      0.63
rec.sport.hockey              0.35     0.04      0.61
sci.crypt                     0.21     0.01      0.78
sci.electronics               0.13     0.05      0.82
sci.med                       0.30     0.02      0.68
sci.space                     0.14     0.05      0.81
soc.religion.christian        0.20    

In [14]:
# Save Results and Cleaned Dataset
df.to_csv("blogs_categories_with_sentiments.csv", index=False)
print("\nResults saved to blogs_categories_with_sentiments.csv")


Results saved to blogs_categories_with_sentiments.csv
