In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from textblob import TextBlob

In [2]:
df = pd.read_csv('blogs.csv')

In [3]:
# EDA
df.head()

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism


In [4]:
df.isnull()

Unnamed: 0,Data,Labels
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
1995,False,False
1996,False,False
1997,False,False
1998,False,False


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [6]:
df.shape

(2000, 2)

In [7]:
# 2. Preprocessing function
def clean_text(text):
    if not isinstance(text, str):
        return ""
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters (keeping basic text)
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [8]:
# Apply cleaning
df['Cleaned_Data'] = df['Data'].apply(clean_text)

In [9]:
# Feature Extraction (TF-IDF)
# Using stop_words='english' removes common stopwords automatically
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X = tfidf.fit_transform(df['Cleaned_Data'])
y = df['Labels']

In [10]:
# 3. Model Building
# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Train Naive Bayes Classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

In [12]:
# Predictions
y_pred = nb_classifier.predict(X_test)

In [13]:
# 4. Evaluation
print("\n--- Model Evaluation ---")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


--- Model Evaluation ---
Accuracy: 0.8275

Classification Report:

                          precision    recall  f1-score   support

             alt.atheism       0.54      0.83      0.65        18
           comp.graphics       0.75      0.83      0.79        18
 comp.os.ms-windows.misc       0.86      0.82      0.84        22
comp.sys.ibm.pc.hardware       0.80      0.80      0.80        25
   comp.sys.mac.hardware       0.79      0.90      0.84        21
          comp.windows.x       0.91      0.84      0.88        25
            misc.forsale       0.88      0.78      0.82        18
               rec.autos       0.89      0.94      0.92        18
         rec.motorcycles       0.94      0.94      0.94        16
      rec.sport.baseball       0.71      0.94      0.81        18
        rec.sport.hockey       0.94      1.00      0.97        15
               sci.crypt       0.95      0.95      0.95        19
         sci.electronics       0.62      0.62      0.62        16
       

In [14]:
# Part 3: Sentiment Analysis
def get_sentiment(text):
    # TextBlob returns polarity: -1.0 (negative) to 1.0 (positive)
    analysis = TextBlob(str(text))
    polarity = analysis.sentiment.polarity
    if polarity > 0.1:
        return 'Positive'
    elif polarity < -0.1:
        return 'Negative'
    else:
        return 'Neutral'

In [15]:
# Apply sentiment analysis to the original 'Data' column
df['Sentiment'] = df['Data'].apply(get_sentiment)

print("\n--- Sentiment Analysis Distribution ---")
sentiment_counts = df['Sentiment'].value_counts()
print(sentiment_counts)


--- Sentiment Analysis Distribution ---
Sentiment
Neutral     1081
Positive     782
Negative     137
Name: count, dtype: int64


In [16]:
# Calculate sentiment distribution per category
sentiment_by_cat = pd.crosstab(df['Labels'], df['Sentiment'], normalize='index') * 100
print("\nSentiment Distribution by Category (%):\n", sentiment_by_cat)


Sentiment Distribution by Category (%):
 Sentiment                 Negative  Neutral  Positive
Labels                                               
alt.atheism                    5.0     60.0      35.0
comp.graphics                 10.0     51.0      39.0
comp.os.ms-windows.misc        9.0     48.0      43.0
comp.sys.ibm.pc.hardware       6.0     49.0      45.0
comp.sys.mac.hardware          7.0     50.0      43.0
comp.windows.x                10.0     53.0      37.0
misc.forsale                   7.0     33.0      60.0
rec.autos                      7.0     48.0      45.0
rec.motorcycles                7.0     49.0      44.0
rec.sport.baseball             8.0     53.0      39.0
rec.sport.hockey              13.0     56.0      31.0
sci.crypt                      3.0     62.0      35.0
sci.electronics                4.0     56.0      40.0
sci.med                       11.0     56.0      33.0
sci.space                      5.0     53.0      42.0
soc.religion.christian         1.0     6