## Import Libraries

In [43]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import re

## Load Dataset

In [44]:
# Load the dataset
df = pd.read_csv("blogs.csv", encoding='latin1')
df

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism
...,...,...
1995,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,talk.religion.misc
1996,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc
1997,Xref: cantaloupe.srv.cs.cmu.edu talk.origins:4...,talk.religion.misc
1998,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc


## DATA EXPLORATION AND PREPROCESSING

In [45]:
df.head()

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism


In [46]:
df.shape

(2000, 2)

In [47]:
df.isnull().sum()

Data      0
Labels    0
dtype: int64

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [49]:
df.describe()

Unnamed: 0,Data,Labels
count,2000,2000
unique,2000,20
top,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
freq,1,100


In [50]:
# Drop rows with any missing values in the 'Data' and 'Labels' columns
df = df.dropna(subset=['Data', 'Labels'])


In [51]:
# Data Preprocessing
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.strip()  # Remove leading/trailing whitespace
    return text

In [52]:
# Apply the cleaning function to the 'Data' column
df['cleaned_data'] = df['Data'].apply(clean_text)

In [53]:
# Tokenization and Stopword Removal
stop_words = set(stopwords.words('english'))

In [54]:
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

df['processed_data'] = df['cleaned_data'].apply(remove_stopwords)


In [55]:
# Feature Extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(df['processed_data'])
y = df['Labels']

print("\nProcessed Data and TF-IDF Matrix Shape:")
print(df[['Data', 'processed_data']].head())
print(f"TF-IDF matrix shape: {X.shape}")


Processed Data and TF-IDF Matrix Shape:
                                                Data  \
0  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...   
1  Newsgroups: alt.atheism\nPath: cantaloupe.srv....   
2  Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...   
3  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...   
4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...   

                                      processed_data  
0  path cantaloupesrvcscmuedumagnesiumclubcccmued...  
1  newsgroups altatheism path cantaloupesrvcscmue...  
2  path cantaloupesrvcscmuedudasnewsharvardedunoc...  
3  path cantaloupesrvcscmuedumagnesiumclubcccmued...  
4  xref cantaloupesrvcscmuedu altatheism talkreli...  
TF-IDF matrix shape: (2000, 5000)


## Naive Bayes Model for Text Classification

In [56]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [57]:
# Implement and train the Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)


In [58]:
# Make predictions on the test set
y_pred = nb_model.predict(X_test)

print("\nNaive Bayes Classification Report:")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")


Naive Bayes Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.48      0.83      0.61        18
           comp.graphics       0.75      0.83      0.79        18
 comp.os.ms-windows.misc       0.91      0.91      0.91        22
comp.sys.ibm.pc.hardware       0.81      0.84      0.82        25
   comp.sys.mac.hardware       0.83      0.90      0.86        21
          comp.windows.x       1.00      0.84      0.91        25
            misc.forsale       1.00      0.78      0.88        18
               rec.autos       0.94      0.94      0.94        18
         rec.motorcycles       0.88      0.94      0.91        16
      rec.sport.baseball       0.76      0.89      0.82        18
        rec.sport.hockey       0.88      1.00      0.94        15
               sci.crypt       0.90      1.00      0.95        19
         sci.electronics       0.67      0.75      0.71        16
                 sci.med       0.88    

## Sentiment Analysis

In [59]:
from textblob import TextBlob

# Perform sentiment analysis on the 'Data' column
df['sentiment_score'] = df['Data'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)

In [60]:
# Categorize sentiments
def get_sentiment(score):
    if score > 0:
        return 'Positive'
    elif score < 0:
        return 'Negative'
    else:
        return 'Neutral'

df['sentiment'] = df['sentiment_score'].apply(get_sentiment)

print("\nSentiment Analysis Results:")
print(df[['Data', 'sentiment']].head())


Sentiment Analysis Results:
                                                Data sentiment
0  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  Positive
1  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  Negative
2  Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...  Positive
3  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  Positive
4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...  Positive


In [61]:
# Examine the distribution of sentiments
sentiment_distribution = df['sentiment'].value_counts(normalize=True)
print("\nSentiment Distribution:")
print(sentiment_distribution)


Sentiment Distribution:
sentiment
Positive    0.7715
Negative    0.2285
Name: proportion, dtype: float64


## Evaluation

In [62]:
print("\nEvaluation Summary:")
print("The Naive Bayes model performance is as follows:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report (Precision, Recall, F1-Score):")
print(classification_report(y_test, y_pred))



Evaluation Summary:
The Naive Bayes model performance is as follows:
Accuracy: 0.8400
Classification Report (Precision, Recall, F1-Score):
                          precision    recall  f1-score   support

             alt.atheism       0.48      0.83      0.61        18
           comp.graphics       0.75      0.83      0.79        18
 comp.os.ms-windows.misc       0.91      0.91      0.91        22
comp.sys.ibm.pc.hardware       0.81      0.84      0.82        25
   comp.sys.mac.hardware       0.83      0.90      0.86        21
          comp.windows.x       1.00      0.84      0.91        25
            misc.forsale       1.00      0.78      0.88        18
               rec.autos       0.94      0.94      0.94        18
         rec.motorcycles       0.88      0.94      0.91        16
      rec.sport.baseball       0.76      0.89      0.82        18
        rec.sport.hockey       0.88      1.00      0.94        15
               sci.crypt       0.90      1.00      0.95        19
 

## The accuracy of the model indicates its overall correctness in predicting categories.
## Precision measures the accuracy of positive predictions, recall measures the model ability to find all postive samples,and the F1 score is the weighted average of precision.
## The sentiment analysis results provides insight into the emotional tone of the blog posts, which could be used to enhance the classification process.