In [None]:
#importing libraries
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
import nltk

# Fetch the NLTK resources used below: the 'punkt'/'punkt_tab' tokenizer
# data packages and the 'stopwords' corpus.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

# Read the CSV into a DataFrame; later cells read its 'Data' and 'Labels' columns.
file_path = '/content/blogs (1).csv'
data = pd.read_csv(file_path)

# Lazily-built cache of the English stopword set (filled on first use) so the
# corpus list is not re-read and re-hashed for every document.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stopword set, loading it from the corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


# Preprocessing function
def preprocess_text(text):
    """Normalise one document into a space-separated string of content tokens.

    Steps:
      1. Drop ``Path:``, ``Newsgroups:`` and ``Xref:`` header lines.
      2. Replace every run of non-word characters and underscores with a
         single space, then lowercase.
      3. Tokenize with ``word_tokenize`` and remove English stopwords.

    Parameters
    ----------
    text : str or None or float
        The raw document. ``None`` and NaN (how pandas represents an empty
        CSV cell) are treated as an empty document; any other non-string
        value is converted with ``str()``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``''`` when nothing
        is left after cleaning.
    """
    # Missing cells would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Remove metadata headers. The pattern is anchored to the start of a line
    # (MULTILINE) so a body sentence that merely contains "path:" is kept, and
    # it ends at '$' rather than '\n' so a header on the last line is removed too.
    text = re.sub(r'(?im)^[ \t]*(?:path|newsgroups|xref):.*$', '', text)

    # Remove punctuation and convert to lowercase
    text = re.sub(r'[\W_]+', ' ', text).lower()

    # Nothing left to tokenize.
    if not text.strip():
        return ''

    # Tokenize, then remove stopwords
    tokens = word_tokenize(text)
    stop_words = _english_stop_words()
    return ' '.join(word for word in tokens if word not in stop_words)

# Clean every document in 'Data' and store the result in a new 'Cleaned_Data' column.
data['Cleaned_Data'] = data['Data'].map(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Target labels
y = data['Labels']

# Split the cleaned documents into training and testing sets BEFORE fitting
# the vectorizer. Fitting TF-IDF on the full corpus would let the test
# documents influence the vocabulary and IDF weights (data leakage) and make
# the reported scores optimistic. The row count, test_size and random_state
# are unchanged, so the train/test row assignment should match the previous split.
docs_train, docs_test, y_train, y_test = train_test_split(
    data['Cleaned_Data'], y, test_size=0.2, random_state=42
)

# Feature extraction using TF-IDF: learn vocabulary/IDF from the training
# documents only, then apply the fitted transform to the held-out documents.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(docs_train)
X_test = vectorizer.transform(docs_test)

# Full-corpus feature matrix, kept under its original name in case later
# cells use it; it is not used for training or evaluation here.
X = vectorizer.transform(data['Cleaned_Data'])

# Train Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predictions
y_pred = nb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Sentiment Analysis
def analyze_sentiment(text):
    """Label a document by the sign of its TextBlob polarity score.

    Parameters
    ----------
    text : str
        The document to score.

    Returns
    -------
    str
        'Positive' when the polarity is above zero, 'Negative' when it is
        below zero, and 'Neutral' otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    # Guard clauses: handle each signed case, fall through to neutral.
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'

# Apply sentiment analysis
# Score the original text rather than 'Cleaned_Data'. The cleaned column has
# punctuation and NLTK stopwords removed, and that stopword list includes
# negations and intensifiers such as "not", "no" and "very". Removing them
# before scoring can flip or flatten polarity (e.g. "not good" -> "good").
# Missing cells are coerced to empty strings so TextBlob always gets a str.
raw_text = data['Data'].fillna('').astype(str)
data['Sentiment'] = raw_text.apply(analyze_sentiment)

# Sentiment distribution by category: one row per label, one column per
# sentiment class, with 0 where a combination does not occur.
sentiment_distribution = data.groupby(['Labels', 'Sentiment']).size().unstack(fill_value=0)
print("\nSentiment Distribution by Category:\n", sentiment_distribution)


Accuracy: 0.7225
Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.56      0.83      0.67        18
           comp.graphics       0.65      0.72      0.68        18
 comp.os.ms-windows.misc       0.76      0.86      0.81        22
comp.sys.ibm.pc.hardware       0.64      0.56      0.60        25
   comp.sys.mac.hardware       0.73      0.52      0.61        21
          comp.windows.x       0.83      0.40      0.54        25
            misc.forsale       0.65      0.61      0.63        18
               rec.autos       0.67      0.89      0.76        18
         rec.motorcycles       0.75      0.75      0.75        16
      rec.sport.baseball       0.65      0.83      0.73        18
        rec.sport.hockey       0.58      1.00      0.73        15
               sci.crypt       0.72      0.95      0.82        19
         sci.electronics       0.50      0.50      0.50        16
                 sci.med       0.8