In [1]:
!pip install pandas scikit-learn nltk

Defaulting to user installation because normal site-packages is not writeable


In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the NLTK resources used further down: the English stop-word list
# (text cleaning) and the VADER lexicon (sentiment scoring).
nltk.download('stopwords')
nltk.download('vader_lexicon')

# Load the dataset. The rest of the script reads its 'Data' (post text) and
# 'Labels' (category) columns.
data = pd.read_csv('/content/blogs.csv')

# Data Exploration
print(data.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
data.info()
print(data['Labels'].value_counts())

# Data Preprocessing
def preprocess_text(text):
    """Normalize one document: strip punctuation and lowercase it.

    Args:
        text: The raw document. Missing values (``None`` or a float NaN, as
            pandas produces for empty CSV cells) are treated as an empty
            document; any other non-string value is converted with ``str()``.

    Returns:
        The lowercased text with every character removed that is neither a
        word character (letter, digit, underscore) nor whitespace.
    """
    if not isinstance(text, str):
        # NaN is the only float that compares unequal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.lower()  # Convert to lowercase

# Normalize every post (punctuation stripped, lowercased).
data['Cleaned_Data'] = data['Data'].apply(preprocess_text)

# Tokenization and removing stopwords
stop_words = set(stopwords.words('english'))


def _strip_stop_words(document):
    """Return `document` without English stop words, tokens joined by single spaces."""
    kept_tokens = (token for token in document.split() if token not in stop_words)
    return ' '.join(kept_tokens)


data['Cleaned_Data'] = data['Cleaned_Data'].apply(_strip_stop_words)

# Split the data into training and test sets *before* vectorizing, so the
# TF-IDF vocabulary and IDF weights are learned from training posts only.
# Fitting the vectorizer on the full corpus leaks test-set statistics into the
# features and makes the evaluation optimistic.
y = data['Labels']
text_train, text_test, y_train, y_test = train_test_split(
    data['Cleaned_Data'], y, test_size=0.2, random_state=42
)

# Feature Extraction using TF-IDF: fit on the training text, then project the
# test text into the same feature space.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus matrix in the training-derived feature space, kept so that any
# code referring to `X` keeps working.
X = vectorizer.transform(data['Cleaned_Data'])

# Naive Bayes Model for Text Classification.
# fit() returns the fitted estimator, so construction and training chain.
model = MultinomialNB().fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluation: overall accuracy first, then each titled report in turn.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
for heading, build_report in (
    ('Classification Report:', classification_report),
    ('Confusion Matrix:', confusion_matrix),
):
    print(heading)
    print(build_report(y_test, y_pred))

# Sentiment Analysis
sia = SentimentIntensityAnalyzer()


def _compound_polarity(post):
    """Return the 'compound' entry of the analyzer's polarity scores for one raw post."""
    return sia.polarity_scores(post)['compound']


# Scores are computed on the raw 'Data' column, not the cleaned text.
data['Sentiment'] = data['Data'].apply(_compound_polarity)

# Categorizing sentiments
def categorize_sentiment(score):
    """Map a compound sentiment score to a label.

    Args:
        score: Numeric sentiment score.

    Returns:
        'Positive' when score >= 0.05, 'Negative' when score <= -0.05,
        otherwise 'Neutral'.
    """
    # Guard clauses: each threshold returns as soon as it matches.
    if score >= 0.05:
        return 'Positive'
    if score <= -0.05:
        return 'Negative'
    return 'Neutral'

# Label every post from its compound score.
data['Sentiment_Category'] = data['Sentiment'].apply(categorize_sentiment)

# Examining sentiment distribution across categories: count posts per
# (label, sentiment) pair, then pivot the sentiment level into columns,
# filling combinations that never occur with 0.
posts_per_pair = data.groupby(['Labels', 'Sentiment_Category']).size()
sentiment_distribution = posts_per_pair.unstack(fill_value=0)
print(sentiment_distribution)

# Summary of findings
print("Sentiment distribution")
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

# Download the NLTK data this script depends on: English stop words for the
# cleaning step and the VADER lexicon for the sentiment step.
nltk.download('stopwords')
nltk.download('vader_lexicon')

# Load the dataset ('Data' holds the post text, 'Labels' the category, as
# used by the steps below).
data = pd.read_csv('/content/blogs.csv')

# Data Exploration
print(data.head())
# info() writes its summary to stdout and returns None; printing its return
# value would only append a meaningless "None" line.
data.info()
print(data['Labels'].value_counts())

# Data Preprocessing
def preprocess_text(text):
    """Strip punctuation from a document and lowercase it.

    Args:
        text: The raw document. ``None`` and float NaN (what pandas yields
            for an empty CSV cell) are treated as an empty document; other
            non-string values are converted with ``str()`` first.

    Returns:
        The lowercased text containing only word characters (letters,
        digits, underscore) and whitespace.
    """
    if not isinstance(text, str):
        # A float NaN is detected by its defining property: it is not equal
        # to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.lower()  # Convert to lowercase

# First pass: punctuation removal and lowercasing via preprocess_text.
data['Cleaned_Data'] = data['Data'].apply(preprocess_text)

# Tokenization and removing stopwords: split on whitespace, keep the tokens
# that are not English stop words, and re-join them with single spaces.
stop_words = set(stopwords.words('english'))
data['Cleaned_Data'] = data['Cleaned_Data'].apply(
    lambda post: ' '.join(filter(lambda token: token not in stop_words, post.split()))
)

# Split the data into training and test sets first. The TF-IDF vectorizer
# must only see training posts when it learns its vocabulary and IDF weights;
# fitting it on the whole corpus would leak information about the test set
# into the features.
y = data['Labels']
text_train, text_test, y_train, y_test = train_test_split(
    data['Cleaned_Data'], y, test_size=0.2, random_state=42
)

# Feature Extraction using TF-IDF: learn the feature space from the training
# text and reuse it, unchanged, for the test text.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# `X` is retained for compatibility: the whole corpus expressed in the
# training-derived feature space.
X = vectorizer.transform(data['Cleaned_Data'])

# Naive Bayes Model for Text Classification
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluation: each metric is computed right before it is printed.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

print('Classification Report:')
per_class_report = classification_report(y_test, y_pred)
print(per_class_report)

print('Confusion Matrix:')
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)

# Visualization of Confusion Matrix
# Fix the row/column order explicitly and reuse it for the tick labels.
# Without `labels`, confusion_matrix only covers the labels present in y_test
# or y_pred; that set can differ from model.classes_ (a class missing from
# the test split, or a test label never seen in training), which would leave
# the heatmap's tick labels misaligned with the matrix.
class_labels = sorted(set(model.classes_) | set(y_test))
confusion_mat = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(10, 7))
sns.heatmap(
    confusion_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Sentiment Analysis: score each raw post and keep only the 'compound' value.
sia = SentimentIntensityAnalyzer()
score_post = sia.polarity_scores
data['Sentiment'] = data['Data'].apply(lambda post: score_post(post)['compound'])

# Categorizing sentiments
def categorize_sentiment(score):
    """Return the sentiment label for a compound score.

    The positive threshold (>= 0.05) is checked first, then the negative
    threshold (<= -0.05); a score matching neither is 'Neutral'.
    """
    return (
        'Positive' if score >= 0.05
        else 'Negative' if score <= -0.05
        else 'Neutral'
    )

# Turn each compound score into a Positive / Negative / Neutral label.
sentiment_labels = data['Sentiment'].apply(categorize_sentiment)
data['Sentiment_Category'] = sentiment_labels

# Examining sentiment distribution across categories: one row per label, one
# column per sentiment category, 0 where a combination has no posts.
posts_by_label_and_sentiment = data.groupby(['Labels', 'Sentiment_Category'])
sentiment_distribution = posts_by_label_and_sentiment.size().unstack(fill_value=0)

# Visualization of Sentiment Distribution: one stacked bar per category.
bar_options = {'kind': 'bar', 'stacked': True, 'figsize': (10, 7)}
sentiment_distribution.plot(**bar_options)

# Axis text, legend and tick rotation are independent decorations of the
# current axes, applied before the figure is shown.
plt.xlabel('Categories')
plt.ylabel('Number of Posts')
plt.title('Sentiment Distribution by Category')
plt.legend(title='Sentiment Category')
plt.xticks(rotation=45)
plt.show()

# Summary of findings: heading line followed by the table itself.
print("Sentiment distribution by category:", sentiment_distribution, sep='\n')