In [11]:
# Import necessary libraries
import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations
import matplotlib.pyplot as plt  # For plotting
import seaborn as sns  # For advanced visualizations
from sklearn.model_selection import train_test_split  # For splitting data
from sklearn.naive_bayes import MultinomialNB  # For Naive Bayes model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score  # For model evaluation
from sklearn.feature_extraction.text import TfidfVectorizer  # For text vectorization
from nltk.corpus import stopwords  # For removing stopwords
from nltk.tokenize import word_tokenize  # For tokenizing text
import re  # For regex-based text cleaning
import nltk

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Configure Jupyter Notebook for inline plotting
%matplotlib inline


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
# Synthetic dataset (or replace with your own dataset)
data = {
    'Post_ID': ['001', '002', '003', '004', '005'],
    'User_ID': ['A1', 'A2', 'A3', 'A4', 'A5'],
    'Caption': [
        "I feel so anxious every day, it's hard to focus on anything.",
        "Been feeling really down lately, can't get out of bed.",
        "Just finished a great workout, feeling pumped and happy!",
        "Why does everything feel so overwhelming and exhausting?",
        "Life is good, enjoying every moment to the fullest!"
    ],
    'Date': ['2024-09-01', '2024-09-02', '2024-09-03', '2024-09-04', '2024-09-05'],
    'Condition': ['anxiety', 'depression', 'none', 'anxiety', 'none']
}

# Create a DataFrame
df = pd.DataFrame(data)

# Display the first few rows of the dataset
df.head()


Unnamed: 0,Post_ID,User_ID,Caption,Date,Condition
0,1,A1,"I feel so anxious every day, it's hard to focu...",2024-09-01,anxiety
1,2,A2,"Been feeling really down lately, can't get out...",2024-09-02,depression
2,3,A3,"Just finished a great workout, feeling pumped ...",2024-09-03,none
3,4,A4,Why does everything feel so overwhelming and e...,2024-09-04,anxiety
4,5,A5,"Life is good, enjoying every moment to the ful...",2024-09-05,none


In [13]:
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\[.*?\]', '', text)  # Remove text in brackets
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = ' '.join(word for word in text.split() if word not in stopwords.words('english'))  # Remove stopwords
    return text

# Apply text cleaning
df['Cleaned_Text'] = df['Text'].apply(clean_text)

# Display the cleaned text data
df[['Post_ID', 'Cleaned_Text']].head()


KeyError: 'Text'

In [None]:
# Vectorize text using TF-IDF
vectorizer = TfidfVectorizer(max_features=100)  # Limit to top 100 features
X_text = vectorizer.fit_transform(df['Cleaned_Text']).toarray()

# Convert to DataFrame for merging
text_df = pd.DataFrame(X_text, columns=vectorizer.get_feature_names_out())

# Merge the original DataFrame with the text DataFrame
df_combined = pd.concat([df, text_df], axis=1)

# Display the combined DataFrame
df_combined.head()


In [None]:
# Define features (X) and target (y)
X = df_combined.drop(['Post_ID', 'User_ID', 'Text', 'Date', 'Cleaned_Text', 'Condition'], axis=1)
y = df_combined['Condition']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
model = MultinomialNB()

# Train the model
model.fit(X_train, y_train)


In [18]:
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Confusion Matrix:
[[0 1]
 [0 0]]

Classification Report:
              precision    recall  f1-score   support

  depression       0.00      0.00      0.00       1.0
        none       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0

Accuracy: 0.00


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
