# Mounting Google Drive to Access Files

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading Data from Google Drive into a Pandas DataFrame

In [None]:
import pandas as pd

# Location of the CSV file inside the mounted Google Drive
file_path = '/content/drive/MyDrive/combinedData.csv'

# Read the CSV into a DataFrame and print its first rows as a quick sanity check
data = pd.read_csv(file_path)
print(data.head())

  id                                          statement   status
0  0                                         oh my gosh  Anxiety
1  1  trouble sleeping, confused mind, restless hear...  Anxiety
2  2  All wrong, back off dear, forward doubt. Stay ...  Anxiety
3  3  I've shifted my focus to something else but I'...  Anxiety
4  4  I'm restless and restless, it's been a month n...  Anxiety


# Preparing for Text Preprocessing and Model Training

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
import nltk

# Download every NLTK resource used by this notebook
for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# Shared stemmer and lemmatizer instances used by the text preprocessing step
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Text Preprocessing: Tokenization, Lemmatization, and Stemming without stopwords removal

In [None]:
import re
def preprocess_text(text):
    """Normalize one piece of raw text into a space-joined string of tokens.

    Steps: lowercase, drop every character that is not an ASCII letter or
    whitespace, tokenize, then stem and lemmatize each token (in that order).

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        str: The processed tokens joined by single spaces, or ``""`` when the
        input is missing or contains no letters.
    """
    # Missing values would otherwise fail on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Lowercase the text
    text = str(text).lower()

    # Keep only letters and whitespace (digits, punctuation and symbols are removed)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Nothing left to tokenize: same result as tokenizing, without the call
    if not text.strip():
        return ""

    # Tokenize the text
    tokens = word_tokenize(text)

    # Stem first, then lemmatize the stem, in a single pass over the tokens
    tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens]

    # Rejoin the tokens into a single string
    return " ".join(tokens)

# Preparing Data: Handling Missing Values and Preprocessing Text

In [None]:
# Keep only the text and label columns of 'data', discarding any row
# where either the statement or the status is missing
data = data[['statement', 'status']].dropna()

# Targets: the status labels (7 classes)
y = data['status']

# Features: the statement text after cleaning with preprocess_text
X = data['statement'].apply(preprocess_text)

# Sanity check: features and labels must have the same number of rows
print(f"Length of X: {len(X)}, Length of y: {len(y)}")

Length of X: 52680, Length of y: 52680


# Label Encoding and TF-IDF Vectorization for Text Classification

In [None]:
# Learn the label -> integer mapping, then apply it to the targets
# (e.g., 'Anxiety' -> 0, 'Depression' -> 1, etc.)
encoder = LabelEncoder().fit(y)
y_encoded = encoder.transform(y)

# TF-IDF over unigrams and bigrams, capped at the 5000 top-ranked features
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# Splitting Data into Training and Testing Sets

In [None]:
# Split the cleaned text itself (not a TF-IDF matrix fitted on every row),
# so that no information from the test rows reaches the feature extraction
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Fit the TF-IDF vocabulary and IDF weights on the training text only,
# then apply that fitted transformation unchanged to the test text
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Logistic Regression Configured with the Best Hyperparameters from GridSearchCV

In [None]:
import time

from sklearn.multiclass import OneVsRestClassifier

# Best hyperparameters, copied by hand from earlier GridSearch results
# (the search itself is not re-run in this notebook)
best_params = {
    'C': 10,
    'max_iter': 1000,
    'solver': 'saga'
}

# One-vs-rest logistic regression built from the best hyperparameters.
# - random_state fixes the shuffling done by the 'saga' solver, so repeated
#   runs give the same model.
# - n_jobs is set on the one-vs-rest wrapper, which fits one binary
#   classifier per class and can run those fits in parallel.
best_logreg_model = OneVsRestClassifier(
    LogisticRegression(random_state=42, **best_params),
    n_jobs=-1,
)

# Timestamp read later when the training time is reported
start_time = time.time()


# Model Evaluation and Performance Metrics

In [None]:
# Start the timer immediately before fitting so the reported duration covers
# only the training itself, not the time elapsed since an earlier cell ran
start_time = time.time()

# Train the model using the training data
best_logreg_model.fit(X_train, y_train)
end_time = time.time()

# Evaluate performance on test set
y_pred = best_logreg_model.predict(X_test)

# Per-class precision / recall / F1, labelled with the original class names
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=encoder.classes_))

training_time = end_time - start_time
print(f"Training Time: {training_time / 60:.2f} Minutes")


Classification Report:
                      precision    recall  f1-score   support

             Anxiety       0.80      0.78      0.79       754
             Bipolar       0.85      0.75      0.79       554
          Depression       0.71      0.72      0.72      3058
              Normal       0.87      0.96      0.91      3325
Personality disorder       0.83      0.61      0.70       220
              Stress       0.67      0.50      0.57       530
            Suicidal       0.68      0.65      0.67      2095

            accuracy                           0.78     10536
           macro avg       0.77      0.71      0.74     10536
        weighted avg       0.77      0.78      0.77     10536

Training Time: 2.94 Minutes


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Overall metrics on the test set; precision, recall and F1 are averaged
# across classes weighted by each class's support
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Store the result under its own name so the imported confusion_matrix
# function is not overwritten by the array it returns
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.78
Precision: 0.77
Recall: 0.78
F1 Score: 0.77
Confusion Matrix:
[[ 590   14   60   52    8   25    5]
 [  22  415   69   22    4   16    6]
 [  53   39 2213  135    9   40  569]
 [  13    5   64 3183    0   27   33]
 [   5    6   33   25  134   14    3]
 [  53   10   80   98    7  265   17]
 [   1    2  589  128    0    8 1367]]


## Optional: Model Testing and Sentiment Prediction on New Data with Confidence

In [None]:
# Hand-written sample sentences used to try the trained model
sentences = [
    'I feel really overwhelmed and anxious these days.',
    'I am so happy and excited about the upcoming event!',
    'I can’t stop feeling sad and depressed lately.',
    'Everything seems fine, I feel completely normal.',
    'I am struggling with so much stress these days.',
    'My mind is constantly racing, I feel suicidal.',
    'I’ve been feeling confused and not myself lately.'
]

# Clean the sentences with the same preprocessing applied to the training text
processed_sentences = list(map(preprocess_text, sentences))

# Turn the cleaned sentences into TF-IDF features with the fitted vectorizer
X_new_tfidf = vectorizer.transform(processed_sentences)

# Per-class probabilities, used below as the confidence of each prediction
probabilities = best_logreg_model.predict_proba(X_new_tfidf)

# Predicted class ids, mapped back to their original label names
predictions = best_logreg_model.predict(X_new_tfidf)
predicted_labels = encoder.inverse_transform(predictions)

# Report each sentence with its predicted label and confidence
for sentence, label, prob in zip(sentences, predicted_labels, probabilities):
    # Confidence is the largest probability in this sentence's row
    confidence = prob.max()

    print(f"Sentence: {sentence}")
    print(f"Predicted Sentiment: {label} (Confidence: {confidence*100:.2f}%)")
    print("\n")

Sentence: I feel really overwhelmed and anxious these days.
Predicted Sentiment: Anxiety (Confidence: 60.79%)


Sentence: I am so happy and excited about the upcoming event!
Predicted Sentiment: Normal (Confidence: 88.19%)


Sentence: I can’t stop feeling sad and depressed lately.
Predicted Sentiment: Depression (Confidence: 88.42%)


Sentence: Everything seems fine, I feel completely normal.
Predicted Sentiment: Depression (Confidence: 77.73%)


Sentence: I am struggling with so much stress these days.
Predicted Sentiment: Stress (Confidence: 65.49%)


Sentence: My mind is constantly racing, I feel suicidal.
Predicted Sentiment: Suicidal (Confidence: 78.03%)


Sentence: I’ve been feeling confused and not myself lately.
Predicted Sentiment: Normal (Confidence: 89.23%)


