In [1]:
import pandas as pd
from sklearn.utils import shuffle
from IPython.display import display, HTML

# Load the Excel file
file_path = '/kaggle/input/wellness-dataset/wellness_data.xlsx' 
sheet_names = ['IVA', 'PA', 'SA', 'SEA']  

dataframes = []
for idx, sheet_name in enumerate(sheet_names):
    # Read the sheet
    df = pd.read_excel(file_path, sheet_name=sheet_name)
    df = df.iloc[1:]
    df['Aspect'] = idx
    dataframes.append(df)

combined_df = pd.concat(dataframes, ignore_index=True)
if 'Table 1' in combined_df.columns:
    combined_df.drop(columns=['Table 1'], inplace=True)
combined_df.columns = ['Text', 'Explanations', 'Aspect']
combined_df = shuffle(combined_df, random_state=42)
pd.set_option('display.max_columns', None) 
# display(HTML(combined_df.to_html(index=False)))

combined_df.head()


Unnamed: 0,Text,Explanations,Aspect
679,"""I’m tired of having the generic discussions w...",Tired of having the generic discussions with m...,2
1050,“I have been having less of the really concern...,really concerning/harmful thoughts/ incredibly...,3
901,“I wish I was never born.”,wish I was never born/,3
243,"“I was eating the cake and she said, that's wh...",I cried/ couldn’t stop/,1
328,“Over the last year my depression has gotten a...,My hands are losing strength/ anxiety/ suicida...,1


In [2]:
combined_df.shape

(1249, 3)

In [3]:
combined_df.to_csv('/kaggle/working/combined_with_sheet_index.csv', index=False)

In [4]:
# Combine 'Text' and 'Explanations' to create a single feature
combined_df['Text&Spans'] = combined_df['Text'] + " " + combined_df['Explanations']
combined_df.head()

Unnamed: 0,Text,Explanations,Aspect,Text&Spans
679,"""I’m tired of having the generic discussions w...",Tired of having the generic discussions with m...,2,"""I’m tired of having the generic discussions w..."
1050,“I have been having less of the really concern...,really concerning/harmful thoughts/ incredibly...,3,“I have been having less of the really concern...
901,“I wish I was never born.”,wish I was never born/,3,“I wish I was never born.” wish I was never born/
243,"“I was eating the cake and she said, that's wh...",I cried/ couldn’t stop/,1,"“I was eating the cake and she said, that's wh..."
328,“Over the last year my depression has gotten a...,My hands are losing strength/ anxiety/ suicida...,1,“Over the last year my depression has gotten a...


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(combined_df['Text&Spans'], combined_df['Aspect'], test_size=0.2, random_state=42)

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import numpy as np

In [7]:
# Define a function for cross-validation and reporting results
def evaluate_model_with_cross_validation(model, model_name):
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),  # Convert text to numerical features
        ('classifier', model)         # Add the classifier
    ])
    # Perform cross-validation
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')  # 5-fold cross-validation
    print(f"Cross-Validation Results for {model_name}:\n")
    print(f"Accuracy Scores: {scores}")
    print(f"Mean Accuracy: {np.mean(scores):.4f}")
    print(f"Standard Deviation: {np.std(scores):.4f}\n")

In [9]:
# Define a function to train and evaluate a model
def train_and_evaluate_model(model, model_name):
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),  # Convert text to numerical features
        ('classifier', model)         # Add the classifier
    ])
    # Train the model
    pipeline.fit(X_train, y_train)
    # Evaluate the model
    y_pred = pipeline.predict(X_test)
    print(f"Classification Report for {model_name}:\n")
    print(classification_report(y_test, y_pred))

In [10]:
# Logistic Regression
evaluate_model_with_cross_validation(LogisticRegression(max_iter=1000), "Logistic Regression")
train_and_evaluate_model(LogisticRegression(max_iter=1000), "Logistic Regression")

Cross-Validation Results for Logistic Regression:

Accuracy Scores: [0.665      0.57       0.57       0.59       0.64321608]
Mean Accuracy: 0.6076
Standard Deviation: 0.0392

Classification Report for Logistic Regression:

              precision    recall  f1-score   support

           0       1.00      0.24      0.38        34
           1       0.76      0.53      0.62        55
           2       0.72      0.71      0.71        75
           3       0.55      0.84      0.67        86

    accuracy                           0.65       250
   macro avg       0.76      0.58      0.60       250
weighted avg       0.71      0.65      0.63       250



In [11]:
# Naive Bayes
evaluate_model_with_cross_validation(MultinomialNB(), "Naive Bayes")
train_and_evaluate_model(MultinomialNB(), "Naive Bayes")

Cross-Validation Results for Naive Bayes:

Accuracy Scores: [0.59       0.495      0.525      0.52       0.52763819]
Mean Accuracy: 0.5315
Standard Deviation: 0.0314

Classification Report for Naive Bayes:

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        34
           1       0.83      0.27      0.41        55
           2       0.65      0.48      0.55        75
           3       0.44      0.90      0.59        86

    accuracy                           0.51       250
   macro avg       0.48      0.41      0.39       250
weighted avg       0.53      0.51      0.46       250



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Random Forest
evaluate_model_with_cross_validation(RandomForestClassifier(), "Random Forest")
train_and_evaluate_model(RandomForestClassifier(), "Random Forest")

Cross-Validation Results for Random Forest:

Accuracy Scores: [0.57       0.56       0.53       0.555      0.61306533]
Mean Accuracy: 0.5656
Standard Deviation: 0.0271

Classification Report for Random Forest:

              precision    recall  f1-score   support

           0       0.86      0.18      0.29        34
           1       0.65      0.40      0.49        55
           2       0.70      0.61      0.65        75
           3       0.50      0.84      0.63        86

    accuracy                           0.58       250
   macro avg       0.68      0.51      0.52       250
weighted avg       0.64      0.58      0.56       250



In [14]:
# Decision Tree
evaluate_model_with_cross_validation(DecisionTreeClassifier(), "Decision Tree")
train_and_evaluate_model(DecisionTreeClassifier(), "Decision Tree")

Cross-Validation Results for Decision Tree:

Accuracy Scores: [0.46       0.445      0.43       0.535      0.54271357]
Mean Accuracy: 0.4825
Standard Deviation: 0.0470

Classification Report for Decision Tree:

              precision    recall  f1-score   support

           0       0.41      0.38      0.39        34
           1       0.42      0.42      0.42        55
           2       0.51      0.48      0.49        75
           3       0.51      0.55      0.53        86

    accuracy                           0.48       250
   macro avg       0.46      0.46      0.46       250
weighted avg       0.48      0.48      0.48       250



In [13]:
# K-Nearest Neighbors
evaluate_model_with_cross_validation(KNeighborsClassifier(), "K-Nearest Neighbors")
train_and_evaluate_model(KNeighborsClassifier(), "K-Nearest Neighbors")

Cross-Validation Results for K-Nearest Neighbors:

Accuracy Scores: [0.65      0.54      0.525     0.585     0.6080402]
Mean Accuracy: 0.5816
Standard Deviation: 0.0454

Classification Report for K-Nearest Neighbors:

              precision    recall  f1-score   support

           0       0.68      0.44      0.54        34
           1       0.69      0.65      0.67        55
           2       0.71      0.65      0.68        75
           3       0.60      0.74      0.66        86

    accuracy                           0.66       250
   macro avg       0.67      0.62      0.64       250
weighted avg       0.66      0.66      0.65       250



In [15]:
# Linear SVC
evaluate_model_with_cross_validation(LinearSVC(max_iter=10000), "Linear SVC")
train_and_evaluate_model(LinearSVC(max_iter=10000), "Linear SVC")

Cross-Validation Results for Linear SVC:

Accuracy Scores: [0.68       0.665      0.56       0.625      0.68341709]
Mean Accuracy: 0.6427
Standard Deviation: 0.0463

Classification Report for Linear SVC:

              precision    recall  f1-score   support

           0       0.90      0.53      0.67        34
           1       0.73      0.69      0.71        55
           2       0.73      0.73      0.73        75
           3       0.62      0.74      0.68        86

    accuracy                           0.70       250
   macro avg       0.75      0.67      0.70       250
weighted avg       0.72      0.70      0.70       250

