In [None]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from textblob import TextBlob
import scipy.sparse as sp

import nltk
# Fetch the NLTK resources used by the preprocessing cell further down:
#   punkt / punkt_tab -> word_tokenize (newer NLTK releases look up
#                        'punkt_tab' rather than the older 'punkt' models)
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
# NOTE(review): on an older NLTK whose index has no 'punkt_tab', download()
# is expected to report the problem and return False rather than raise --
# confirm on the target runtime.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Mount Google Drive so the CSV files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Location of the labelled tweet dataset on the mounted Drive.
path = "/content/drive/MyDrive/Mental-Health-Twitter.csv"

# Only the tweet body and its label are read from the file.
columns_to_read = ['post_text', 'label']

# Load the two columns and give them the names used by the rest of the notebook.
df = pd.read_csv(
    path,
    encoding='unicode_escape',
    usecols=columns_to_read,
).rename(columns={'label': 'class', 'post_text': 'text'})
# Optional while experimenting: keep only the first N rows, e.g.
# df = df.iloc[:1000]
print(df)

# Replace the numeric label with a readable class name.
mapping = {1: "suicide", 0: "non-suicide"}
df['class'] = df['class'].map(mapping)

print(df)

                                                    text  class
0      It's just over 2 years since I was diagnosed w...      1
1      It's Sunday, I need a break, so I'm planning t...      1
2      Awake but tired. I need to sleep but my brain ...      1
3      RT @SewHQ: #Retro bears make perfect gifts and...      1
4      Itâs hard to say whether packing lists are m...      1
...                                                  ...    ...
19995              A day without sunshine is like night.      0
19996  Boren's Laws: (1) When in charge, ponder. (2) ...      0
19997  The flow chart is a most thoroughly oversold p...      0
19998  Ships are safe in harbor, but they were never ...      0
19999     Black holes are where God is dividing by zero.      0

[20000 rows x 2 columns]
                                                    text        class
0      It's just over 2 years since I was diagnosed w...      suicide
1      It's Sunday, I need a break, so I'm planning t...      suic

In [None]:
# trying to add one more feature



# def contains_self_centric_words(text):
#     self_centric_words = [
#     "i", "me", "myself", "my", "mine", "we", "us", "our", "ours", "ourselves",
#     "self", "myself"]

#     tokens = text.split()
#     for word in tokens:
#         if word.lower() in self_centric_words:
#             return True
#     return False

# data['label_show'] = data['text'].apply(contains_self_centric_words)

# data.to_csv('updated_dataset.csv', index=False)

# df = pd.read_csv('/content/updated_dataset.csv')

In [None]:
# Drop rows with a missing tweet or a missing label. The previous
# `df['class'].dropna(inplace=True)` only modified a temporary Series and left
# `df` itself unchanged, so rows without a label were never removed.
df.dropna(subset=['text', 'class'], inplace=True)

# Built once rather than on every call: re-reading the stop-word list and
# creating a new lemmatizer for each row is pure overhead.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Normalise one piece of text before TF-IDF vectorisation.

    Steps, in order: lowercase; delete every character that is neither a word
    character nor whitespace; tokenize with NLTK's word_tokenize; drop English
    stop words; lemmatize each remaining token; join the tokens with spaces.

    Args:
        text: Raw text. A non-string value (e.g. None or a NaN float coming
            from a missing cell) is treated as empty text.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)

    tokens = word_tokenize(text)
    kept_tokens = [word for word in tokens if word not in _STOP_WORDS]
    return ' '.join(_LEMMATIZER.lemmatize(word) for word in kept_tokens)


df['text'] = df['text'].apply(preprocess_text)

In [None]:
print(df)

# Tally how many rows carry each class label.
occurrences_counts = df['class'].value_counts()

# Report the size of the "suicide" and the "non-suicide" class.
for caption, class_name in (("True Count:", "suicide"), ("False Count:", "non-suicide")):
    print(caption, occurrences_counts[class_name])

                                                    text        class
0      2 year since diagnosed anxiety depression toda...      suicide
1      sunday need break im planning spend little tim...      suicide
2                      awake tired need sleep brain idea      suicide
3      rt sewhq retro bear make perfect gift great be...      suicide
4      itâs hard say whether packing list making life...      suicide
...                                                  ...          ...
19995                    day without sunshine like night  non-suicide
19996  borens law 1 charge ponder 2 trouble delegate ...  non-suicide
19997  flow chart thoroughly oversold piece program d...  non-suicide
19998                  ship safe harbor never meant stay  non-suicide
19999                       black hole god dividing zero  non-suicide

[20000 rows x 2 columns]
True Count: 10000
False Count: 10000


In [None]:
print(df)

# Convert the readable class names back to the integer labels used for
# training. An explicit lookup followed by astype(int) replaces
# Series.replace(): replace() on an object column depends on pandas silently
# downcasting the result to integers, which newer pandas versions deprecate.
# Values that are already 0/1 pass through the lookup unchanged, so re-running
# this cell is safe; a missing or unknown label makes astype(int) raise
# instead of reaching model training.
label_mapping = {'suicide': 1, 'non-suicide': 0}
df['class'] = df['class'].map(lambda label: label_mapping.get(label, label)).astype(int)

                                                    text        class
0      2 year since diagnosed anxiety depression toda...      suicide
1      sunday need break im planning spend little tim...      suicide
2                      awake tired need sleep brain idea      suicide
3      rt sewhq retro bear make perfect gift great be...      suicide
4      itâs hard say whether packing list making life...      suicide
...                                                  ...          ...
19995                    day without sunshine like night  non-suicide
19996  borens law 1 charge ponder 2 trouble delegate ...  non-suicide
19997  flow chart thoroughly oversold piece program d...  non-suicide
19998                  ship safe harbor never meant stay  non-suicide
19999                       black hole god dividing zero  non-suicide

[20000 rows x 2 columns]


In [None]:
# have to combine those 2 columns in single



# X = data['text']
# y = data[['class', 'label_show']]  # Combine both label columns into a single dataframe

# # Split the dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Initialize TF-IDF vectorizer
# tfidf_vectorizer = TfidfVectorizer()

# # Fit and transform the training data
# X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# # Transform the testing data
# X_test_tfidf = tfidf_vectorizer.transform(X_test)

# # Initialize and train the Multinomial Naive Bayes classifier
# mnb = MultinomialNB()
# mnb.fit(X_train_tfidf, y_train)

# # Make predictions
# predictions = mnb.predict(X_test_tfidf)

# # Evaluate the model
# accuracy = accuracy_score(y_test, predictions)
# print('Accuracy:', accuracy)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

# First, ensure the dataset has sufficient data for splitting
assert len(df) > 2, "Dataset is too small for splitting."

# Initial split: 80% training, 20% held out for validation/testing
# (test_size=0.2 -- the code never produced a 60/40 split).
X_train, X_temp, y_train, y_temp = train_test_split(df['text'], df['class'], test_size=0.2, random_state=42)

# Check the size of X_temp to confirm it has enough data for the second split
assert len(X_temp) >= 4, "Insufficient data for the 10-10 validation/test split."

# Split the held-out 20% evenly: 10% validation and 10% testing
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Fit the TF-IDF vocabulary on the training texts only, then apply the same
# fitted vectorizer to the validation and test texts.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_validation_tfidf = tfidf_vectorizer.transform(X_validation)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train the Multinomial Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test_tfidf)

# Calculate accuracy for the test set
accuracy = accuracy_score(y_test, y_pred)
print("Test Set Accuracy:", accuracy)

# Validate the model's performance on the validation set
y_val_pred = classifier.predict(X_validation_tfidf)
val_accuracy = accuracy_score(y_validation, y_val_pred)
print("Validation Set Accuracy:", val_accuracy)

Test Set Accuracy: 0.871
Validation Set Accuracy: 0.864


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the baseline classifier on the training
# features.
# NOTE(review): X_train_tfidf comes from a vectorizer fitted on the whole
# training split, so each fold's held-out part has already contributed to the
# TF-IDF vocabulary and weights.
cross_val_accuracy = cross_val_score(
    estimator=classifier,
    X=X_train_tfidf,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

# Report the mean accuracy over the folds.
mean_cv_accuracy = cross_val_accuracy.mean()
print("Cross-Validation Accuracy:", mean_cv_accuracy)

Cross-Validation Accuracy: 0.8626874999999998


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
# Precision, recall and F1 on the test predictions, each averaged over the
# classes with the 'weighted' scheme.
averaging = 'weighted'
precision = precision_score(y_test, y_pred, average=averaging)
recall = recall_score(y_test, y_pred, average=averaging)
f1 = f1_score(y_test, y_pred, average=averaging)

for metric_label, metric_value in (("Precision:", precision), ("Recall:", recall), ("F1-score:", f1)):
    print(metric_label, metric_value)

Precision: 0.8727841351878595
Recall: 0.871
F1-score: 0.8706202947274391


In [None]:
# Installs the TextBlob package into the notebook runtime.
# NOTE(review): `from textblob import TextBlob` already appears in the first
# cell, so on a fresh kernel this install runs after the first import unless
# the package is preinstalled -- consider moving it to the top of the notebook
# and using the `%pip` magic with a pinned version.
pip install textblob



In [None]:
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from textblob import TextBlob
import numpy as np
import scipy.sparse as sp

def get_sentiment(text):
    """Return the TextBlob sentiment polarity score of `text`."""
    polarity = TextBlob(text).sentiment.polarity
    return polarity

# Separate features and labels
X = df['text']
y = df['class']

# Initial split: 80% training, 20% held out (test_size=0.2 -- the code never
# produced a 60/40 split).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the held-out 20% evenly into 10% validation and 10% testing
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Vectorize text data using TF-IDF (vocabulary fitted on the training texts only)
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_validation_tfidf = tfidf_vectorizer.transform(X_validation)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Sentiment polarity of every text, shaped as a single feature column
X_train_sentiment = np.array([get_sentiment(text) for text in X_train]).reshape(-1, 1)
X_validation_sentiment = np.array([get_sentiment(text) for text in X_validation]).reshape(-1, 1)
X_test_sentiment = np.array([get_sentiment(text) for text in X_test]).reshape(-1, 1)

# Rescale the polarity with a MinMaxScaler fitted on the training split only.
# MultinomialNB expects non-negative feature values, and raw polarity can be
# negative.
scaler = MinMaxScaler()
X_train_sentiment_scaled = scaler.fit_transform(X_train_sentiment)
X_validation_sentiment_scaled = scaler.transform(X_validation_sentiment)
X_test_sentiment_scaled = scaler.transform(X_test_sentiment)

# Append the scaled sentiment column to the TF-IDF features
X_train_combined = hstack((X_train_tfidf, sp.csr_matrix(X_train_sentiment_scaled)))
X_validation_combined = hstack((X_validation_tfidf, sp.csr_matrix(X_validation_sentiment_scaled)))
X_test_combined = hstack((X_test_tfidf, sp.csr_matrix(X_test_sentiment_scaled)))

# Candidate smoothing strengths for MultinomialNB
param_grid = {'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0]}

# Perform GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_combined, y_train)

# Best alpha value from GridSearchCV
best_alpha = grid_search.best_params_['alpha']

# GridSearchCV (refit=True by default) has already refitted a MultinomialNB
# with the best alpha on the whole training split, so reuse that estimator
# instead of training the same model a second time.
MNB_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = MNB_model.predict(X_test_combined)

# Evaluate the model on the test set
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Best alpha value:", best_alpha)
print("Test Set Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Best alpha value: 0.1
Test Set Accuracy: 0.881
Precision: 0.871866295264624
Recall: 0.9037536092396535
F1 Score: 0.8875236294896031


In [None]:
# Score the tuned Naive Bayes model on the validation split.
y_val_pred = MNB_model.predict(X_validation_combined)
validation_accuracy = accuracy_score(y_true=y_validation, y_pred=y_val_pred)

print("Validation Set Accuracy:", validation_accuracy)

Validation Set Accuracy: 0.87


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the tuned model on the combined
# (TF-IDF + sentiment) training features.
cross_val_accuracy = cross_val_score(
    estimator=MNB_model,
    X=X_train_combined,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

# Report the mean accuracy over the folds.
mean_cv_accuracy = cross_val_accuracy.mean()
print("Cross-Validation Accuracy:", mean_cv_accuracy)

Cross-Validation Accuracy: 0.8729375000000001


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

# First, ensure the dataset has sufficient data for splitting
assert len(df) > 2, "Dataset is too small for splitting."

# Initial split: 80% training, 20% held out for validation/testing
# (test_size=0.2 -- the code never produced a 60/40 split).
X_train, X_temp, y_train, y_temp = train_test_split(df['text'], df['class'], test_size=0.2, random_state=42)

# Check the size of X_temp to confirm it has enough data for the second split
assert len(X_temp) >= 4, "Insufficient data for the 10-10 validation/test split."

# Split the held-out 20% evenly: 10% validation and 10% testing
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Fit the TF-IDF vocabulary on the training texts only, then apply the same
# fitted vectorizer to the validation and test texts.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_validation_tfidf = tfidf_vectorizer.transform(X_validation)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train the Multinomial Naive Bayes classifier used by the prediction helpers
# defined below.
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

# Preprocess user input exactly like the training text.
# The earlier identity stub replaced the notebook's real preprocess_text, so
# raw (cased, punctuated, un-lemmatized) input was being fed to a vectorizer
# that had been fitted on cleaned text.
def preprocess_text(text):
    """Normalise one piece of text before TF-IDF vectorisation.

    Steps, in order: lowercase; delete every character that is neither a word
    character nor whitespace; tokenize with NLTK's word_tokenize; drop English
    stop words; lemmatize each remaining token; join the tokens with spaces.

    Args:
        text: Raw text. A non-string value (e.g. None or a NaN float) is
            treated as empty text.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)

    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

def predict_depression(user_input):
    """Classify one piece of text with the fitted TF-IDF vectorizer and classifier.

    The input is passed through preprocess_text, vectorized as a one-element
    batch, and the single predicted label is returned.
    """
    features = tfidf_vectorizer.transform([preprocess_text(user_input)])
    return classifier.predict(features)[0]

# Keep prompting for sentences until the user types 'exit' (in any letter case).
keep_prompting = True
while keep_prompting:
    user_input = input("Enter a sentence to check for depression (or type 'exit' to quit): ")

    if user_input.lower() == 'exit':
        # Leave the loop without classifying the exit command itself.
        keep_prompting = False
    else:
        # Classify the sentence and show the raw predicted label.
        prediction = predict_depression(user_input)
        print("Depression:", prediction)

Depression: 0
Depression: 1
Depression: 0
Depression: 1
Depression: 0
Depression: 0


In [None]:
# Interactive check: classify sentences typed by the user and print Yes/No.
while True:
  user_input = input("Enter a sentence to check for depression (or type 'exit' to quit): ")

  # Without an exit command the loop could only be stopped by interrupting
  # the kernel.
  if user_input.strip().lower() == 'exit':
    break

  preprocessed_input = preprocess_text(user_input)

  # The fitted vectorizer is named `tfidf_vectorizer`; no variable called
  # `vectorizer` is defined in the visible notebook cells, so the old name
  # raised NameError on a fresh kernel.
  input_tfidf = tfidf_vectorizer.transform([preprocessed_input])

  prediction = classifier.predict(input_tfidf)[0]

  # Label 0 is the negative class; anything else is reported as "Yes".
  if prediction == 0:
    print('Depression: No')
  else:
    print('Depression: Yes')

Enter a sentence to check for depression: h
Depression: No
Enter a sentence to check for depression: hey wasup
Depression: No
Enter a sentence to check for depression: i am gonna kill my self this academic pressure
Depression: Yes
Enter a sentence to check for depression: i am so happy hehe
Depression: No
Enter a sentence to check for depression: well i am not so happy ;{{{
Depression: Yes


KeyboardInterrupt: Interrupted by user

In [None]:
import pandas as pd

# Load the tweet export into its own variable. Reusing the name `df` would
# overwrite the labelled training frame, which a later cell still reads
# through df['text'] and df['class'].
tweets_df = pd.read_csv('/content/drive/MyDrive/TwExtract-elonmusk-20240402_091647.csv', usecols=['tweetText'])

# Skip rows with no tweet text and make sure every remaining value is a string.
tweet_texts = tweets_df['tweetText'].dropna().astype(str)

total_tweets = len(tweet_texts)
depressed_count = 0

if total_tweets:
    # Vectorize and classify all tweets in one batch instead of one
    # transform/predict call per tweet. The fitted vectorizer is named
    # `tfidf_vectorizer`; `vectorizer` is not defined in the visible cells.
    preprocessed_tweets = [preprocess_text(tweet_text) for tweet_text in tweet_texts]
    predictions = classifier.predict(tfidf_vectorizer.transform(preprocessed_tweets))
    # Label 1 is the positive class in the notebook's label mapping.
    depressed_count = int((predictions == 1).sum())

# Guard against an empty file so the percentage never divides by zero.
percentage_depressed = (depressed_count / total_tweets) * 100 if total_tweets else 0.0

print(f"Percentage of depressed tweets: {percentage_depressed:.2f}%")

In [None]:
# Installs Flask into the notebook runtime so the next cell can import it.
# NOTE(review): consider the `%pip` magic with a pinned version, placed in a
# setup cell at the top of the notebook.
pip install flask



In [None]:
from flask import Flask

app = Flask(__name__)


# Flask requires every URL rule to start with a slash; registering the empty
# rule "" raised "ValueError: URL rule '' must start with a slash." as soon as
# the decorator ran. The root endpoint is "/".
@app.route("/")
def hello():
    """Root endpoint: return a plain-text welcome message."""
    return "Welcome to machine learning model APIs!"


if __name__ == '__main__':
    # NOTE(review): debug=True turns on Flask's debugger and reloader; confirm
    # this is what you want when starting the server from inside a notebook.
    app.run(debug=True)

ValueError: URL rule '' must start with a slash.

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack
import numpy as np

def get_sentiment(text):
    """Return the TextBlob sentiment polarity score of `text`."""
    from textblob import TextBlob
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

# Data preparation and splitting into 80-10-10
X = df['text']
y = df['class']

# Initial split: 80% training, 20% validation/test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Split temporary set into 10% validation and 10% test
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Vectorize text data using TF-IDF (vocabulary fitted on the training texts only)
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_validation_tfidf = tfidf_vectorizer.transform(X_validation)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Sentiment polarity of every text, shaped as a single feature column
X_train_sentiment = np.array([get_sentiment(text) for text in X_train]).reshape(-1, 1)
X_validation_sentiment = np.array([get_sentiment(text) for text in X_validation]).reshape(-1, 1)
X_test_sentiment = np.array([get_sentiment(text) for text in X_test]).reshape(-1, 1)

# Rescale the polarity with a MinMaxScaler fitted on the training split only
scaler = MinMaxScaler()
X_train_sentiment_scaled = scaler.fit_transform(X_train_sentiment)
X_validation_sentiment_scaled = scaler.transform(X_validation_sentiment)
X_test_sentiment_scaled = scaler.transform(X_test_sentiment)

# Combine TF-IDF with scaled sentiment scores. format='csr' yields a
# row-indexable sparse matrix directly, matching the CSR blocks used by the
# Naive Bayes cell.
X_train_combined = hstack((X_train_tfidf, X_train_sentiment_scaled), format='csr')
X_validation_combined = hstack((X_validation_tfidf, X_validation_sentiment_scaled), format='csr')
X_test_combined = hstack((X_test_tfidf, X_test_sentiment_scaled), format='csr')

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'C': [0.1, 1, 10],  # Inverse of regularization strength
    'penalty': ['l1', 'l2'],  # Type of regularization
    'solver': ['liblinear'],  # Solver that supports L1/L2
}

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_combined, y_train)

# Cross-validation accuracy of the best parameter set. The grid search has
# already computed this mean 5-fold accuracy on the same training data, so it
# is read from best_score_ instead of running cross_val_score again.
cross_val_accuracy = grid_search.best_score_

# GridSearchCV (refit=True by default) has already refitted the best
# configuration on the whole training split; reuse that estimator rather than
# fitting an identical model again.
logistic_regression_model = grid_search.best_estimator_

# Predict and calculate metrics for the validation set
y_val_pred = logistic_regression_model.predict(X_validation_combined)
validation_accuracy = accuracy_score(y_validation, y_val_pred)

# Predict and calculate metrics for the test set
y_test_pred = logistic_regression_model.predict(X_test_combined)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

# Output metrics
print("Cross-Validation Accuracy:", cross_val_accuracy)
print("Validation Set Accuracy:", validation_accuracy)
print("Test Set Accuracy:", test_accuracy)
print("Test Set Precision:", test_precision)
print("Test Set Recall:", test_recall)
print("Test Set F1 Score:", test_f1)

Cross-Validation Accuracy: 0.8574375
Validation Set Accuracy: 0.8635
Test Set Accuracy: 0.861
Test Set Precision: 0.8641148325358852
Test Set Recall: 0.8691049085659288
Test Set F1 Score: 0.8666026871401151


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
# Precision, recall and F1 of the logistic-regression test predictions.
# The earlier version scored `y_pred`, which the logistic-regression cell
# never assigns: it held whatever an earlier model's cell had left in the
# kernel, so the result depended on cell execution order.
# NOTE(review): assumes these metrics are meant for the logistic-regression
# model trained in the preceding cell -- confirm.
precision1 = precision_score(y_test, y_test_pred)

recall1 = recall_score(y_test, y_test_pred)

f1_1 = f1_score(y_test, y_test_pred)

print("Precision:", precision1)
print("Recall:", recall1)
print("F1-score:", f1_1)

Precision: 0.8503937007874016
Recall: 0.8558692421991084
F1-score: 0.8531226857566034
