<a href="https://colab.research.google.com/github/JiaYong02/Suicidal-Detection-NLP/blob/main/Model_Building_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Building

## Read and tokenize

In [2]:
# import
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
df = pd.read_csv('/content/drive/MyDrive/Text Sentiment Lab/Dataset/cleaned_SuicideAndDepression_detection_with_Token.csv')

In [4]:
df.shape

(347261, 2)

In [5]:
depression_data = df[df['class'] == 'depression'].head(20000)
suicide_data = df[df['class'] == 'suicide'].head(20000)
normal_data = df[df['class'] == 'normal'].head(20000)
new_df = pd.concat([depression_data, suicide_data, normal_data])

In [6]:
new_df.shape

(60000, 2)

In [7]:
new_df.head()

Unnamed: 0,text,class
0,"['life', 'actually', 'work', 'non', 'depressed...",depression
1,"['found', 'friend', 'body', 'almost', 'nine', ...",depression
5,"['never', 'anyone', 'life', 'problem', 'much',...",depression
6,"['somebody', 'help', 'terrible', 'episode', 't...",depression
7,"['can', 'not', 'hidden', 'away', 'summer', 'ro...",depression


In [8]:
depression_data.shape

(20000, 2)

## Prepare Training and Testing dataset


### Split dataset

In [9]:
from sklearn.model_selection import train_test_split
X = new_df['text']
y = new_df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)


### Encoding Label

In [10]:
Encoder = LabelEncoder()
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.fit_transform(y_test)

In [11]:
# Get the mapping of original labels to encoded values
label_mapping = dict(zip(Encoder.classes_, Encoder.transform(Encoder.classes_)))

# Print the mapping
print("Label Mapping:")
for label, encoded_value in label_mapping.items():
    print(f"{label} : {encoded_value}")

Label Mapping:
depression : 0
normal : 1
suicide : 2


### Word Vectorization

In [12]:
# Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect = TfidfVectorizer(ngram_range=(1,2))
Tfidf_vect.fit(new_df['text'])

X_tfidf_train = Tfidf_vect.transform(X_train)
X_tfidf_test = Tfidf_vect.transform(X_test)


In [13]:
print(Tfidf_vect.vocabulary_)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [14]:
print(X_tfidf_train)

  (0, 1253045)	0.09319334009419085
  (0, 1251908)	0.04011066669122432
  (0, 1248492)	0.10101904867227803
  (0, 1247798)	0.10783914315052984
  (0, 1247306)	0.06702564601407657
  (0, 1192177)	0.11376625790835802
  (0, 1191852)	0.04264240454820569
  (0, 1181185)	0.13121808279847427
  (0, 1181100)	0.07705355042991754
  (0, 1163980)	0.12651346714443804
  (0, 1163725)	0.05590615352071292
  (0, 1159132)	0.06787045815517166
  (0, 1158487)	0.038273801372319914
  (0, 1132862)	0.07132421903841475
  (0, 1132424)	0.02473557330358571
  (0, 1123761)	0.0917539559864423
  (0, 1123170)	0.038503340319069296
  (0, 1120007)	0.09617206301271095
  (0, 1118551)	0.02761206216457836
  (0, 1118480)	0.08119983797015669
  (0, 1117215)	0.18773886962131572
  (0, 1115698)	0.08009309719929276
  (0, 1095474)	0.08333090176632812
  (0, 1094938)	0.03365295302185794
  (0, 1092065)	0.11376625790835802
  :	:
  (47999, 507350)	0.11903543234176271
  (47999, 500667)	0.13842387197461165
  (47999, 499927)	0.05297119831297692
  (4

## Support Vector Machine

In [None]:
from sklearn.svm import SVC

svc = SVC(random_state = 100)

svc.fit(X_tfidf_train, y_train)

prediction_svc = svc.predict(X_tfidf_test)

svc_train_score = svc.score(X_tfidf_train, y_train)
svc_test_score  = svc.score(X_tfidf_test, y_test)

print("SVM Model:")
print ("Training Score: {}\nTest Score: {}" .format(svc_train_score, svc_test_score))

In [None]:
from sklearn.metrics import confusion_matrix

plt.title('confusion_matrix of SVC')
sns.heatmap(confusion_matrix(y_test,prediction_svc),annot=True,fmt="d")
plt.show()

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,prediction_svc))


In [None]:
# Save model
import joblib
joblib.dump(svc, 'svm_model2.pkl')

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state = 100)

rf.fit(X_tfidf_train, y_train)

prediction_rf = rf.predict(X_tfidf_test)

rf_train_score = rf.score(X_tfidf_train, y_train)
rf_test_score  = rf.score(prediction_rf, y_test)

print("Random Forest Model:")
print ("Training Score: {}\nTest Score: {}" .format(rf_train_score, rf_test_score))

In [None]:
print(hi)