In [1]:
import pandas as pd
data=pd.read_csv("depression_dataset.csv")
df = pd.DataFrame(data)

In [2]:
df.head(5)

Unnamed: 0,clean_text,is_depression
0,we understand that most people who reply immed...,1
1,welcome to r depression s check in post a plac...,1
2,anyone else instead of sleeping more when depr...,1
3,i ve kind of stuffed around a lot in my life d...,1
4,sleep is my greatest and most comforting escap...,1


In [3]:
df.describe()

Unnamed: 0,is_depression
count,7731.0
mean,0.495537
std,0.500012
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7731 entries, 0 to 7730
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_text     7731 non-null   object
 1   is_depression  7731 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 120.9+ KB


In [5]:
df.isnull().sum()

clean_text       0
is_depression    0
dtype: int64

In [6]:
df=df.fillna("")

In [7]:
df.isnull().sum()

clean_text       0
is_depression    0
dtype: int64

In [8]:
df.columns

Index(['clean_text', 'is_depression'], dtype='object')

In [9]:
df.head()

Unnamed: 0,clean_text,is_depression
0,we understand that most people who reply immed...,1
1,welcome to r depression s check in post a plac...,1
2,anyone else instead of sleeping more when depr...,1
3,i ve kind of stuffed around a lot in my life d...,1
4,sleep is my greatest and most comforting escap...,1


In [10]:
import re # re module for using regular expressions
from nltk.stem import PorterStemmer # Porter stemming algorithm is a process for removing suffixes from words in English
from nltk.corpus import stopwords # to remove stopwords like “the”, “is”, “in”, “for”, “where”, “when”, “to”, “at” etc.

In [11]:
port_stem=PorterStemmer()

In [12]:
port_stem

<PorterStemmer>

In [13]:
port_stem.stem("Hi this is mariya")

'hi this is mariya'

In [14]:
def stemming(content): #for stemming and removing stopwords
    con = re.sub("[^a-zA-Z]", " ", content)  
    con = con.lower()
    con = con.split()
    con = [PorterStemmer().stem(word) for word in con if word not in stopwords.words("english")]  
    con = " ".join(con)
    return con

In [15]:
stemming("this is mariya")

'mariya'

In [16]:
df["clean_text"]=df["clean_text"].apply(stemming)

In [17]:
x=df["clean_text"]

In [18]:
y=df["is_depression"]

In [19]:
print(x.shape)
print(y.shape)

(7731,)
(7731,)


In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(df["clean_text"], df["is_depression"], test_size=0.25)

In [22]:
X_test

4494                           im sooo sad right need hug
4019    robluket love french tell peopl south qtr fren...
7371                                 ugh sleep feel great
89      typic feel bedtim good get struggl rememb time...
7299        cant eat drink breath thank bad throat infect
                              ...                        
1624    know even real thing whenev like anxiou overth...
3214    give depress anxieti cut food chain becom coll...
6270               miss adelaid wish wa beach look beauti
7154                                             back bit
4750    linuxfound contact need freelanc work linux co...
Name: clean_text, Length: 1933, dtype: object

In [23]:
X_train

5979                           ia awak ha go school today
1       welcom r depress check post place take moment ...
4430                  im lone keep compani femal new york
333     tire anyth bag full cloth took one half hour u...
4674    sleep wide awak got ta go work later boy go cr...
                              ...                        
4879                   quot fake quot verruca twitter sad
5774    best way eat reheat pizza keep tri eat one hot...
7162                       wish play reindeer game fowler
4877                      need new glass mine hangnon arm
4048    ha gotten somebodi read tweet cant get make ac...
Name: clean_text, Length: 5798, dtype: object

In [24]:
Y_train

5979    0
1       1
4430    0
333     1
4674    0
       ..
4879    0
5774    0
7162    0
4877    0
4048    0
Name: is_depression, Length: 5798, dtype: int64

In [25]:
Y_test

4494    0
4019    0
7371    0
89      1
7299    0
       ..
1624    1
3214    1
6270    0
7154    0
4750    0
Name: is_depression, Length: 1933, dtype: int64

In [26]:
X_test.shape

(1933,)

In [27]:
X_train.shape

(5798,)

In [28]:
import warnings
from sklearn.feature_extraction.text import CountVectorizer
warnings.filterwarnings("ignore")
cv = CountVectorizer()

x_cv = cv.fit_transform(X_train)
xcvt = cv.transform(X_test)

In [29]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()

model.fit(x_cv, Y_train)
predict=model.score(xcvt, Y_test)

In [30]:
predict

0.8660113812726332

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")
tfidvectorizer = TfidfVectorizer()

x_tfidf = tfidvectorizer.fit_transform(X_train)
x_tfidft = tfidvectorizer.transform(X_test)


In [32]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()

model.fit(x_tfidf, Y_train)
model.score(x_tfidft, Y_test)

0.8887739265390585

In [33]:
prediction=model.predict(x_tfidft)
prediction

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [34]:
import pickle

In [35]:
pickle.dump(cv, open("vector.pkl","wb"))

In [36]:
pickle.dump(model, open("model.pkl","wb"))

In [37]:
vector_form=pickle.load(open("vector.pkl","rb"))

In [38]:
load_model=pickle.load(open("model.pkl","rb"))

In [41]:
def suicide_detection(suicide_text, vectorizer, model):
    suicide_text = stemming(suicide_text)
    input_data = [suicide_text]
    vector_form1 = vectorizer.transform(input_data)
    prediction = model.predict(vector_form1)
    return prediction

# Assuming 'vector_form' and 'load_model' are the vectorizer and model used during training
vector_form = TfidfVectorizer()  # Replace with the actual vectorizer used during training
load_model = DecisionTreeClassifier()  # Replace with the actual creation/loading of your DecisionTreeClassifier

# Example usage
text = """i ve kind of stuffed around a lot in my life delaying the inevitable of having to work a job and be a responsible adult and i m but the longest i ve ever held a job wa 9 month it wasn t that i m lazy i wa always doing other thing i enjoy but i know now unemployment ha caused most of my depression recently i just feel utterly hopeless when i think soon enough i ll have to move out on my own in some shitty house working a job i couldn t care le about to me it just seems like the perfect recipe to depression"""
val = suicide_detection(text, vector_form, load_model)

if val[0] == 1:
    print('suicidal')
else:
    print('non-suicidal') 


NotFittedError: The TF-IDF vectorizer is not fitted