In [1]:
"""
Title: Using pre-trained word embeddings
Author: [fchollet](https://twitter.com/fchollet)
Date created: 2020/05/05
Last modified: 2020/05/05
Description: Text classification on the Newsgroup20 dataset using pre-trained GloVe word embeddings.
"""

"""
## Setup
"""
import random
import re
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder
# import these modules
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
  
lemmatizer = WordNetLemmatizer()
stop_words_nltk = set(stopwords.words('english'))
Le = LabelEncoder()

"""
## Introduction

In this example, we show how to train a text classification model that uses pre-trained
word embeddings.

We'll work with the Newsgroup20 dataset, a set of 20,000 message board messages
belonging to 20 different topic categories.

For the pre-trained word embeddings, we'll use
[GloVe embeddings](http://nlp.stanford.edu/projects/glove/).
"""

"""
## Download the Newsgroup20 data
"""

# data_path = keras.utils.get_file(
#     "news20.tar.gz",
#     "http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz",
#     untar=True,
# )



2022-12-05 22:22:29.430585: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-05 22:22:29.715497: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-05 22:22:29.715542: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-05 22:22:30.722906: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-

'\n## Download the Newsgroup20 data\n'

In [4]:
"""
## Let's take a look at the data
"""
import pathlib

# data_dir = pathlib.Path(data_path).parent / "20_newsgroup"
data_dir = '/home/ifte-home/Documents/20_newsgroup/'
dirnames = os.listdir(data_dir)
print("Number of directories:", len(dirnames))
print("Directory names:", dirnames)

fnames = os.listdir(data_dir+"/comp.graphics")
print("Number of files in comp.graphics:", len(fnames))
print("Some example filenames:", fnames[:5])

"""
Here's a example of what one file contains:
"""

print(open(data_dir+"/comp.graphics/38987").read())

Number of directories: 20
Directory names: ['rec.autos', 'sci.crypt', 'rec.sport.hockey', 'comp.windows.x', 'rec.sport.baseball', 'sci.med', 'talk.politics.misc', 'comp.sys.ibm.pc.hardware', 'sci.electronics', 'alt.atheism', 'talk.politics.guns', 'comp.os.ms-windows.misc', 'rec.motorcycles', 'comp.graphics', 'comp.sys.mac.hardware', 'soc.religion.christian', 'misc.forsale', 'sci.space', 'talk.politics.mideast', 'talk.religion.misc']
Number of files in comp.graphics: 1000
Some example filenames: ['38099', '38905', '39631', '39042', '38424']
Newsgroups: comp.graphics
Path: cantaloupe.srv.cs.cmu.edu!das-news.harvard.edu!noc.near.net!howland.reston.ans.net!agate!dog.ee.lbl.gov!network.ucsd.edu!usc!rpi!nason110.its.rpi.edu!mabusj
From: mabusj@nason110.its.rpi.edu (Jasen M. Mabus)
Subject: Looking for Brain in CAD
Message-ID: <c285m+p@rpi.edu>
Nntp-Posting-Host: nason110.its.rpi.edu
Reply-To: mabusj@rpi.edu
Organization: Rensselaer Polytechnic Institute, Troy, NY.
Date: Thu, 29 Apr 1993 23:2

In [5]:
"""
As you can see, there are header lines that are leaking the file's category, either
explicitly (the first line is literally the category name), or implicitly, e.g. via the
`Organization` filed. Let's get rid of the headers:
"""

samples = []
labels = []
class_names = []
class_index = 0
for dirname in sorted(os.listdir(data_dir)):
    class_names.append(dirname)
    dirpath = data_dir+'/'+dirname
    fnames = os.listdir(dirpath)
    print("Processing %s, %d files found" % (dirname, len(fnames)))
    for fname in fnames:
        fpath = dirpath+"/"+fname
        f = open(fpath, encoding="latin-1")
        content = f.read()
        lines = content.split("\n")
        lines = lines[10:]
        content = "\n".join(lines)
        samples.append(content)
        labels.append(class_index)
    class_index += 1

print("Classes:", class_names)
print("Number of samples:", len(samples))

"""
There's actually one category that doesn't have the expected number of files, but the
difference is small enough that the problem remains a balanced classification problem.
"""

"""
## Shuffle and split the data into training & validation sets
"""

Processing alt.atheism, 1000 files found
Processing comp.graphics, 1000 files found
Processing comp.os.ms-windows.misc, 1000 files found
Processing comp.sys.ibm.pc.hardware, 1000 files found
Processing comp.sys.mac.hardware, 1000 files found
Processing comp.windows.x, 1000 files found
Processing misc.forsale, 1000 files found
Processing rec.autos, 1000 files found
Processing rec.motorcycles, 1000 files found
Processing rec.sport.baseball, 1000 files found
Processing rec.sport.hockey, 1000 files found
Processing sci.crypt, 1000 files found
Processing sci.electronics, 1000 files found
Processing sci.med, 1000 files found
Processing sci.space, 1000 files found
Processing soc.religion.christian, 997 files found
Processing talk.politics.guns, 1000 files found
Processing talk.politics.mideast, 1000 files found
Processing talk.politics.misc, 1000 files found
Processing talk.religion.misc, 1000 files found
Classes: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.ha

'\n## Shuffle and split the data into training & validation sets\n'

In [4]:
df = pd.read_csv('/home/ifte-home/Documents/mental_health/suicide/CSSRS/500_Reddit_users_posts_labels.csv')

In [5]:
def prepare_lines_from_df(frame):
    data = {'Post':[], 'Label':[]}
    for i, col in frame.iterrows():
        for items in col['Post'].split(','):
            data['Post'].append(items)
            data['Label'].append(col['Label'])
    return pd.DataFrame(data)

In [6]:
path = '/home/ifte-home/Documents/mental_health/suicide/CSSRS/'
categories = ['Indicator', 'Attempt','Behavior','Ideation']
file_names = ['suicidal_indicator.csv', 
              'suicidal_attempt.csv', 
              'suicidal_behavior.csv', 
              'suicidal_ideation.csv']

def generate_dataset(categories, res):
    data = []
    category = []
    for p in res:
        data.append(''.join(p))
        category.append(categories)

    return pd.DataFrame(zip(data,category), columns=['Post', 'Label'])

def generate_samples():
    sentence_num = 10
    index = 0
    dataset = pd.DataFrame()

    for items in file_names:
        frame = pd.read_csv(path+items)
        # print(datframe.columns)
        col_name = list(frame.columns)
        if index==0:
            size=4
        elif index==1:
            size=52
        elif index==3:
            size=14
        else:
            size=45
        for k in range(size):        
            temp = [col_name[i:i+sentence_num] for i in range(0, len(col_name), sentence_num)]
            dataset = pd.concat([dataset,generate_dataset(categories[index], temp)])
            random.shuffle(col_name)
        index+=1
    #     break
    return dataset.sample(frac=1)

In [7]:
total_dataframe = pd.concat([prepare_lines_from_df(df),generate_samples()])
total_dataframe['code'] = Le.fit_transform(total_dataframe['Label'])

In [8]:
def process(text):
    text = text.lower()
    text = re.sub('\W+', ' ',text)
    text = [x for x in [lemmatizer.lemmatize(w) for w in text.split()] if x not in stop_words_nltk]
    return ' '.join(text)

In [10]:
total_dataframe['Post'] = total_dataframe.Post.apply(lambda x: process(x))

In [12]:
total_dataframe.to_csv('/home/ifte-home/Documents/mental_health/suicide/CSSRS/500_Reddit_users_posts_labels_processed.csv')

In [24]:
total_dataframe = pd.read_csv('/home/ifte-home/Documents/mental_health/suicide/CSSRS/500_Reddit_users_posts_labels_processed.csv', index_col=0)

In [25]:
total_dataframe

Unnamed: 0,Post,Label,code
0,viable option,Supportive,4
1,youll leaving wife behind youd pain beyond com...,Supportive,4
2,know first hand definitely feel hopeless,Supportive,4
3,seem tired aware wife might need chip financia...,Supportive,4
4,even 10 15 hour asthenia could alleviate lot p...,Supportive,4
...,...,...,...
1,suicide self injury arson suicide self inflict...,Attempt,0
8,taken ledge severely anxious intentional self ...,Behavior,1
36,landau kleffner expressive language disorder a...,Ideation,2
11,suicide intent score subscale bitten hit bough...,Behavior,1


In [27]:
total_dataframe['code'] = Le.fit_transform(total_dataframe['Label'])

In [34]:
total_dataframe = total_dataframe.dropna()

In [35]:
labels = total_dataframe.code.values
samples = total_dataframe.Post.values
class_names = Le.classes_

In [36]:
# Shuffle the data
seed = 1337
rng = np.random.RandomState(seed)
rng.shuffle(samples)
rng = np.random.RandomState(seed)
rng.shuffle(labels)

# Extract a training & validation split
validation_split = 0.2
num_validation_samples = int(validation_split * len(samples))
train_samples = samples[:-num_validation_samples]
val_samples = samples[-num_validation_samples:]
train_labels = labels[:-num_validation_samples]
val_labels = labels[-num_validation_samples:]

In [38]:
"""
## Create a vocabulary index

Let's use the `TextVectorization` to index the vocabulary found in the dataset.
Later, we'll use the same layer instance to vectorize the samples.

Our layer will only consider the top 20,000 words, and will truncate or pad sequences to
be actually 200 tokens long.
"""

from tensorflow.keras.layers import TextVectorization

vectorizer = TextVectorization(max_tokens=10000, output_sequence_length=500)
text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
vectorizer.adapt(text_ds)

In [39]:
vectorizer.vocabulary_size()

10000

In [40]:
"""
You can retrieve the computed vocabulary used via `vectorizer.get_vocabulary()`. Let's
print the top 5 words:
"""

vectorizer.get_vocabulary()[:5]

"""
Let's vectorize a test sentence:
"""

output = vectorizer([["the cat sat on the mat"]])
output.numpy()[0, :6]

"""
As you can see, "the" gets represented as "2". Why not 0, given that "the" was the first
word in the vocabulary? That's because index 0 is reserved for padding and index 1 is
reserved for "out of vocabulary" tokens.

Here's a dict mapping words to their indices:
"""

voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

"""
As you can see, we obtain the same encoding as above for our test sentence:
"""

test = ["cat", "sat"]
[word_index[w] for w in test]

[1751, 2665]

In [41]:
"""
## Load pre-trained word embeddings
"""

"""
Let's download pre-trained GloVe embeddings (a 822M zip file).

You'll need to run the following commands:

```
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip
```
"""

"""
The archive contains text-encoded vectors of various sizes: 50-dimensional,
100-dimensional, 200-dimensional, 300-dimensional. We'll use the 100D ones.

Let's make a dict mapping words (strings) to their NumPy vector representation:
"""
path_to_glove_file = os.path.join(
    os.path.expanduser("~"), "work/resources/glove.6B.100d.txt"
)

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

"""
Now, let's prepare a corresponding embedding matrix that we can use in a Keras
`Embedding` layer. It's a simple NumPy matrix where entry at index `i` is the pre-trained
vector for the word of index `i` in our `vectorizer`'s vocabulary.
"""

num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Found 400000 word vectors.
Converted 9145 words (855 misses)


In [42]:
"""
Next, we load the pre-trained word embeddings matrix into an `Embedding` layer.

Note that we set `trainable=False` so as to keep the embeddings fixed (we don't want to
update them during training).
"""

from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [59]:
"""
## Build the model

A simple 1D convnet with global max pooling and a classifier at the end.
"""

from tensorflow.keras import layers

int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
x = layers.Conv1D(128, 5, activation="relu")(embedded_sequences)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
preds = layers.Dense(len(class_names), activation="softmax")(x)
model = keras.Model(int_sequences_input, preds)
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 100)         1000200   
                                                                 
 conv1d_2 (Conv1D)           (None, None, 128)         64128     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, None, 128)        0         
 1D)                                                             
                                                                 
 conv1d_3 (Conv1D)           (None, None, 128)         82048     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, None, 128)        0         
 1D)                                                       

In [60]:
x_train = vectorizer(np.array([[s] for s in train_samples])).numpy()
x_val = vectorizer(np.array([[s] for s in val_samples])).numpy()

y_train = np.array(train_labels)
y_val = np.array(val_labels)

In [61]:
"""
We use categorical crossentropy as our loss since we're doing softmax classification.
Moreover, we use `sparse_categorical_crossentropy` since our labels are integers.
"""

model.compile(
    loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["acc"]
)
model.fit(x_train, y_train, batch_size=128, epochs=20, validation_data=(x_val, y_val))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f2ea854ea90>

In [62]:
model.save('/home/ifte-home/Documents/mental_health/suicide/CSSRS/model.h5')

In [2]:
from tensorflow.keras.models import load_model
model = load_model('/home/ifte-home/Documents/mental_health/suicide/CSSRS/model.h5')

2022-12-05 22:22:50.769484: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-12-05 22:22:50.769508: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2022-12-05 22:22:50.769523: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ifte-HomePC): /proc/driver/nvidia/version does not exist
2022-12-05 22:22:50.769893: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [27]:
"""
## Export an end-to-end model

Now, we may want to export a `Model` object that takes as input a string of arbitrary
length, rather than a sequence of indices. It would make the model much more portable,
since you wouldn't have to worry about the input preprocessing pipeline.

Our `vectorizer` is actually a Keras layer, so it's simple:
"""

string_input = keras.Input(shape=(1,), dtype="string")
x = vectorizer(string_input)
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

probabilities = end_to_end_model.predict(
    [["this message is about computer graphics and 3D modeling"]]
)

class_names[np.argmax(probabilities[0])]



'Ideation'

In [56]:
x = vectorizer(df['Processed'][0])

In [58]:
preds = model(x)
preds

ValueError: Exception encountered when calling layer 'model' (type Functional).

Input 0 of layer "conv1d" is incompatible with the layer: expected min_ndim=3, found ndim=2. Full shape received: (500, 100)

Call arguments received by layer 'model' (type Functional):
  â¢ inputs=tf.Tensor(shape=(500,), dtype=int64)
  â¢ training=None
  â¢ mask=None

In [55]:
    
    preds = model(x)
    end_to_end_model = keras.Model(string_input, preds)
    probabilities = end_to_end_model.predict([[text]])

ValueError: Exception encountered when calling layer 'model' (type Functional).

Input 0 of layer "conv1d" is incompatible with the layer: expected min_ndim=3, found ndim=2. Full shape received: (500, 100)

Call arguments received by layer 'model' (type Functional):
  â¢ inputs=tf.Tensor(shape=(500,), dtype=int64)
  â¢ training=None
  â¢ mask=None

In [67]:
string_input = keras.Input(shape=(1,), dtype="string")
def predict(text):
    x = vectorizer(string_input)
    preds = model(x)
    end_to_end_model = keras.Model(string_input, preds)
    probabilities = end_to_end_model.predict([[text]])

    return class_names[np.argmax(probabilities[0])]

In [51]:
df = pd.read_csv('/home/ifte-home/Documents/mental_health/suicide/Suicide_Detection.csv', index_col=0)

In [37]:
df['Processed'] = df.text.apply(lambda x: process(x))

In [38]:
df.to_csv('/home/ifte-home/Documents/mental_health/suicide/Suicide_Detection_processed.csv')

In [65]:
df = pd.read_csv('/home/ifte-home/Documents/mental_health/suicide/Suicide_Detection_processed.csv')

In [54]:
df

Unnamed: 0.1,Unnamed: 0,text,class,Processed
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide,ex wife threatening suiciderecently left wife ...
1,3,Am I weird I don't get affected by compliments...,non-suicide,weird get affected compliment coming someone k...
2,4,Finally 2020 is almost over... So I can never ...,non-suicide,finally 2020 almost never hear 2020 ha bad yea...
3,8,i need helpjust help me im crying so hard,suicide,need helpjust help im cry hard
4,9,"Iâm so lostHello, my name is Adam (16) and Iâv...",suicide,losthello name adam 16 struggling year afraid ...
...,...,...,...,...
232069,348103,If you don't like rock then your not going to ...,non-suicide,like rock going get anything go http musictast...
232070,348106,You how you can tell i have so many friends an...,non-suicide,tell many friend lonely everything deprived pr...
232071,348107,pee probably tastes like salty teağğ¦â¼ï¸ can som...,non-suicide,pee probably taste like salty tea someone dran...
232072,348108,The usual stuff you find hereI'm not posting t...,suicide,usual stuff find herei posting sympathy pity k...


In [71]:
df = df.dropna()
df["Processed"]=df["Processed"].astype(str)

In [None]:
result = []
for i, j in df.iterrows():
    result.append(predict(j['Processed']))







































































































































































































































































In [3]:
result

NameError: name 'result' is not defined

In [None]:
df['suicide_intensity'] = result
df['intensity'] = list(Le.inverse_transform(result))
df.to_csv(path+'reddit_dataset_with_CSSR_intensity_glove.csv')
p = df.groupby(['class', 'intensity']).count()
p

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('/home/ifte-home/Documents/mental_health/suicide/CSSRS/reddit_dataset_with_CSSR_intensity.csv')

In [4]:
p = df.groupby(['class', 'intensity']).count()

In [7]:
df = pd.DataFrame()
for i in range(1,6):
    file = str(i)+'_reddit_dataset_with_CSSR_intensity_glove'
    data = pd.read_csv('/home/ifte-home/Documents/mental_health/suicide/CSSRS/reddit_dataset_with_CSSR_intensity.csv')
    df = pd.concat([df,data])
    

In [9]:
p = df.groupby(['class', 'intensity']).count()
p.transpose()

class,non-suicide,non-suicide,non-suicide,non-suicide,suicide,suicide,suicide,suicide
intensity,Attempt,Behavior,Ideation,Indicator,Attempt,Behavior,Ideation,Indicator
Unnamed: 0,10,370,564645,14855,35,730,575070,4265
Unnamed: 0.1,10,370,564645,14855,35,730,575070,4265
processed,10,370,564645,14855,35,730,575070,4265
category,10,370,564645,14855,35,730,575070,4265
suicide_intensity,10,370,564645,14855,35,730,575070,4265
