<a href="https://colab.research.google.com/github/Katonokatono/Suicide/blob/Preprocessing/SD_Augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Loading the dataset
import pandas as pd
df = pd.read_csv('/content/final_data_suicide-UNP-1.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,friends_count,followers_count,statuses_count,created_at,retweet_count,label,town,clean_text
0,1,BunMo Multi Item Stretchy Strings Fidget Toy 6...,182,83,6950,09/14/2021 7:47,0,anxiety,burundi,bunmo multi item stretchy string fidget toy pk...
1,2,@unkonfined The basic motivation behind our be...,983,915,12944,09/14/2021 7:31,0,anxiety,kenya,unkonfined basic motivation behind behavior bi...
2,3,@Munenekimathi5 @zablonorina1 Is your BP norma...,281,63,290,09/14/2021 7:30,0,anxiety,nairobi,munenekimathi zablonorina bp normal yes check ...
3,4,How do you deal with being anxious?\n\nShare y...,220,2052,37982,09/14/2021 7:02,0,anxiety,nairobi kenya,deal anxious share experience anxiety yvonne m...
4,5,"Did you know ‘dry spell’ causes anxiety, depre...",857,272,650,09/14/2021 7:00,2,general tweet,nairobi,know dry spell cause anxiety depression even s...


In [2]:
%%capture
!pip install nlpaug
!pip install transformers

In [3]:
# Loading the required augmentation Libraries
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf
from tqdm import tqdm
from sklearn.utils import shuffle

from nlpaug.util import Action

In [4]:
#Split the train and test data
from sklearn.model_selection import train_test_split
train,valid=train_test_split(df,test_size=0.20 , stratify = df['label'])
train.shape, valid.shape

((3337, 10), (835, 10))

In [25]:
valid.to_csv('VAlid dataset.csv')

In [5]:
# Check the size of our columns so as to know how to augment each column
train['label'].value_counts()

general tweet      2088
awareness           382
anxiety             286
depression          284
substance abuse     107
thoughts            103
stress               87
Name: label, dtype: int64

In [6]:
# Test text to check augmentation quality.
text = train.iloc[0]['tweet']
text

'Check out our youtube channel for our latest video on World Suicide Prevention Day!  '

In [7]:
# ContextualWordEmbsAug : Augmenter that apply operation (word level) to textual input based on contextual word embeddings.

aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased', action="insert")
augmented_text = aug.augment(text)


print('Original text \n',text,'\n Augmented text\n', augmented_text)

Original text 
 Check out our youtube channel for our latest video on World Suicide Prevention Day!   
 Augmented text
 can check out her our youtube channel there for our latest video session on world suicide prevention day!


In [8]:
# Creating a copy of  the dataset
df1 = df.copy(deep=True)

In [9]:
import numpy as np
#For anxiety, class = 0,

# Creating augmented text data to increase our training dataset by 214 entries

def augment_text(df1,samples=214,pr=0.2):   
    aug.aug_p=pr
    new_text=[]
    
    #selecting the  class samples
    df_n=df1[df1['label']=='anxiety'].reset_index(drop=True)

    ## data augmentation loop
    for i in tqdm(np.random.randint(0,len(df_n),samples)):
        
            text = df_n.iloc[i]['tweet']
            augmented_text = aug.augment(text)
            new_text.append(augmented_text)
    
    
    ## dataframe
    new=pd.DataFrame({'tweet':new_text,'label':'anxiety'})
    df1=shuffle(df1.append(new).reset_index(drop=True))
    return df1
   
train = augment_text(train)
print(train.shape, '\n\n')
train.head()

100%|██████████| 214/214 [02:38<00:00,  1.35it/s]

(3551, 10) 







Unnamed: 0.1,Unnamed: 0,tweet,friends_count,followers_count,statuses_count,created_at,retweet_count,label,town,clean_text
1691,328.0,"Sex doesn’t have to be a daily indulgence, lik...",1816,56476.0,61451.0,14-09-21 4:00,0,general tweet,nairobi,sex doesnt daily indulgence like prescription ...
865,97.0,I'm too anxious for this,512,1833.0,28515.0,09/12/2021 6:40,0,general tweet,mombasa,anxious
435,7.0,I get so anxious whenever I'm going anywhere o...,1427,12980.0,139126.0,09/17/2021 14:41,0,anxiety,nairobi,get anxious whenever go anywhere house lol
753,257.0,Steve Biko: I have no good things to say about...,158,3486941.0,602880.0,09-10-21 4:04,11,general tweet,nairobi,steve biko good thing say young people leaders...
307,274.0,"Dear,Good morning,and hope this weekend will w...",2030,4291.0,14939.0,2021-09-12 08:34:02,0,general tweet,nairobi,dear good morning hope weekend work gateway he...


In [10]:
import numpy as np
#For awareness, 

# Creating augmented text data to increase our training dataset by 118 entries

def augment_text(df1,samples=118,pr=0.2):   
    aug.aug_p=pr
    new_text=[]
    
    #selecting the  class samples
    df_n=df1[df1['label']=='awareness'].reset_index(drop=True)

    ## data augmentation loop
    for i in tqdm(np.random.randint(0,len(df_n),samples)):
        
            text = df_n.iloc[i]['clean_text']
            augmented_text = aug.augment(text)
            new_text.append(augmented_text)
    
    
    ## dataframe
    new=pd.DataFrame({'clean_text':new_text,'label':'awareness'})
    df1=shuffle(df1.append(new).reset_index(drop=True))
    return df1
   
train = augment_text(train)
print(train.shape, '\n\n')
train.head()

100%|██████████| 118/118 [00:38<00:00,  3.08it/s]

(3669, 10) 







Unnamed: 0.1,Unnamed: 0,tweet,friends_count,followers_count,statuses_count,created_at,retweet_count,label,town,clean_text
3168,52.0,@PlutusTheFarmer For me it’s the anxiety and h...,360,1264.0,16875.0,09/13/2021 15:20,0,general tweet,kenya,plutusthefarmer anxiety deal rude ppl always g...
1596,31.0,@RailaOdinga Thank you Baba for coming to our ...,78,90.0,306.0,2021-09-16 16:15:33,0,general tweet,nairobi,railaodinga thank baba come aid almost die let...
1738,217.0,@gufydox Call a psychologist you know and trus...,1345,757.0,1401.0,09/11/2021 1:55,0,anxiety,nairobi,gufydox call psychologist know trust especiall...
2972,328.0,We need to come together and end stima related...,213,159.0,221.0,9/10/2021 11:05,0,awareness,nairobi,need come together end stima relate suicide
373,32.0,@self_essteem Doing drugs,882,420.0,2727.0,09/17/2021 15:01,0,general tweet,nairobi,self essteem drug


In [11]:
print(train.shape)


(3669, 10)


In [12]:
#For depression

# Creating augmented text data to increase our training dataset by 216 entries

def augment_text(df1,samples=216,pr=0.2):  
    aug.aug_p=pr
    new_text=[]
    
    #selecting the  class samples
    df_n=df1[df1['label']=='depression'].reset_index(drop=True)

    ## data augmentation loop
    for i in tqdm(np.random.randint(0,len(df_n),samples)):
        
            text = df_n.iloc[i]['tweet']
            augmented_text = aug.augment(text)
            new_text.append(augmented_text)
    
    
    ## dataframe
    new=pd.DataFrame({'tweet':new_text,'label':'depression'})
    df1=shuffle(df1.append(new).reset_index(drop=True))
    return df1
   
train = augment_text(train)
print(train.shape, '\n\n')
train.head()

100%|██████████| 216/216 [03:06<00:00,  1.16it/s]

(3885, 10) 







Unnamed: 0.1,Unnamed: 0,tweet,friends_count,followers_count,statuses_count,created_at,retweet_count,label,town,clean_text
1104,150.0,Proper mental health is vital for your proper ...,898.0,949.0,438.0,09/15/2021 4:15,5.0,awareness,fort-portal,proper mental health vital proper function dai...
3599,,@ sg4devpt a. 3 anxieties for me manifests thr...,,,,,,anxiety,,
3835,,i'm depressed about means i'm half funny till ...,,,,,,depression,,
194,278.0,Women should be the last stuff to give you Stress,121.0,129.0,2030.0,2021-09-12 06:11:49,3.0,stress,nakuru,woman last stuff give stress
2773,62.0,"@Wordslinger__ Thank you, thank you ❤️❤️❤️\n\n...",845.0,3225.0,39889.0,09/13/2021 11:16,0.0,general tweet,kenya,wordslinger thank thank mind need calm though ...


In [13]:
print(train.shape)


(3885, 10)


In [14]:
#For substance abuse
# Creating augmented text data to increase our training dataset by 397 entries

def augment_text(df1,samples=393,pr=0.2):   #70 aurgumented data
    aug.aug_p=pr
    new_text=[]
    
    #selecting the  class samples
    df_n=df1[df1['label']=='substance abuse'].reset_index(drop=True)

    ## data augmentation loop
    for i in tqdm(np.random.randint(0,len(df_n),samples)):
        
            text = df_n.iloc[i]['tweet']
            augmented_text = aug.augment(text)
            new_text.append(augmented_text)
    
    
    ## dataframe
    new=pd.DataFrame({'tweet':new_text,'label':'substance abuse'})
    df1=shuffle(df1.append(new).reset_index(drop=True))
    return df1
   
train = augment_text(train)
print(train.shape, '\n\n')
train.head()

100%|██████████| 393/393 [05:28<00:00,  1.20it/s]

(4278, 10) 







Unnamed: 0.1,Unnamed: 0,tweet,friends_count,followers_count,statuses_count,created_at,retweet_count,label,town,clean_text
2357,257.0,@syoxxx @OleItumbi @WilliamsRuto I understand ...,188.0,79.0,439.0,2021-09-12 15:27:03,0.0,general tweet,nairobi ke,syoxxx oleitumbi williamsruto understand bro g...
182,,,,,,,,awareness,,want justice came today stop dwell past think ...
583,515.0,I don't need DRUGS because I got the most HIGH🤲,5443.0,6521.0,1842.0,09-10-21 20:33,2.0,general tweet,kenya,need drug get high
4203,,2. agenda two from young people. drug & amp ; ...,,,,,,substance abuse,,
2033,144.0,@bevalynekwambo3 And you big number depressed,2300.0,3662.0,21037.0,08-09-21 9:30,0.0,depression,nairobi,bevalynekwambo big number depress


In [15]:
#For thoughts


# Creating augmented text data to increase our training dataset by 397 entries

def augment_text(df1,samples=397,pr=0.2):   #70 aurgumented data
    aug.aug_p=pr
    new_text=[]
    
    #selecting the  class samples
    df_n=df1[df1['label']=='thoughts'].reset_index(drop=True)

    ## data augmentation loop
    for i in tqdm(np.random.randint(0,len(df_n),samples)):
        
            text = df_n.iloc[i]['tweet']
            augmented_text = aug.augment(text)
            new_text.append(augmented_text)
    
    
    ## dataframe
    new=pd.DataFrame({'tweet':new_text,'label':'thoughts'})
    df1=shuffle(df1.append(new).reset_index(drop=True))
    return df1
   
train = augment_text(train)
print(train.shape, '\n\n')
train.head()

100%|██████████| 397/397 [05:32<00:00,  1.20it/s]

(4675, 10) 







Unnamed: 0.1,Unnamed: 0,tweet,friends_count,followers_count,statuses_count,created_at,retweet_count,label,town,clean_text
4269,52.0,@PlutusTheFarmer For me it’s the anxiety and h...,360.0,1264.0,16875.0,09/13/2021 15:20,0.0,general tweet,kenya,plutusthefarmer anxiety deal rude ppl always g...
90,,now that it'bal s not scientifically first pro...,,,,,,depression,,
730,584.0,@muthumbinm This client needs to be put on sui...,523.0,694.0,14536.0,9/9/2021 11:39,0.0,general tweet,nairobi,muthumbinm client need put suicide watch texts...
1258,,the most frequent of routine diagnosis of term...,,,,,,substance abuse,,
1535,,: did you know most of the athletes right espe...,,,,,,depression,,


In [16]:
#For stress

# Creating augmented text data to increase our training dataset by 413 entries

def augment_text(df1,samples=413,pr=0.2):   #70 aurgumented data
    aug.aug_p=pr
    new_text=[]
    
    #selecting the  class samples
    df_n=df1[df1['label']=='stress'].reset_index(drop=True)

    ## data augmentation loop
    for i in tqdm(np.random.randint(0,len(df_n),samples)):
        
            text = df_n.iloc[i]['tweet']
            augmented_text = aug.augment(text)
            new_text.append(augmented_text)
    
    
    ## dataframe
    new=pd.DataFrame({'tweet':new_text,'label':'stress'})
    df1=shuffle(df1.append(new).reset_index(drop=True))
    return df1
   
train = augment_text(train)
print(train.shape, '\n\n')
train.head()

100%|██████████| 413/413 [05:32<00:00,  1.24it/s]

(5088, 10) 







Unnamed: 0.1,Unnamed: 0,tweet,friends_count,followers_count,statuses_count,created_at,retweet_count,label,town,clean_text
1348,,@ high mikealfred anxiety ; levels spiking,,,,,,anxiety,,
2024,100.0,Found this is my music library. How fucking cl...,401.0,837.0,3639.0,09/14/2021 18:23,0.0,general tweet,nairobi,find music library fucking classic still love ...
1657,165.0,looks like the effects of burnout...played too...,2052.0,1024.0,38944.0,2021-09-14 10:51:59,0.0,stress,nairobi,look like effect burnout play much club countr...
4671,360.0,#KomarockModern Health Care Facility stands wi...,9543.0,77549.0,129884.0,2021-09-10 09:30:06,13.0,general tweet,kisumu,komarockmodern health care facility stand toa ...
1002,479.0,The persistent reported traumatic events such ...,257.0,264.0,2764.0,09/10/2021 6:39,0.0,depression,kenya,persistent report traumatic event violence dis...


In [17]:
df_gen_tweet=df[df['label']=='general tweet']
df_gen_tweet_under = df_gen_tweet.sample(500)

In [18]:
train = train[train['label'] != 'general tweet']

In [19]:
# Previewing the train dataset
train.columns

Index(['Unnamed: 0', 'tweet', 'friends_count', 'followers_count',
       'statuses_count', 'created_at', 'retweet_count', 'label', 'town',
       'clean_text'],
      dtype='object')

In [20]:
# Check the number of entries for each disorder in the train dataset
train['label'].value_counts()

stress             500
awareness          500
thoughts           500
depression         500
substance abuse    500
anxiety            500
Name: label, dtype: int64

In [21]:
train = pd.concat([df_gen_tweet_under,train], axis=0,ignore_index=True)

In [23]:
# Check the number of entries for each disorder in the train dataset
train['label'].value_counts()

stress             500
awareness          500
thoughts           500
depression         500
substance abuse    500
general tweet      500
anxiety            500
Name: label, dtype: int64

In [26]:
# Previewing the train dataset
train.columns

Index(['Unnamed: 0', 'tweet', 'friends_count', 'followers_count',
       'statuses_count', 'created_at', 'retweet_count', 'label', 'town',
       'clean_text'],
      dtype='object')

In [27]:
train_1=train[['tweet','label']]

Tweet Preprocessing

In [32]:
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
#for text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
#for word embedding
import gensim
from gensim.models import Word2Vec
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.util import ngrams
from spacy.lang.en import English
nlp = English()
import spacy
from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [39]:
train_1['tweet']=train_1['tweet'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [40]:
# Remove url
#removing the URL links
example="New competition launched :https://www.kaggle.com/c/nlp-getting-started"
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

remove_URL(example)
train_1['tweet']=train_1['tweet'].apply(lambda x : remove_URL(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [38]:
train_1.dtypes

tweet    object
label    object
dtype: object

In [36]:
#convert to lowercase, strip and remove punctuations
def preprocess(text):
    text = text.lower() 
    text=text.strip()  
    text=re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    text = re.sub("@\S+", "", text)
    re.sub("\$", "", text)
    text = re.sub("https?:\/\/.*[\r\n]*", "", text)
    re.sub("#", "", text)
    return text


#tokenizer, pos tagging and entity recognition

 
# STOPWORD REMOVAL
def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)
#LEMMATIZATION
# Initialize the lemmatizer
wl = WordNetLemmatizer()
 
# This is a helper function to map NTLK position tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
# Tokenize the sentence
def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) # Get position tags
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
    return " ".join(a)

In [41]:

def finalpreprocess(string):
    return lemmatizer(stopword(preprocess(string)))
train_1['clean_text'] = train_1['tweet'].apply(lambda x: finalpreprocess(x))
# df = df.drop(['tweet'], axis= 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [42]:
train_1


Unnamed: 0,tweet,label,clean_text
0,"Lord, this is the only anxiety and stress I ne...",general tweet,lord anxiety stress need
1,@uhruganda we envision to empowering and tran...,general tweet,uhruganda envision empower transform life peop...
2,@jwkhasndi Uzee. Age is a big factor in these ...,general tweet,jwkhasndi uzee age big factor thing huwezani n...
3,@maijathemzungu I flew a lot as a kid ...I dun...,general tweet,maijathemzungu fly lot kid dunno happen turn s...
4,8pm #MazungumzoWaziWazi stress ya leo\n@Radi...,general tweet,pm mazungumzowaziwazi stress ya leo radiojambo...
...,...,...,...
3495,"@iamjojo All of it, sis. The directionlessness...",depression,iamjojo si directionlessness depression pile b...
3496,idk why don people disturb @ your joyjmurraya ...,depression,idk people disturb joyjmurraya see clapbacks m...
3497,OVERCOME ANXIETY \n\nTake a time-out\n\nEat we...,anxiety,overcome anxiety take time eat well balanced m...
3498,@tashmitambo @MisterAlbie Stress/depression. I...,stress,tashmitambo misteralbie stress depression grey...


In [43]:
train_1.to_csv('Train_augmented_SD.csv')