In [2]:
import pickle
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Data Preprocessing


In [35]:
def preprocess_text(text):
    #returns the string obtained by replacing HTML tags to "" i.e., remove the HTML tags
    
    text = re.sub('<[^>]*>', '', text)
    #re.search() to find the first match for a pattern. findall() finds *all* the matches and returns them as a list of strings
    #remove all the non-word characters
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    #convert the text to lowercase
    text = re.sub('[\W]+', ' ', text.lower())
    
    text = text+' '.join(emoticons).replace('-', '') 
    return text

text = "Hello, this is Neha. How are you doing? My email is neha@gmail.com:: Yes!!"
print(preprocess_text(text))

hello this is neha how are you doing my email is neha gmail com yes 


In [38]:
#Applying the preprocess_text function to our suicidal dataset
#Using tdqm : percentage of progress made in accomplishing a task.
tqdm.pandas()
df = pd.read_csv('suicidal_data.csv')
df['tweet'] = df['tweet'].progress_apply(preprocess_text)

100%|███████████████████████████████████████████████████████████████████████████| 9119/9119 [00:00<00:00, 14029.49it/s]


In [39]:
#Tokenize the text by word using split()
#Word stemming performed to convert the word to its root form
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [41]:
#remove stopwords in the text
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [42]:
#Testing on a sentence
[w for w in tokenizer_porter('I like loving myself and I love people who have loved themselves') if w not in stop]

['I', 'like', 'love', 'I', 'love', 'peopl', 'love', 'themselv']

# Vectorization

## Using the hashing vectorizer

#### convert a collection of text documents to a matrix of token occurrences. If your are looking to get term frequencies weighted by their relative importance (IDF) then Tfidfvectorizer is what you should use. If you need the raw counts or normalized counts (term frequency), then you should use CountVectorizer or HashingVectorizer.

In [9]:
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\(|D|P)',text.lower())
    text = re.sub('[\W]+', ' ', text.lower())
    text += ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in tokenizer_porter(text) if w not in stop]
    return tokenized

In [44]:
from sklearn.feature_extraction.text import HashingVectorizer

#Hashing vectorizer object
vect = HashingVectorizer(decode_error='ignore', n_features=2**21, 
                         preprocessor=None,tokenizer=tokenizer)

## Building the Model

In [11]:
# stochastic gradient descent classifier algorithm.
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='log', random_state=1)

## Training

In [12]:

X = df["tweet"].to_list()
y = df['label']

In [13]:
#splitting our dataset - 80% for training, 20% for testing
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,
                                                 y,
                                                 test_size=0.20,
                                                 random_state=0)

In [14]:
#transforming the text data to vectors with the Hashing Vectorizer we created earlier
X_train = vect.transform(X_train)
X_test = vect.transform(X_test)


In [17]:
#fit the data to the algorithm
classes = np.array([0, 1])
clf.partial_fit(X_train, y_train,classes=classes)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=1, shuffle=True, tol=0.001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [18]:
#Accuracy test on our dataset
print('Accuracy: %.3f' % clf.score(X_test, y_test))


Accuracy: 0.907


In [45]:
#updating the model with prediction
clf = clf.partial_fit(X_test, y_test)

# Testing and Predicting 

In [48]:
label = {0:'negative', 1:'positive'}
example = ["I'm going to die and I hate my life as fuck"]
X = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%'
      %(label[clf.predict(X)[0]],np.max(clf.predict_proba(X))*100))

Prediction: negative
Probability: 51.33%


In [49]:
label = {0:'negative', 1:'positive'}
example = ["I'm depressed and lonely.... Please help me."]
X = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%'
      %(label[clf.predict(X)[0]],np.max(clf.predict_proba(X))*100))

Prediction: positive
Probability: 86.94%


In [50]:
label = {0:'negative', 1:'positive'}
example = ["I want to sleep today."]
X = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%'
      %(label[clf.predict(X)[0]],np.max(clf.predict_proba(X))*100))

Prediction: negative
Probability: 92.28%
