## Depression in Tweets

In [1]:
# import nltk library
import nltk; nltk.download('punkt')
from nltk import sent_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize.treebank import TreebankWordTokenizer

# import stopword libraries
nltk.download('stopwords'); from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words

# import other libraries
import pandas as pd
import numpy as np
import string
#from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import *
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

# import word embedding library
#import glove_helper

# import helper libraries
import collections
from common import utils, vocabulary

#display multiple results per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#export models
from sklearn.externals import joblib

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/benthompson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/benthompson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [2]:
#read in tweets
df = pd.DataFrame.from_csv('../depression_subreddit_nondeleted_201801_06.csv')

In [3]:
#add index
df = df.reset_index()

In [4]:
#look at data
df.head(5)

Unnamed: 0,body
0,I was on 20mg fluoxetine for a while and it di...
1,"Wonderful. You are the poet amongst fools, you..."
2,&gt;Women dont like depressed n***ers \n\n\n\n...
3,Whoosh
4,I definitely hear this. I'm sorry you're feeli...


In [5]:
#how man non-distinct tweets
len(df)

311132

In [6]:
#create column on 1's
x = [1]
x = x * len(df)
df['target'] = x

In [7]:
#make all lowercase
df['body'] = df['body'].str.lower()

In [8]:
df.head(5)

Unnamed: 0,body,target
0,i was on 20mg fluoxetine for a while and it di...,1
1,"wonderful. you are the poet amongst fools, you...",1
2,&gt;women dont like depressed n***ers \n\n\n\n...,1
3,whoosh,1
4,i definitely hear this. i'm sorry you're feeli...,1


## Bring in random tweets

In [9]:
#read in tweets
df_2 = pd.DataFrame.from_csv('../random_tweets.csv', header=None)

In [10]:
#look at data
df_2.head()

True but she still cancelled tho.
"RT @roxxxdoxxx: when she said ""i gotta ask first"" i felt that 😫😅😂 https://t.co/BGPZqFLb9v"
appreciate this perfectly timed pic of me and catto pls https://t.co/GE5poooRcF
one of Beyoncé’s most underrated looks is the one from Jealous. don’t @ me
"""Once you create a system for censoring speech on the grounds that it is 'fake news' (even if it's parody, or sarca… https://t.co/EpuSUaK0UC"


In [11]:
#how many
len(df_2)

135177

In [12]:
#give index
df_2 = df_2.reset_index()

#give column name
df_2.columns = ['body']

In [13]:
#how many distinct tweets
len(df_2.body.unique())

111985

In [14]:
#Make dataframe of unique
df_2 = pd.DataFrame(df_2.body.unique())

#give column name
df_2.columns = ['body']

In [15]:
#make all tweets lowercase
df_2['body'] = df_2['body'].str.lower()
df_2.columns = ['body']

In [16]:
df_2.head()

Unnamed: 0,body
0,true but she still cancelled tho.
1,"rt @roxxxdoxxx: when she said ""i gotta ask fir..."
2,appreciate this perfectly timed pic of me and ...
3,one of beyoncé’s most underrated looks is the ...
4,"""once you create a system for censoring speech..."


In [17]:
#check for tweets that use depression
df_2[(df_2['body'].str.contains('depressed') | df_2['body'].str.contains('depression'))]

#drop them
df_2.drop(df_2[(df_2.body.str.contains('depressed')) | (df_2.body.str.contains('depression'))].index, inplace=True)

Unnamed: 0,body
1769,rt @nickhansonmn: hey sorry i’ve been distant ...
1918,rt @matrix_reioaded: ahh i’m depressed... but ...
2568,rt @caucasianjames: on tinder depressed
2763,rt @softyoonle: hi this would really help me a...
3507,rt @iamsofiadg: philippines 🇵🇭 \nneed someone ...
4159,rt @depressionnote: warning signs of depressio...
4268,rt @fuxksalliemae: alot of nigerians are strug...
4547,rt @daitonreed: you do not have to come this h...
5501,rt @sionesnow: when i read the first sentence ...
6143,i can't help the fact that i make all my chara...


In [18]:
#recheck length
len(df_2)

111847

In [19]:
#export to check quality
# df_2_sample = df_2.sample(n=100)
# df_2_sample.to_csv('../sample_100_random_tweets.csv')

In [20]:
#column of 0's
x = 0
x = x * len(df_2)

df_2['target'] = x

In [21]:
#balance classes
df_3 = df.sample(n=len(df_2))

In [22]:
# df_3.head()

In [23]:
#combine dfs
df = pd.concat([df_3,df_2])

In [24]:
len(df)

223694

In [25]:
df.head()

Unnamed: 0,body,target
60649,yeah this is something i struggle with all the...,1
127446,"being on the reverse, the thing i want more th...",1
127017,know how it feels man. for the past 4 months i...,1
88863,i have a genetic condition of gaining weight. ...,1
125979,i am over it nigga i’m at the gym keep it moving,1


In [28]:
#preprocess tweets
example_text="""'RT @techreview: A neural network can 
detect depression and mania in bipolar subjects 
by analyzing how they hold and tap on their smartphone…'"""

# tokenize
def tokenize_text(input_text):
    """
    Args: 
    input_text: a string representing an 
    individual review
        
    Returns:
    input_token: a list containing stemmed 
    tokens, with punctutations removed, for 
    an individual review
        
    """
    input_tokens=[]
        
    # Split sentence
    sents=sent_tokenize(input_text)
            
    # Split word
    for sent in sents:
        input_tokens+=TreebankWordTokenizer().tokenize(sent)
        
    return input_tokens


# canonicalize
def canonicalize_tokens(input_tokens):
    """
    Args:
    input_tokens: a list containing tokenized 
    tokens for an individual review
    
    Returns:
    input_tokens: a list containing canonicalized 
    tokens for an individual review
    
    """
    input_tokens=utils.canonicalize_words(input_tokens)
    return input_tokens


# preprocessor 
def preprocessor(raw_text):
    """
    Args:
    raw_text: a string representing an
    individual review
    
    Returns:
    preprocessed_text: a string representing 
    a preprocessed individual review
    
    """
    # tokenize
    tokens=tokenize_text(raw_text)
    
    # canonicalize
    canonical_tokens=canonicalize_tokens(tokens)
    
    # rejoin string
    preprocessed_text=(" ").join(canonical_tokens) 
    return preprocessed_text

# example data
#input_tokens=tokenize_text(example_text)
#print(input_tokens)

#canonical_tokens=canonicalize_tokens(input_tokens)
#print(canonical_tokens)

preprocessed_text=preprocessor(example_text) 
print(preprocessed_text)

'rt @ techreview : a neural network can detect depression and mania in bipolar subjects by analyzing how they hold and tap on their smartphone… '


In [29]:
# examine stopwords

# sklearn stopwords (frozenset)
sklearn_stopwords=stop_words.ENGLISH_STOP_WORDS
print("number of sklearn stopwords: %d" %(len(sklearn_stopwords)))
#print(sklearn_stopwords)

# nltk stopwords (list)
nltk_stopwords=stopwords.words("english")
print("number of nltk stopwords: %d" %(len(nltk_stopwords)))
#print(nltk_stopwords)

# combined sklearn, nltk, other stopwords (set)
total_stopwords=set(list(sklearn_stopwords.difference(set(nltk_stopwords)))+nltk_stopwords)

other_stopwords=["DG", "DGDG", "@", "rt", "'rt", "'", ":", "depression", "depressed", "RT"]
for w in other_stopwords:
    total_stopwords.add(w)
    
print("number of total stopwords: %d" %(len(total_stopwords)))

number of sklearn stopwords: 318
number of nltk stopwords: 179
number of total stopwords: 388


In [30]:
#look at review w/o stop words
new_review = []
for i in preprocessed_text.split():
    if i in total_stopwords:
        continue
    else:
        new_review.append(i)
        
print(new_review)

['techreview', 'neural', 'network', 'detect', 'mania', 'bipolar', 'subjects', 'analyzing', 'hold', 'tap', 'smartphone…']


In [31]:
#reset index
df = df.reset_index(drop=True)

In [35]:
#split into test, train before sampling to belance
# using recoded labels
#create train, test data
df['is_train'] = np.random.uniform(0,1, len(df)) <= .8

train_data, test_data = df[df['is_train'] == True], df[df['is_train'] == False]

# examine train, test shapes
print("train, test set size: %d, %d" %(len(train_data), len(test_data)))
print("")

# examine train set examples
print("example:")
print("body: %s" %(train_data.get_value(10,'body')))
print("label: %s" %(train_data.get_value(10,'target')))

train, test set size: 178567, 45127

example:
body: yes i literally want to vanish to another world at this point. i feel like everything is pointless, like i want to be productive person but at the end of the day for what? i don't care. 
label: 1


In [36]:
#check class balance
train_data['target'].value_counts()

0    89332
1    89235
Name: target, dtype: int64

In [38]:
print("example:")
print("body: %s" %(train_data.get_value(32,'body')))
print("label: %s" %(train_data.get_value(32,'target')))

example:
body: it was consensual he asked first
label: 1


## Logistic Regression

In [40]:
#build tf-idf model
vec=TfidfVectorizer(preprocessor=preprocessor, ngram_range=(1,3), stop_words=total_stopwords, max_features=10000)
vec_train_data=vec.fit_transform(train_data['body']) 
vec_test_data=vec.transform(test_data['body']) 

In [41]:
# train Logistic Regression
logit=LogisticRegression(penalty='l2')
logit.fit(vec_train_data, train_data['target'])
pred_labels=logit.predict(vec_test_data)
    
# assess model
f1=f1_score(test_data['target'], pred_labels, average="weighted") 
accuracy=accuracy_score(test_data['target'], pred_labels)
confusion=confusion_matrix(test_data['target'], pred_labels)
print("logistic regression f1 score: %.3f" %(f1))
print("logistic regression accuracy score: %.3f" %(accuracy))
print("logistic regression confusion matrix:")
print(confusion)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

logistic regression f1 score: 0.896
logistic regression accuracy score: 0.896
logistic regression confusion matrix:
[[19983  2532]
 [ 2155 20457]]


In [70]:
#try Keras
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [45]:
#create integer encoding of docs
vocab_size = 100
encoded_docs = [one_hot(d, vocab_size) for d in df['tweets']]

In [71]:
#try tokenizer instead
t = Tokenizer()
t.fit_on_texts(df['tweets'])
vocab_t_size = len(t.word_index) + 1

In [72]:
#create sequence
encoded_t_docs = t.texts_to_sequences(df['tweets'])

In [73]:
# pad docs to equals size
pad = 40
# padded_docs = pad_sequences(encoded_docs, maxlen=pad, padding='post')
padded_t_docs = pad_sequences(encoded_t_docs, maxlen=pad, padding='post')

In [74]:
padded_docs[11105]

array([85,  8, 87,  8, 14, 82, 55, 28, 34, 27, 27, 92, 81, 55, 62, 34, 30,
       11, 73, 47, 90, 56, 53, 82,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0], dtype=int32)

In [143]:
from sklearn.model_selection import train_test_split
# X_train,X_test,Y_train,Y_test = train_test_split(padded_docs, df['target'], test_size=.8)
X_train,X_test,Y_train,Y_test = train_test_split(padded_t_docs, df['target'], test_size=.8)

In [144]:
X_train.shape

(44738, 40)

In [141]:
# create the model
embedding_size = 32

model = Sequential()
# model.add(Embedding(vocab_size, embedding_size, input_length=pad))
model.add(Embedding(vocab_t_size, embedding_size, input_length=pad))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 40, 32)            11611520  
_________________________________________________________________
flatten_4 (Flatten)          (None, 1280)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 250)               320250    
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 251       
Total params: 11,932,021
Trainable params: 11,932,021
Non-trainable params: 0
_________________________________________________________________
None


In [142]:
# Fit the model
epochs=3
batch_size=128

history = model.fit(X_train, Y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.2)

# Final evaluation of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Train on 35790 samples, validate on 8948 samples
Epoch 1/3
Epoch 2/3

KeyboardInterrupt: 

In [129]:
keras_journal = ["Sometime I feel very alone and anxious"]

In [130]:
encoded_journal = t.texts_to_sequences(keras_journal)

In [131]:
encoded_journal

[[5874, 8, 85, 163, 332, 9, 2197]]

In [132]:
#pad
pad = 40
padded_journal = pad_sequences(encoded_journal, maxlen=pad, padding='post')

In [133]:
padded_journal

array([[5874,    8,   85,  163,  332,    9, 2197,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0]], dtype=int32)

In [138]:
ynew = model.predict_proba(padded_journal)



In [140]:
ynew

array([[ 0.44699621]], dtype=float32)

In [42]:
#get top words
#look at top 5 weights for each class
#get coefficients for all features
coef_sq = logit.coef_

#get index of top 5 absolute values for each class
weight_indx = np.argsort(coef_sq)[:, -20:]

#flatten so can use to look up wieghts
weight_indx = weight_indx.flatten()

#get coefficients based on index
weights = coef_sq[:, weight_indx]
 
#get words that match weights based on index
vocab = np.array(vec.get_feature_names())[weight_indx]

# make table
df = pd.DataFrame({'Weights of words that predict depression': weights[0]}
                  , index=vocab)
df

Unnamed: 0,Weights of words that predict depression
medication,4.153974
advice,4.159002
relate,4.308234
suicidal,4.33576
video games,4.342867
helped,4.386488
normal,4.563139
anxiety,4.563813
meds,4.659559
www,4.991141


In [43]:
#try to make up an example journal
journal = """Today was wonderful. I had a strange interaction at the store. 
The cashier seemed irratated. I'm not sure what's going on but it makes me feel weird"""

#score test journal
vec_test_example=vec.transform([journal]) 
print("probability of class 0 and 1: ",logit.predict_proba(vec_test_example))

#get words and weights from test journal
word_idx = np.nonzero(vec_test_example)[1]
vocab = np.array(vec.get_feature_names())[word_idx]
weights = coef_sq[:, word_idx]
df = pd.DataFrame({'Weights of words in sample Journal': weights[0]}
                  , index=vocab)
df.sort_values(by='Weights of words in sample Journal')

probability of class 0 and 1:  [[ 0.12156455  0.87843545]]


Unnamed: 0,Weights of words in sample Journal
today,-1.311699
sure going,-0.339758
store,-0.043489
cashier,0.342796
wonderful,0.557809
strange,1.119725
weird,1.138772
going,1.249253
makes feel,1.514279
interaction,1.587885


In [127]:
#export tfidf model
tfidf_file = 'tfidf_exported_model'
joblib.dump(vec, tfidf_file)

['tfidf_exported_model']

In [128]:
#export logistic regression
logistic_regression_file = 'logistic_regression_model'
joblib.dump(logit, logistic_regression_file)

['logistic_regression_model']

In [129]:
#test out exported models against prev sample journal
loaded_tfidf = joblib.load('tfidf_exported_model')
loaded_lr = joblib.load('logistic_regression_model')

#score test journal
export_test_example=loaded_tfidf.transform([journal]) 
print("probability of class 0 and 1: ",loaded_lr.predict_proba(export_test_example))


probability of class 0 and 1:  [[ 0.43736519  0.56263481]]
