## Depression in Tweets

In [1]:
# import nltk library
import nltk; nltk.download('punkt')
from nltk import sent_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize.treebank import TreebankWordTokenizer

# import stopword libraries
nltk.download('stopwords'); from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words

# import other libraries
import pandas as pd
import numpy as np
import string
#from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import *
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

# import word embedding library
#import glove_helper

# import helper libraries
import collections
from common import utils, vocabulary

#display multiple results per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#export models
from sklearn.externals import joblib

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/benthompson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/benthompson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [2]:
#read in tweets
df = pd.DataFrame.from_csv('../depression_tweets.csv', header=None, parse_dates=True, infer_datetime_format=True)

In [3]:
#add index
df = df.reset_index()

#set column names
df.columns = ['date','tweet_id', 'handle', 'id', 'tweet', 'language', 'device', 'notes', 'notes_2']

In [4]:
#look at data
df.head(5)

Unnamed: 0,date,tweet_id,handle,id,tweet,language,device,notes,notes_2
0,2018-04-05 19:14:48,981973445616525312,Haldol,816793117785542656,Currently I am on 150 mg of hydroxyzine for in...,en,Twitter for iPhone,,
1,2018-04-05 19:14:48,981973444723064832,Rick O,3192532759,Integrated behavioral health for POLICE. Treat...,en,Twitter for iPhone,,
2,2018-04-05 19:14:47,981973443988996096,olivia 🧝🏽‍♀️ボス,1321438920,RT @DevinnJay: I won’t allow depression to fuc...,en,Twitter for iPhone,,
3,2018-04-05 19:14:47,981973443154505728,LeFrenchNeuropsy,2887994266,RT @LePsylab: For science ! Un questionnaire p...,fr,Twitter Web Client,,
4,2018-04-05 19:14:45,981973435705421826,GEEZ,311289251,I lost my brova I fell deep in depression!,en,Twitter for Android,,


In [5]:
#how man non-distinct tweets
len(df)

530773

In [6]:
#filter to english only
df = df[df['language'] == 'en']

In [7]:
#how many tweets now
len(df)

484663

In [8]:
#any users w/lots of tweets that might skew model?
#not any that seem too high
df['handle'].value_counts().head(5)

.                    1073
Aiden Hatfield        460
ً                     445
Documentary           313
In Music We Trust     312
Name: handle, dtype: int64

In [9]:
#how many distinct tweets
len(df.tweet.unique())

186362

In [10]:
#make distinct tweets the df
df = pd.DataFrame(df.tweet.unique())

In [11]:
#rename columns
df.columns = ['tweets']

In [13]:
#export sample to check quality
# pd.options.display.max_colwidth = 1000
# df_sample = df.sample(n=100)
# df_sample.to_csv('../sample_100_depression_tweets.csv')

In [16]:
#look up specific tweet
pd.options.display.max_colwidth = 10000
df.iloc[45055]

tweets    Y’all be using mental illness as a way to justify your bitchy behavior and it’s honestly a no from fucking me. Depr… https://t.co/YzlhNSV9RA
Name: 45055, dtype: object

In [98]:
#create column on 1's
x = [1]
x = x * len(df)
df['target'] = x

In [99]:
df.head(5)

Unnamed: 0,tweets,target
0,Currently I am on 150 mg of hydroxyzine for in...,1
1,Integrated behavioral health for POLICE. Treat...,1
2,RT @DevinnJay: I won’t allow depression to fuc...,1
3,I lost my brova I fell deep in depression!,1
4,RT @peachesfrfr: so there i am depression all...,1


## Bring in random tweets

In [13]:
#read in tweets
df_2 = pd.DataFrame.from_csv('../random_tweets_2018-06-15.csv', header=None)

In [14]:
#look at data
df_2.head()

"RT @EmmanuelMacron: France, Germany, and the UK regret the U.S. decision to leave the JCPOA. The nuclear non-proliferation regime is at sta…"
RT @glossoIogy: Gemini: bitch\nCancer: blocked \nGemini: unblock me I need to tell you something \nCancer: what \nGemini: bitch
RT @JDiamond1: Iranian President Hassan Rouhani: Iran will abide by its JCPOA commitments despite US withdrawal. Says agreement now between…
"RT @PaulLee85: @Redheaded_Jenn @EGh69 Jenn, is my white knight. But apparently she isn’t American, since she’s more interested I. What goes…"
Rereading it now too I realize I was being hard on myself but MAN it'S REALLY NICE TO GET GOOD RECEPTION ON STUFF Y… https://t.co/lUlaFc1ztu


In [15]:
#how many
len(df_2)

96206

In [16]:
#give index
df_2 = df_2.reset_index()

#give column name
df_2.columns = ['tweets']

In [17]:
#how many distinct tweets
len(df_2.tweets.unique())

78329

In [18]:
#Make dataframe of unique
df_2 = pd.DataFrame(df_2.tweets.unique())

#give column name
df_2.columns = ['tweets']

In [19]:
#make all tweets lowercase
df_2['tweets'] = df_2['tweets'].str.lower()
df_2.columns = ['tweets']

In [20]:
df_2.head()

Unnamed: 0,tweets
0,"rt @emmanuelmacron: france, germany, and the u..."
1,rt @glossoiogy: gemini: bitch\ncancer: blocked...
2,rt @jdiamond1: iranian president hassan rouhan...
3,"rt @paullee85: @redheaded_jenn @egh69 jenn, is..."
4,rereading it now too i realize i was being har...


In [21]:
#check for tweets that use depression
df_2[(df_2['tweets'].str.contains('depressed') | df_2['tweets'].str.contains('depression'))]

#drop them
df_2.drop(df_2[(df_2.tweets.str.contains('depressed')) | (df_2.tweets.str.contains('depression'))].index, inplace=True)

Unnamed: 0,tweets
1241,rt @sophxthompson: it's #mentalhealthawareness...
1894,rt @duumb: commercial: 2 out of 3 people suffe...
1900,rt @depresseddarth: nobody respects me anymore...
2814,rt @heavybagofbones: bitches who got full scor...
6739,rt @kirebe16: i suffer from anxiety but lately...
7128,rt @evanedinger: after realising last week my ...
7300,"rt @depresseddarth: “sorry, wrong galaxy” http..."
8701,rt @notrllyhere: sometimes you just b depresse...
9750,rt @brysontiller: 1. i was depressed before i ...
10122,rt @depressionnote: what people need to unders...


In [22]:
#recheck length
len(df_2)

78233

In [23]:
#export to check quality
# df_2_sample = df_2.sample(n=100)
# df_2_sample.to_csv('../sample_100_random_tweets.csv')

In [110]:
#column of 0's
x = 0
x = x * len(df_2)

df_2['target'] = x

In [111]:
#balance classes
df_3 = df.sample(n=len(df_2))

In [112]:
# df_3.head()

In [113]:
#combine dfs
df = pd.concat([df_3,df_2])

In [115]:
len(df)

156466

In [116]:
#preprocess tweets
example_text="""'RT @techreview: A neural network can 
detect depression and mania in bipolar subjects 
by analyzing how they hold and tap on their smartphone…'"""

# tokenize
def tokenize_text(input_text):
    """
    Args: 
    input_text: a string representing an 
    individual review
        
    Returns:
    input_token: a list containing stemmed 
    tokens, with punctutations removed, for 
    an individual review
        
    """
    input_tokens=[]
        
    # Split sentence
    sents=sent_tokenize(input_text)
            
    # Split word
    for sent in sents:
        input_tokens+=TreebankWordTokenizer().tokenize(sent)
        
    return input_tokens


# canonicalize
def canonicalize_tokens(input_tokens):
    """
    Args:
    input_tokens: a list containing tokenized 
    tokens for an individual review
    
    Returns:
    input_tokens: a list containing canonicalized 
    tokens for an individual review
    
    """
    input_tokens=utils.canonicalize_words(input_tokens)
    return input_tokens


# preprocessor 
def preprocessor(raw_text):
    """
    Args:
    raw_text: a string representing an
    individual review
    
    Returns:
    preprocessed_text: a string representing 
    a preprocessed individual review
    
    """
    # tokenize
    tokens=tokenize_text(raw_text)
    
    # canonicalize
    canonical_tokens=canonicalize_tokens(tokens)
    
    # rejoin string
    preprocessed_text=(" ").join(canonical_tokens) 
    return preprocessed_text

# example data
#input_tokens=tokenize_text(example_text)
#print(input_tokens)

#canonical_tokens=canonicalize_tokens(input_tokens)
#print(canonical_tokens)

preprocessed_text=preprocessor(example_text) 
print(preprocessed_text)

'rt @ techreview : a neural network can detect depression and mania in bipolar subjects by analyzing how they hold and tap on their smartphone… '


In [117]:
# examine stopwords

# sklearn stopwords (frozenset)
sklearn_stopwords=stop_words.ENGLISH_STOP_WORDS
print("number of sklearn stopwords: %d" %(len(sklearn_stopwords)))
#print(sklearn_stopwords)

# nltk stopwords (list)
nltk_stopwords=stopwords.words("english")
print("number of nltk stopwords: %d" %(len(nltk_stopwords)))
#print(nltk_stopwords)

# combined sklearn, nltk, other stopwords (set)
total_stopwords=set(list(sklearn_stopwords.difference(set(nltk_stopwords)))+nltk_stopwords)

other_stopwords=["DG", "DGDG", "@", "rt", "'rt", "'", ":", "depression", "depressed"]
for w in other_stopwords:
    total_stopwords.add(w)
    
print("number of total stopwords: %d" %(len(total_stopwords)))

number of sklearn stopwords: 318
number of nltk stopwords: 179
number of total stopwords: 387


In [118]:
#look at review w/o stop words
new_review = []
for i in preprocessed_text.split():
    if i in total_stopwords:
        continue
    else:
        new_review.append(i)
        
print(new_review)

['techreview', 'neural', 'network', 'detect', 'mania', 'bipolar', 'subjects', 'analyzing', 'hold', 'tap', 'smartphone…']


In [119]:
#reset index
df = df.reset_index(drop=True)

In [120]:
#split into test, train before sampling to belance
# using recoded labels
#create train, test data
df['is_train'] = np.random.uniform(0,1, len(df)) <= .8

train_data, test_data = df[df['is_train'] == True], df[df['is_train'] == False]

# examine train, test shapes
print("train, test set size: %d, %d" %(len(train_data), len(test_data))) # train_data: 129023, test_data: 32256
print("")

# examine train set examples
print("example:")
print("tweet: %s" %(train_data.get_value(5,'tweets')))
print("label: %s" %(train_data.get_value(5,'target')))

train, test set size: 125221, 31245

example:


KeyError: 5

In [121]:
#check class balance
train_data['target'].value_counts()

0    62645
1    62576
Name: target, dtype: int64

In [122]:
print("example:")
print("tweet: %s" %(train_data.get_value(30,'tweets')))
print("label: %s" %(train_data.get_value(30,'target')))

example:
tweet: @akcgoros I send you hugs. It's really hard when you want to create something but depression won't let you  so you get depressed.
label: 1


## Logistic Regression

In [123]:
#build tf-idf model
vec=TfidfVectorizer(preprocessor=preprocessor, ngram_range=(1,3), stop_words=total_stopwords, max_features=10000)
vec_train_data=vec.fit_transform(train_data['tweets']) 
vec_test_data=vec.transform(test_data['tweets']) 

In [124]:
# train Logistic Regression
logit=LogisticRegression(penalty='l2')
logit.fit(vec_train_data, train_data['target'])
pred_labels=logit.predict(vec_test_data)
    
# assess model
f1=f1_score(test_data['target'], pred_labels, average="weighted") 
accuracy=accuracy_score(test_data['target'], pred_labels)
confusion=confusion_matrix(test_data['target'], pred_labels)
print("logistic regression f1 score: %.3f" %(f1))
print("logistic regression accuracy score: %.3f" %(accuracy))
print("logistic regression confusion matrix:")
print(confusion)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

logistic regression f1 score: 0.816
logistic regression accuracy score: 0.816
logistic regression confusion matrix:
[[13426  2162]
 [ 3584 12073]]


In [125]:
#get top words
#look at top 5 weights for each class
#get coefficients for all features
coef_sq = logit.coef_

#get index of top 5 absolute values for each class
weight_indx = np.argsort(coef_sq)[:, -20:]

#flatten so can use to look up wieghts
weight_indx = weight_indx.flatten()

#get coefficients based on index
weights = coef_sq[:, weight_indx]
 
#get words that match weights based on index
vocab = np.array(vec.get_feature_names())[weight_indx]

# make table
df = pd.DataFrame({'Weights of words that predict depression': weights[0]}
                  , index=vocab)
df

Unnamed: 0,Weights of words that predict depression
naps,5.409314
suicidal,5.41219
battling,5.694694
cures,5.7437
clinical,5.814887
nap,5.818789
sadness,5.941904
postpartum,6.029521
suffering,6.318642
tropical,6.431674


In [126]:
#try to make up an example journal
journal = """Today was wonderful. I had a strange interaction at the store. 
The cashier seemed irratated. I'm not sure what's going on but it makes me feel weird"""

#score test journal
vec_test_example=vec.transform([journal]) 
print("probability of class 0 and 1: ",logit.predict_proba(vec_test_example))

#get words and weights from test journal
word_idx = np.nonzero(vec_test_example)[1]
vocab = np.array(vec.get_feature_names())[word_idx]
weights = coef_sq[:, word_idx]
df = pd.DataFrame({'Weights of words in sample Journal': weights[0]}
                  , index=vocab)
df.sort_values(by='Weights of words in sample Journal')

probability of class 0 and 1:  [[ 0.43736519  0.56263481]]


Unnamed: 0,Weights of words in sample Journal
store,-1.171633
sure,-0.457797
interaction,-0.419084
wonderful,-0.374465
strange,0.326909
weird,0.392027
today,0.923644
makes feel,1.083485
going,1.660564
makes,1.798566


In [127]:
#export tfidf model
tfidf_file = 'tfidf_exported_model'
joblib.dump(vec, tfidf_file)

['tfidf_exported_model']

In [128]:
#export logistic regression
logistic_regression_file = 'logistic_regression_model'
joblib.dump(logit, logistic_regression_file)

['logistic_regression_model']

In [129]:
#test out exported models against prev sample journal
loaded_tfidf = joblib.load('tfidf_exported_model')
loaded_lr = joblib.load('logistic_regression_model')

#score test journal
export_test_example=loaded_tfidf.transform([journal]) 
print("probability of class 0 and 1: ",loaded_lr.predict_proba(export_test_example))


probability of class 0 and 1:  [[ 0.43736519  0.56263481]]
