In [1]:
pip install imblearn --quiet

In [2]:
# Import libraries and packages
import numpy as np
import pandas as pd
import re 
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
nltk.download('punkt')

import gensim
import gensim.downloader as api

from imblearn.under_sampling import RandomUnderSampler

np.random.seed(42)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mathildelundsberg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Loading data

In [3]:
# Import data
raw = pd.read_csv('mental_disorders_reddit.csv')
data = raw[['title','selftext','subreddit']] 
data.head()

Unnamed: 0,title,selftext,subreddit
0,Life is so pointless without others,Does anyone else think the most important part...,BPD
1,Cold rage?,Hello fellow friends üòÑ\n\nI'm on the BPD spect...,BPD
2,I don‚Äôt know who I am,My [F20] bf [M20] told me today (after I said ...,BPD
3,HELP! Opinions! Advice!,"Okay, I‚Äôm about to open up about many things I...",BPD
4,help,[removed],BPD


In [4]:
# Key info on the data 
print("\033[1mDataframe info:\033[0m")
print(data.info())
print("\n\033[1mUnique values:\033[0m")
print(data.nunique())
print("\n\033[1mNumber of duplicate rows:\033[0m")
print(data.duplicated().sum())
#print("\n\033[1mNumber of posts for each subreddit label:\033[0m")
#print(data['subreddit'].value_counts())

[1mDataframe info:[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 701787 entries, 0 to 701786
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   title      701741 non-null  object
 1   selftext   668096 non-null  object
 2   subreddit  701787 non-null  object
dtypes: object(3)
memory usage: 16.1+ MB
None

[1mUnique values:[0m
title        621001
selftext     563917
subreddit         6
dtype: int64

[1mNumber of duplicate rows:[0m
9825


In [5]:
# Investigate class imbalance
counts = data['subreddit'].value_counts()
ratios = data['subreddit'].value_counts(normalize=True)
pd.DataFrame({"Number of posts": counts, "%": ratios*100, 
              "ratio": ratios/min(data['subreddit'].value_counts(normalize=True))}).round({'%': 2, 'ratio': 1})

Unnamed: 0,Number of posts,%,ratio
BPD,241116,34.36,9.5
Anxiety,173990,24.79,6.9
depression,156972,22.37,6.2
mentalillness,53232,7.59,2.1
bipolar,51112,7.28,2.0
schizophrenia,25365,3.61,1.0


*We have a class imbalance of approximately 10:7:6:2:2:1*

### Dropping invalid and null rows

In [6]:
# How many null values
print(f"Count null: \n{data.isna().sum()} \n")

# How many deleted or removed posts
print(f"Count of \"[deleted]\" in selftext: {data['selftext'][data['selftext'] == '[deleted]'].count()}")
print(f"Count of \"[removed]\" in selftext: {data['selftext'][data['selftext'] == '[removed]'].count()}")
print(f"Count of \"[deleted]\" in title: {data['selftext'][data['title'] == '[deleted]'].count()}")
print(f"Count of \"[removed]\" in title: {data['selftext'][data['title'] == '[removed]'].count()}")

Count null: 
title           46
selftext     33691
subreddit        0
dtype: int64 

Count of "[deleted]" in selftext: 9742
Count of "[removed]" in selftext: 86875
Count of "[deleted]" in title: 1
Count of "[removed]" in title: 0


In [7]:
# Remove posts with null values and posts that were removed or deleted 
data = data[data['selftext'] != '[removed]']
data = data[data['selftext'] != '[deleted]']
data = data[data['title'] != '[deleted]']
data = data.dropna()

In [8]:
# How many null values after dropping rows
print(f"Count null: {data.isna().sum().sum()}")

# How many deleted or removed posts after dropping rows
print(f"Count of \"[deleted]\" and \"[removed]\" in selftext and title: {data['selftext'][data['selftext'] == '[deleted]'].count() + data['selftext'][data['selftext'] == '[removed]'].count() + data['selftext'][data['title'] == '[deleted]'].count() + data['selftext'][data['title'] == '[removed]'].count()}")

Count null: 0
Count of "[deleted]" and "[removed]" in selftext and title: 0


In [9]:
# Checking for "[removed]" or "[deleted]" in the title and selftext when the 
# title and selftext also contains other text (i.e. not the whole post deleted or removed)
print("Number of posts containing \"[removed]\" or \"[deleted]\":", 
      data['selftext'][data['selftext'].str.contains('\[deleted|deleted\]|\[removed|removed\]')].count() +
      data['title'][data['title'].str.contains('\[deleted|deleted\]|\[removed|removed\]')].count() +
      data['subreddit'][data['subreddit'].str.contains('\[deleted|deleted\]|\[removed|removed\]')].count())

Number of posts containing "[removed]" or "[deleted]": 123


*123 rows is not many out of 580,000 rows so we just drop these rows.*

In [10]:
# Dropping rows containing "[removed]" or "[deleted]"
data = data[data['selftext'].str.contains("\[deleted|deleted\]|\[removed|removed\]") == False]
data = data[data['title'].str.contains("\[deleted|deleted\]|\[removed|removed\]") == False]

In [11]:
# Checking that those rows have been dropped
print("Number of posts containing \"[removed]\" or \"[deleted]\":", 
      data['selftext'][data['selftext'].str.contains('\[deleted|deleted\]|\[removed|removed\]')].count() +
      data['title'][data['title'].str.contains('\[deleted|deleted\]|\[removed|removed\]')].count() +
      data['subreddit'][data['subreddit'].str.contains('\[deleted|deleted\]|\[removed|removed\]')].count())

Number of posts containing "[removed]" or "[deleted]": 0


In [12]:
# Key info after dropping
print("\033[1mDataframe info after dropping:\033[0m")
print(data.info())

[1mDataframe info after dropping:[0m
<class 'pandas.core.frame.DataFrame'>
Int64Index: 571352 entries, 0 to 701786
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   title      571352 non-null  object
 1   selftext   571352 non-null  object
 2   subreddit  571352 non-null  object
dtypes: object(3)
memory usage: 17.4+ MB
None


In [13]:
# Investigate class imbalance after dropping
counts = data['subreddit'].value_counts()
ratios = data['subreddit'].value_counts(normalize=True)
print("\033[1mNumber of posts after dropping:\033[0m")
pd.DataFrame({"Number of posts": counts, "%": ratios*100, 
              "ratio": ratios/min(data['subreddit'].value_counts(normalize=True))}).round({'%': 2, 'ratio': 1})

[1mNumber of posts after dropping:[0m


Unnamed: 0,Number of posts,%,ratio
BPD,205136,35.9,17.6
Anxiety,160570,28.1,13.8
depression,120990,21.18,10.4
mentalillness,37436,6.55,3.2
bipolar,35589,6.23,3.1
schizophrenia,11631,2.04,1.0


## Format

In [14]:
# Concatenate title and selftext columns into one feature for classification 
data["post"] = data["title"] + " " + data["selftext"]
data = data[['post', 'subreddit']]
data.head()

Unnamed: 0,post,subreddit
0,Life is so pointless without others Does anyon...,BPD
1,Cold rage? Hello fellow friends üòÑ\n\nI'm on th...,BPD
2,I don‚Äôt know who I am My [F20] bf [M20] told m...,BPD
3,"HELP! Opinions! Advice! Okay, I‚Äôm about to ope...",BPD
5,My ex got diagnosed with BPD Without going int...,BPD


In [15]:
# Lowercase the Anxiety subreddit label (for cute team member who was annoyed by the capital letter)
data['subreddit'][data['subreddit'] == 'Anxiety'] = 'anxiety'

## Splitting & Undersampling

### Splitting

In [16]:
# Splitting the data using stratified sampling (data is imbalanced) (20/80 split)
train, test = train_test_split(data, test_size = 0.2, random_state = 42, stratify=data['subreddit'])

In [17]:
# See stratified sampling distributions
print("\033[1mNumber of posts in data:\033[0m")
print(data['subreddit'].value_counts())
print("\n\033[1mNumber of posts in train:\033[0m")
print(train['subreddit'].value_counts())
print("\n\033[1mNumber of posts in test:\033[0m")
print(test['subreddit'].value_counts())

[1mNumber of posts in data:[0m
BPD              205136
anxiety          160570
depression       120990
mentalillness     37436
bipolar           35589
schizophrenia     11631
Name: subreddit, dtype: int64

[1mNumber of posts in train:[0m
BPD              164108
anxiety          128456
depression        96792
mentalillness     29949
bipolar           28471
schizophrenia      9305
Name: subreddit, dtype: int64

[1mNumber of posts in test:[0m
BPD              41028
anxiety          32114
depression       24198
mentalillness     7487
bipolar           7118
schizophrenia     2326
Name: subreddit, dtype: int64


### Undersampling

In [18]:
# Set undersampling ratio (5:3.5:3:2:2:1) {original ratio 10:7:6:2:2:1}
len_min_class = train['subreddit'][train['subreddit'] == 'schizophrenia'].value_counts()[0]
rus_ratio = {'BPD': int(len_min_class*5),
              'anxiety': int(len_min_class*3.5),
              'depression': int(len_min_class*3),
              'mentalillness': int(len_min_class*2),
              'bipolar': int(len_min_class*2),
              'schizophrenia': int(len_min_class*1)}

# Instatiate under-sampler
rus = RandomUnderSampler(sampling_strategy = rus_ratio, random_state=42)

# Resample the training set only
X_train, y_train = rus.fit_resample(train[['post']], train['subreddit'])

In [19]:
# Store the text data for the train set
org_train = pd.concat([X_train, y_train], axis=1)

# Test set into X and y, and store the text data for the test set
X_test = test[['post']]
y_test = test['subreddit']
org_test = pd.concat([X_test, y_test], axis=1)

In [20]:
# Total number of rows after undersampling
print("\n\033[1mTotal number of posts/observations in train set after undersampling:\033[0m")
print(len(y_train))

# Investigate class imbalance after undersampling
counts = y_train.value_counts()
ratios = y_train.value_counts(normalize=True)
print("\n\033[1mNumber of posts in train set after undersampling:\033[0m")
pd.DataFrame({"Number of posts": counts, "%": ratios*100, 
              "ratio": ratios/min(y_train.value_counts(normalize=True))}).round({'%': 2, 'ratio': 1})


[1mTotal number of posts/observations in train set after undersampling:[0m
153532

[1mNumber of posts in train set after undersampling:[0m


Unnamed: 0,Number of posts,%,ratio
BPD,46525,30.3,5.0
anxiety,32567,21.21,3.5
depression,27915,18.18,3.0
bipolar,18610,12.12,2.0
mentalillness,18610,12.12,2.0
schizophrenia,9305,6.06,1.0


## Pre-processing

### Pre-processing Functions

In [21]:
# Function to call in CountVectorizer:
    # preprocessor = None (will happen inside tokenizer)
    # stop_words = None (will happen inside tokenizer)
    # tokenizer = preprocess()

# Define preprocessing function for CountVectorizer 
    # (mask URLs, tokenize, remove stop words, lemmatize, stem, lowercase)
def preprocess(text):
    stemmer = SnowballStemmer('english')
    stop_words = set(stopwords.words('english'))
    preprocessed_post = []
    
    # Mask URLs
    url = re.compile(r'(http?://|www\.)\S+')
    text = url.sub('[url]', text)
    
    # Iterate through tokens and POS tags (word_tokenize also removes white spaces and line breaks)
    for token, tag in nltk.pos_tag(gensim.utils.simple_preprocess(text)):
        pos=tag[0].lower()
        
        # Set POS tag if not in list
        if pos not in ['a', 'r', 'n', 'v']:
            pos='n'
        
        # Check if token is a stop word
        if token not in stop_words:
            
            # Lowercase, lemmatize and stem, then append to output list
            preprocessed_post.append(stemmer.stem(WordNetLemmatizer().lemmatize(token.lower(), pos=pos)))
    
    return preprocessed_post

In [22]:
# Define preprocessing function for Word2Vec 
    #(mask URLs, tokenize, lowercase, remove stopwords)
def preprocess_w2v(text):
    stop_words = set(stopwords.words('english'))
    preprocessed_post = []
    
    # Mask URLs
    url = re.compile(r'(http?://|www\.)\S+')
    text = url.sub('[url]', text)
    
    # Iterate through tokens and POS tags (word_tokenize also removes white spaces and line breaks)
    for token in gensim.utils.simple_preprocess(text):
        
        # Append token to output list if token is not a stop word
        if token not in stop_words:
            preprocessed_post.append(token)
    
    return preprocessed_post

### Preprocess & Vectorize - CountVec

In [23]:
# Instatiate vectorizer and TF-IDF transformer
vectorizer = CountVectorizer(decode_error = 'ignore', preprocessor = None, 
                             stop_words = None, tokenizer = preprocess, token_pattern=None)
transformer = TfidfTransformer()

In [24]:
%%time
# Fit and transform train set
X_train_sparse = vectorizer.fit_transform(X_train['post'])
X_train_sparse = transformer.fit_transform(X_train_sparse)
X_train_sparse

CPU times: user 18min 25s, sys: 8.64 s, total: 18min 34s
Wall time: 18min 34s


<153532x56705 sparse matrix of type '<class 'numpy.float64'>'
	with 9137670 stored elements in Compressed Sparse Row format>

In [25]:
# Transform test set
X_test_sparse = vectorizer.transform(X_test['post'])
X_test_sparse = transformer.transform(X_test_sparse)
X_test_sparse

<114271x56705 sparse matrix of type '<class 'numpy.float64'>'
	with 6877860 stored elements in Compressed Sparse Row format>

In [26]:
# Verify that second dimension of train and test match
X_train_sparse.shape[1] == X_test_sparse.shape[1]

True

In [27]:
# Store the y values
y_train_sparse = y_train
y_test_sparse = y_test

### Preprocess & Vectorize - Word2Vec

In [28]:
# Define function to get word2vec vectors
def get_word_vectors(tokens, model):
    vectors = []
    for token in tokens:
        try:
            vector = model[token]
            vectors.append(vector)
        except KeyError:
            continue
    return np.array(vectors)

# Define function to check vector list dimensions
# Will return dim of first elements in each level of nested list
def dim(a):
    if (type(a) != list) and (type(a) != np.ndarray):
        return []
    return [len(a)] + dim(a[0])

In [29]:
# New dataframes for w2v
w2v_train = org_train.copy()

# Load pre-trained word2vec model (trained on google news dataset with ~100 billion words)
w2v_model = api.load('word2vec-google-news-300')

#### Train set

In [30]:
# Apply w2v preprocessing to each post's text and add to new column in dataframe 
w2v_train['tokenized'] = w2v_train['post'].apply(preprocess_w2v)
w2v_train

Unnamed: 0,post,subreddit,tokenized
0,Just wanted to drop a note telling you I care‚Ä¶...,BPD,"[wanted, drop, note, telling, care, wanted, te..."
1,Do you guys ever regret or hesitate disclosing...,BPD,"[guys, ever, regret, hesitate, disclosing, bpd..."
2,DAE feel like a hermit? I have BPD and I often...,BPD,"[dae, feel, like, hermit, bpd, often, want, ho..."
3,my FP pushed me away feels like I'd rather bea...,BPD,"[fp, pushed, away, feels, like, rather, beaten..."
4,Feeling empowered with self hate Because I kno...,BPD,"[feeling, empowered, self, hate, know, hate, a..."
...,...,...,...
153527,Wanted to share my plan weight loss strategy t...,schizophrenia,"[wanted, share, plan, weight, loss, strategy, ..."
153528,Felt lonely and made a server with a few frien...,schizophrenia,"[felt, lonely, made, server, friends, talk, pe..."
153529,"how I figured schizophrenia out So, 10 years a...",schizophrenia,"[figured, schizophrenia, years, ago, mom, divo..."
153530,It's my 31st B-day tomorrow. SO far I've given...,schizophrenia,"[st, day, tomorrow, far, given, opiates, quit,..."


In [31]:
# Get tokens as a list of strings
w2v_tokens = w2v_train['tokenized'].tolist()

In [32]:
%%time
# Iterate function over each post to get a list of vector arrays
w2v_token_vectors = [get_word_vectors(toks, w2v_model) for toks in w2v_tokens]

# Check dimensions - should be 3D with second dim == len(1st post) and last dimension == 300
dim(w2v_token_vectors)

CPU times: user 23.3 s, sys: 4.04 s, total: 27.3 s
Wall time: 34.8 s


[153532, 56, 300]

In [33]:
# Get the mean vector for each post (average of the token vectors for each post)
w2v_post_vectors = [np.mean(token_vec, axis=0) for token_vec in w2v_token_vectors]
    # Gives warning due to getting mean of empty vectors (‚àµ no words recognized by w2v, e.g. posts in chinese)

# Check dimensions - should be 2D with last dimension == 300 (averaging removes a dimension)
dim(w2v_post_vectors)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


[153532, 300]

In [34]:
# Add dense vector embeddings to dataframe
w2v_train['vector'] = w2v_post_vectors
w2v_train

Unnamed: 0,post,subreddit,tokenized,vector
0,Just wanted to drop a note telling you I care‚Ä¶...,BPD,"[wanted, drop, note, telling, care, wanted, te...","[0.040843215, 0.049962725, 0.0064185006, 0.084..."
1,Do you guys ever regret or hesitate disclosing...,BPD,"[guys, ever, regret, hesitate, disclosing, bpd...","[0.046438448, 0.033510163, 0.043753274, 0.0814..."
2,DAE feel like a hermit? I have BPD and I often...,BPD,"[dae, feel, like, hermit, bpd, often, want, ho...","[0.070028685, 0.024488831, -0.0045175552, 0.09..."
3,my FP pushed me away feels like I'd rather bea...,BPD,"[fp, pushed, away, feels, like, rather, beaten...","[-0.0135599775, 0.08199056, 0.07342699, 0.0509..."
4,Feeling empowered with self hate Because I kno...,BPD,"[feeling, empowered, self, hate, know, hate, a...","[0.06837972, 0.028429667, 0.059834797, 0.07486..."
...,...,...,...,...
153527,Wanted to share my plan weight loss strategy t...,schizophrenia,"[wanted, share, plan, weight, loss, strategy, ...","[0.008502463, 0.07149074, -0.017547939, 0.1418..."
153528,Felt lonely and made a server with a few frien...,schizophrenia,"[felt, lonely, made, server, friends, talk, pe...","[0.024809647, 0.02319336, -0.035194397, 0.0661..."
153529,"how I figured schizophrenia out So, 10 years a...",schizophrenia,"[figured, schizophrenia, years, ago, mom, divo...","[0.0016873488, 0.066772856, -0.03414416, 0.087..."
153530,It's my 31st B-day tomorrow. SO far I've given...,schizophrenia,"[st, day, tomorrow, far, given, opiates, quit,...","[0.048358917, 0.016300201, 0.01368475, 0.04881..."


In [35]:
# Checking for the empty vectors that produced warning
w2v_train[w2v_train['vector'].isna()].sample(10)

Unnamed: 0,post,subreddit,tokenized,vector
148902,||;;:|\\||!; ‚Ä¢‚Ä¢‚Ä¢---‚Ä¢‚Ä¢‚Ä¢,schizophrenia,[],
138287,7463819273636787717263669 646278-84847891‚Äê1290...,mentalillness,[],
145941,·ã∞·àõ·â∏·ãç ·ä†·çç·àµ·à∑·àç·ç¢ ·àà·ãµ·à≠·åä·â∂·âΩ·ãé ·ã®·ãò·àã·àà·àù ·à•·âÉ·ã≠ ·ã≠·à∞·àõ·ãé·â≥·àç·ç¢ ·â∞·å†·âÇ·ãé·âπ ·ãà·ã∞...,schizophrenia,"[·ã∞·àõ·â∏·ãç, ·ä†·çç·àµ·à∑·àç, ·àà·ãµ·à≠·åä·â∂·âΩ·ãé, ·ã®·ãò·àã·àà·àù, ·à•·âÉ·ã≠, ·ã≠·à∞·àõ·ãé·â≥·àç, ·â∞·å†·âÇ...",
140242,IamlivinginyourwallsIamlivinginyourwallsIamliv...,mentalillness,[],
122100,AAAAAAAAAAAAAAAAAAAA #AAAAAAAAAAAAAAAAAAAAAAAA...,depression,[],
128690,Êàë‰ΩèÂú®‰Ω†ÁöÑÁâÜË£° Êàë‰ΩèÂú®‰Ω†ÁöÑÂú∞Êùø‰∏ä Êàë‰ΩèÂú®‰Ω†ÁöÑÂ∫äÂ∫ï‰∏ã Êàë‰ΩèÂú®‰Ω†ÁöÑÈñ£Ê®ìË£° ‰æÜÊâæÊàë ‰æÜÊâæÊàë ‰æÜÊâæÊàë...,mentalillness,"[Êàë‰ΩèÂú®‰Ω†ÁöÑÁâÜË£°, Êàë‰ΩèÂú®‰Ω†ÁöÑÂú∞Êùø‰∏ä, Êàë‰ΩèÂú®‰Ω†ÁöÑÂ∫äÂ∫ï‰∏ã, Êàë‰ΩèÂú®‰Ω†ÁöÑÈñ£Ê®ìË£°, ‰æÜÊâæÊàë, ‰æÜ...",
149186,I‡Ω≤'m‡Ω≤ i‡Ω≤n‡Ω≤ y‡Ω≤o‡Ω≤u‡Ω≤r‡Ω≤ w‡Ω≤a‡Ω≤l‡Ω≤l‡Ω≤s‡Ω≤ I‡Ω≤'m‡Ω≤ i‡Ω≤n‡Ω≤ y‡Ω≤o‡Ω≤...,schizophrenia,[],
147699,a_free_white_horse Ôº¥Ôº°Ôº¨Ôº´ Ôº¥Ôº°Ôº¨Ôº´ Ôº¥Ôº°Ôº¨Ôº´\n\n‚ìê‚ìë‚ìû‚ì§‚ì£ ‚ìò‚ì£\...,schizophrenia,"[ÔΩîÔΩÅÔΩåÔΩã, ÔΩîÔΩÅÔΩåÔΩã, ÔΩîÔΩÅÔΩåÔΩã, neomaya]",
132644,√ÑM≈íG√ú≈†???????? WHEN THE,mentalillness,[√§m≈ìg√º≈°],
22908,What's pwBPD? What's pwBPD?,BPD,"[pwbpd, pwbpd]",


In [36]:
# Drop posts with empty w2v vector representations
w2v_train = w2v_train.dropna(subset=['vector'])

# Check if dropped
w2v_train.isna().sum()

post         0
subreddit    0
tokenized    0
vector       0
dtype: int64

In [37]:
# Store dense vector embeddings as numpy array
X_train_dense = np.array(w2v_train['vector'].tolist())

# Store the y values
y_train_dense = w2v_train['subreddit']

In [38]:
w2v_train

Unnamed: 0,post,subreddit,tokenized,vector
0,Just wanted to drop a note telling you I care‚Ä¶...,BPD,"[wanted, drop, note, telling, care, wanted, te...","[0.040843215, 0.049962725, 0.0064185006, 0.084..."
1,Do you guys ever regret or hesitate disclosing...,BPD,"[guys, ever, regret, hesitate, disclosing, bpd...","[0.046438448, 0.033510163, 0.043753274, 0.0814..."
2,DAE feel like a hermit? I have BPD and I often...,BPD,"[dae, feel, like, hermit, bpd, often, want, ho...","[0.070028685, 0.024488831, -0.0045175552, 0.09..."
3,my FP pushed me away feels like I'd rather bea...,BPD,"[fp, pushed, away, feels, like, rather, beaten...","[-0.0135599775, 0.08199056, 0.07342699, 0.0509..."
4,Feeling empowered with self hate Because I kno...,BPD,"[feeling, empowered, self, hate, know, hate, a...","[0.06837972, 0.028429667, 0.059834797, 0.07486..."
...,...,...,...,...
153527,Wanted to share my plan weight loss strategy t...,schizophrenia,"[wanted, share, plan, weight, loss, strategy, ...","[0.008502463, 0.07149074, -0.017547939, 0.1418..."
153528,Felt lonely and made a server with a few frien...,schizophrenia,"[felt, lonely, made, server, friends, talk, pe...","[0.024809647, 0.02319336, -0.035194397, 0.0661..."
153529,"how I figured schizophrenia out So, 10 years a...",schizophrenia,"[figured, schizophrenia, years, ago, mom, divo...","[0.0016873488, 0.066772856, -0.03414416, 0.087..."
153530,It's my 31st B-day tomorrow. SO far I've given...,schizophrenia,"[st, day, tomorrow, far, given, opiates, quit,...","[0.048358917, 0.016300201, 0.01368475, 0.04881..."


#### Test set

In [39]:
# Repeat dense w2v embedding for test set
w2v_test = org_test.copy()
w2v_test['tokenized'] = w2v_test['post'].apply(preprocess_w2v)

w2v_tokens = w2v_test['tokenized'].tolist() #overwrites list for train set
w2v_token_vectors = [get_word_vectors(toks, w2v_model) for toks in w2v_tokens] #overwrites list for train set
w2v_post_vectors = [np.mean(token_vec, axis=0) for token_vec in w2v_token_vectors] #overwrites list for train set

w2v_test['vector'] = w2v_post_vectors
w2v_test = w2v_test.dropna(subset=['vector'])
print(w2v_train.isna().sum().sum())

X_test_dense = np.array(w2v_test['vector'].tolist())
y_test_dense = w2v_test['subreddit']

w2v_test

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


0


Unnamed: 0,post,subreddit,tokenized,vector
5547,Lost My FP - How do I move on with my life? Ab...,BPD,"[lost, fp, move, life, two, months, ago, horri...","[0.048078947, 0.049309973, 0.013398189, 0.0657..."
694157,Any tips on stopping Negative Thought Spirals?...,mentalillness,"[tips, stopping, negative, thought, spirals, u...","[0.012358166, 0.083301365, 0.039498467, 0.0498..."
24095,I procrastinate sleeping in my bed. Hello -- I...,BPD,"[procrastinate, sleeping, bed, hello, sure, pa...","[0.03284032, 0.0360697, -0.020166585, 0.111350..."
580687,Can Anxiety Go Away Will my Anxiety go away if...,anxiety,"[anxiety, go, away, anxiety, go, away, caught,...","[0.051815636, 0.06778275, -0.03916369, 0.10600..."
492521,Hot sweaty palms I have a problem where my han...,anxiety,"[hot, sweaty, palms, problem, hands, get, swea...","[0.055758916, 0.03955738, -0.022751266, 0.0956..."
...,...,...,...,...
344317,I wish I had a personality around others I hav...,depression,"[wish, personality, around, others, things, fi...","[0.061346635, 0.042177945, 0.016557112, 0.1414..."
496047,I've been feeling like I'm going to have an an...,anxiety,"[feeling, like, going, anxiety, attack, second...","[0.10407967, 0.0719615, -0.06305537, 0.0552810..."
62633,How do I handle my friend with BPD? We are fri...,BPD,"[handle, friend, bpd, friends, nearly, ten, ye...","[0.037718367, 0.059844818, -0.0048405863, 0.09..."
341060,I‚Äôm about to check in on my ex I‚Äôm so ready to...,depression,"[check, ex, ready, bury, tonight, seen, ex, fo...","[0.021391585, 0.056926005, -0.008232529, 0.081..."


## Pickle

In [40]:
# Check all X and y and splits are stored correctly
print(f"{type(X_train_sparse) = }; {X_train_sparse.shape = }")
print(f"{type(y_train_sparse) = }; {y_train_sparse.shape = }")
print()
print(f"{type(X_test_sparse) = }; {X_test_sparse.shape = }")
print(f"{type(y_test_sparse) = }; {y_test_sparse.shape = }")
print()
print(f"{type(org_train) = }; {org_train.shape = }")
print(f"{type(org_test) = }; {org_test.shape = }")

type(X_train_sparse) = <class 'scipy.sparse.csr.csr_matrix'>; X_train_sparse.shape = (153532, 56705)
type(y_train_sparse) = <class 'pandas.core.series.Series'>; y_train_sparse.shape = (153532,)

type(X_test_sparse) = <class 'scipy.sparse.csr.csr_matrix'>; X_test_sparse.shape = (114271, 56705)
type(y_test_sparse) = <class 'pandas.core.series.Series'>; y_test_sparse.shape = (114271,)

type(org_train) = <class 'pandas.core.frame.DataFrame'>; org_train.shape = (153532, 2)
type(org_test) = <class 'pandas.core.frame.DataFrame'>; org_test.shape = (114271, 2)


In [41]:
print(f"{type(X_train_dense) = }; {X_train_dense.shape = }")
print(f"{type(y_train_dense) = }; {y_train_dense.shape = }")
print()
print(f"{type(X_test_dense) = }; {X_test_dense.shape = }")
print(f"{type(y_test_dense) = }; {y_test_dense.shape = }")
print()
print(f"{type(w2v_train) = }; {w2v_train.shape = }")
print(f"{type(w2v_test) = }; {w2v_test.shape = }")

type(X_train_dense) = <class 'numpy.ndarray'>; X_train_dense.shape = (153502, 300)
type(y_train_dense) = <class 'pandas.core.series.Series'>; y_train_dense.shape = (153502,)

type(X_test_dense) = <class 'numpy.ndarray'>; X_test_dense.shape = (114256, 300)
type(y_test_dense) = <class 'pandas.core.series.Series'>; y_test_dense.shape = (114256,)

type(w2v_train) = <class 'pandas.core.frame.DataFrame'>; w2v_train.shape = (153502, 4)
type(w2v_test) = <class 'pandas.core.frame.DataFrame'>; w2v_test.shape = (114256, 4)


In [42]:
# Pickle the sparse vectors and y vals
pickle.dump((X_train_sparse, X_test_sparse, y_train_sparse,
            y_test_sparse, org_train, org_test), open('pickles/sparse.pkl', 'wb'))

# Pickle the vectorizer and transformer
pickle.dump((vectorizer, transformer), open('pickles/vectorizer.pkl', 'wb'))

In [43]:
# Pickle the dense vectors and y vals
pickle.dump((X_train_dense, X_test_dense, y_train_dense,
            y_test_dense, w2v_train, w2v_test), open('pickles/dense.pkl', 'wb'))

In [44]:
# Load sparse
X_train_sparse, X_test_sparse, y_train_sparse, y_test_sparse, org_train, org_test = pd.read_pickle("pickles/sparse.pkl")

# Load the vectorizer and transformer
vectorizer, transformer = pd.read_pickle("pickles/vectorizer.pkl")

In [45]:
# Load dense
X_train_dense, X_test_dense, y_train_dense, y_test_dense, w2v_train, w2v_test = pd.read_pickle("pickles/dense.pkl")