In [3]:
import os
import json
import re

import pandas as pd
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kpaks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
documents_path = os.path.join('..', 'data', 'subtitles')
documents = os.listdir(documents_path)

texts = []
titles = []
for document in documents:
    try:
        doc_path = os.path.join('..', 'data', 'subtitles', document)

        with open(doc_path, 'r') as f:
            doc_json = json.load(f)

        # get sentences
        sents = pd.DataFrame(doc_json)['text'].to_numpy()

        # get the document and truncate empty spaces
        doc = ' '.join(sents)
        doc = re.sub('\s+', ' ', doc)
        
        texts.append(doc)
        titles.append(document[:-5])

    except Exception as e:
        print(document, e)

Motivation_and_Goals__Part_3 Expecting value: line 1 column 1 (char 0)
Motivation_and_Goals__Part_4 Expecting value: line 1 column 1 (char 0)
Psychiatrist_Reacts Expecting value: line 1 column 1 (char 0)
Therapist_Reacts Expecting value: line 1 column 1 (char 0)
Therapist_Talks Expecting value: line 1 column 1 (char 0)


In [8]:
porter_stemmer = PorterStemmer()
stop_words = stopwords.words('english') + ['like', 'right', '__', '_connector_']

In [9]:
def preproc(text):
    text = text.lower()
    text = re.sub("\\s+(in|the|all|for|and|on)\\s+", " _connector_ ", text)  # normalize certain words

    # stem words
    words = re.split("\\s+", text)
    stemmed_words = [porter_stemmer.stem(word=word) for word in words if word not in stop_words]
    return ' '.join(stemmed_words)

In [10]:
pipe = Pipeline([('count', CountVectorizer(stop_words=stop_words, preprocessor=preproc)),
                 ('tfidf', TfidfTransformer())])
pipe.fit(texts)

X = pipe.transform(texts)
feature_names = pipe['count'].get_feature_names()

In [11]:
t = "I have trouble speaking to other people and feel very awkward in social situations."
testrow = pipe.transform([t])

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(X, testrow)
np.argmax(similarities)

76

In [16]:
testdf = pd.DataFrame(testrow.T.todense(), index=feature_names, columns=['test'])
compdf = pd.DataFrame(X[76].T.todense(), index=feature_names, columns=['comp'])

In [17]:
compdf.sort_values('comp', ascending=False).iloc[:20]

Unnamed: 0,comp
go,0.33867
know,0.234203
appendix,0.226625
vestigi,0.226625
outsid,0.22459
bacteria,0.200821
book,0.179618
dune,0.173531
peopl,0.133174
get,0.12399


In [18]:
testdf.sort_values('test', ascending=False).iloc[:20]

Unnamed: 0,test
awkward,0.691623
troubl,0.465935
social,0.356064
speak,0.274024
feel,0.230036
peopl,0.2231
planner,0.0
planet,0.0
plane,0.0
pl,0.0
