In [1]:
#Installing the necessary libraries using pip
!pip install scikit-learn pandas regex nltk pyspellchecker textblob



In [2]:
#Importing all the installed libraries
import nltk
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from sklearn.model_selection import train_test_split
import lisaCleaningFunctions as lcf #created by: Lisa Juckett
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import glob
import re
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import string
from nltk.corpus import stopwords


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\2148295\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\2148295\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\2148295\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\2148295\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
#Cleaning the data, which includes:
#Removing records where the value of response is Nan(Not a number) or -99
#Removing Records where the value of the UCLA Scale is Nan
#Removing stopwords
#Lemmatizing the text
#Most of the cleaning is done using functions created for cleaning by Lisa Juckett. (generate_exl.py)
df = pd.read_excel("Whatisloneliness060321.xlsx")
df = df[df["Q9_Loneliness_Meaning_Qual"] != -99]
df = df[df["Q9_Loneliness_Meaning_Qual"].notna()]
df = df[df["UCLA3itemscale"].notna()]
_input = df["Q9_Loneliness_Meaning_Qual"]
_input = _input.apply(lcf.remove_stopwords)
_input = _input.apply(lcf.clean_text)
_target = df["UCLA3itemscale"]

In [4]:
#Splitting the dataset into training and testing (75%-25%)
_input_train, _input_test, _target_train, _target_test = train_test_split(_input,_target, test_size = 0.25)
print(_input_train.shape, _input_test.shape, _target_train.shape, _target_test.shape )

(27714,) (9238,) (27714,) (9238,)


TF-IDF (Term Frequency - Inverse Document Frequency) is used which helps us understand the value of a term in a document relative to the large corpus of document. It is calculated by calculating the how many times the word appeared in the document times how many documents have the word. Example: The word "pizza" is mentioned 30 times in a document which contains 100 words and out of the corpus of 10,000 documents, 352 documents have the word "pizza". The TF-IDF calculated would be: TF => Term Frequency = 30/100 = 0.3 IDF => log(10,000/352) = 1.45 TF-IDF = 0.3* 1.45 = 0.44 This allows us to get the most meaningful words from the document

In [5]:
#The TfidVectorizer also aid in forming bigrams and trigrams
vectorizer = TfidfVectorizer(lowercase=True,max_df=0.8,min_df=5,ngram_range=(1,3),stop_words="english")
vectors = vectorizer.fit_transform(_input_train)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
all_keywords =[]
for description in denselist:
    x=0
    keywords = []
    for word in description:
        if word>0:
            keywords.append(feature_names[x])
        x+=1
    all_keywords.append(keywords)
print(all_keywords)



[['happily', 'isnt', 'loneliness', 'loneliness spend', 'loneliness spend time', 'loneliness want', 'loneliness want share', 'prefer', 'prefer spend', 'share', 'share things', 'spend', 'spend time', 'theres', 'things', 'things theres', 'time', 'time prefer', 'want', 'want share', 'want share things'], ['meditation'], ['begin', 'day', 'day day', 'health', 'health issue', 'impact', 'isolation', 'issue', 'loneliness', 'mental', 'mental health', 'mental health issue'], ['drag', 'friends', 'good', 'good friends', 'look', 'morals', 'pick', 'sound', 'unless', 'wont'], ['care', 'care live', 'care live die', 'die', 'failure', 'feeling', 'feeling look', 'feeling world', 'irrelevant', 'life', 'life feeling', 'like', 'like care', 'live', 'live die', 'look', 'look world', 'outside', 'outsider', 'window', 'world', 'world live', 'world outside'], ['burdensome', 'feel', 'people', 'unconnected'], ['allow', 'brothers', 'child', 'child felt', 'education', 'fail', 'felt', 'fend', 'involve', 'know', 'leave'

In [None]:
keywordtxt = ''
for keywords in all_keywords:
    for words in keywords:
        for char in words:
            if char==" ":
                keywordtxt += '_'
            else:
                keywordtxt += char
        keywordtxt += ' '
    keywordtxt += '\n'
with open('keywords.txt','w') as f:
    f.write(keywordtxt)
f.close

In [6]:
#Storing the Training and Testing data in seperate CSV files including the processed Keywords for the Training data
input_train = pd.DataFrame()
input_train.insert(loc=0,column='Response',value=_input_train)
input_train.insert(loc=1,column='Keywords',value=all_keywords)
input_train.insert(loc=1,column='Target',value=_target_train)
input_train.to_csv('Training_Data.csv')
input_test = pd.DataFrame()
input_test.insert(loc=0,column='Response',value=_input_test)
input_test.insert(loc=1,column='Target',value=_target_test)
input_test.to_csv('Testing_Data.csv')