In [1]:
import pandas as pd

from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 

In [19]:
file = "../raw_data/quotes - reduced.csv"
quotes = pd.read_csv(file, decimal=",")
quotes.head(3)

Unnamed: 0,quote,author,category
0,"I'm selfish, impatient and a little insecure. ...",Marilyn Monroe,"attributed-no-source, best, life, love, mistak..."
1,You've gotta dance like there's nobody watchin...,William W. Purkey,"dance, heaven, hurt, inspirational, life, love..."
2,You know you're in love when you can't fall as...,Dr. Seuss,"attributed-no-source, dreams, love, reality, s..."


In [20]:
quotes.shape


(156656, 3)

In [21]:
quotes["quote"]

0         I'm selfish, impatient and a little insecure. ...
1         You've gotta dance like there's nobody watchin...
2         You know you're in love when you can't fall as...
3         A friend is someone who knows all about you an...
4         Darkness cannot drive out darkness: only light...
                                ...                        
156651    The harassed look is that of a desperately tir...
156652    …In this way that he sought to control the ver...
156653    No matter how we choose to live, we both die a...
156654    The goal that you hope you will one day arrive...
156655    I've spent years living safely to secure a lon...
Name: quote, Length: 156656, dtype: object

In [22]:
quotes['quote'] = quotes["quote"].astype('str')

### Data Cleaning


In [23]:
from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 

def clean (text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('english')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    return lemmatized



In [24]:
# Apply to all quotes
quotes['clean_quotes'] = quotes.quote.apply(clean)


In [25]:
quotes["clean_quotes"] = quotes["clean_quotes"].astype('str')

In [26]:
quotes["clean_quotes"]

0         ['selfish', 'impatient', 'little', 'insecure',...
1         ['got', 'ta', 'dance', 'like', 'nobody', 'watc...
2         ['know', 'love', 'fall', 'asleep', 'reality', ...
3            ['friend', 'someone', 'know', 'still', 'love']
4         ['darkness', 'drive', 'darkness', 'light', 'ha...
                                ...                        
156651    ['harassed', 'look', 'desperately', 'tired', '...
156652    ['way', 'sought', 'control', 'passage', 'life'...
156653           ['matter', 'choose', 'live', 'die', 'end']
156654    ['goal', 'hope', 'one', 'day', 'arrive', 'long...
156655    ['spent', 'year', 'living', 'safely', 'secure'...
Name: clean_quotes, Length: 156656, dtype: object

### Latent Dirichlet Allocation model

In [46]:
#Train an LDA model to extract potential topics.

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words="english")

data_vectorized = vectorizer.fit_transform(quotes['clean_quotes'])

lda_model = LatentDirichletAllocation(n_components=5)

lda_vectors = lda_model.fit_transform(data_vectorized)


In [47]:
lda_model.components_

array([[3.26934913, 0.2000015 , 0.2000144 , ..., 0.20000464, 0.38207064,
        2.1652035 ],
       [0.20226623, 0.20000299, 1.2002096 , ..., 0.21003663, 0.20167896,
        0.20183587],
       [0.20221164, 0.20000115, 0.20500404, ..., 2.18995069, 0.20447902,
        0.20349697],
       [0.2036303 , 2.19567188, 0.20875525, ..., 0.20000498, 0.20000436,
        0.20044245],
       [6.1225427 , 0.20432248, 1.18601671, ..., 0.20000306, 2.01176702,
        1.2290212 ]])

In [48]:
#function to print potential topics

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        

print_topics(lda_model, vectorizer)

Topic 0:
[('love', 6692.1484566651325), ('human', 4085.890593548118), ('woman', 3885.952276808731), ('child', 3506.8041703140343), ('power', 3374.864407323377), ('life', 3044.1699906290546), ('world', 2780.727443276623), ('men', 2468.782294241673), ('art', 2457.034690325855), ('society', 2416.997675195514)]
Topic 1:
[('god', 13376.75914986686), ('people', 3420.3867553278264), ('faith', 2883.3438170919126), ('soul', 2357.464295337891), ('fear', 2295.285242775864), ('good', 2231.5647285853893), ('spirit', 1555.790922796209), ('religion', 1541.5497796740992), ('man', 1466.950104143869), ('strength', 1464.8806100127663)]
Topic 2:
[('life', 21597.160154022604), ('thing', 11796.939994352055), ('people', 9950.381817744257), ('make', 8792.93519996877), ('know', 8033.253921103877), ('time', 7815.436633249509), ('love', 7256.0782651871195), ('world', 6998.12078374794), ('want', 5699.5193901754465), ('think', 5280.486228225768)]
Topic 3:
[('world', 4542.957175153552), ('heart', 4181.404732702302)

In [49]:
#Test the model
example =["I am so sad and I want to cry."]

example_vectorized = vectorizer.transform(example)

lda_vectors_cry = lda_model.transform(example_vectorized)

print("topic 0 :", lda_vectors_cry[0][0])
print("topic 1 :", lda_vectors_cry[0][1])
print("topic 2 :", lda_vectors_cry[0][2])
print("topic 3 :", lda_vectors_cry[0][3])
print("topic 4 :", lda_vectors_cry[0][4])

topic 0 : 0.06678087413720929
topic 1 : 0.06670819622840185
topic 2 : 0.7301792370785184
topic 3 : 0.06712943467643834
topic 4 : 0.06920225787943218


In [61]:
#function to returning proper topic for text
def best_topic(text):
    text=[text]
    ev=vectorizer.transform(text)
    lda_ev=lda_model.transform(ev)
    topic_score={}
    for i in range(5):
        topic_score.update({f"topic{i}": lda_ev[0][i]})
        a=max(topic_score.values())
    for key, value in topic_score.items():
        if a == value:
            name=(key,a)
    print(name)

In [63]:
text ="I am so sad and I want to cry"
best_topic(text)

('topic2', 0.7301792370785184)


In [64]:
text='I am so happy, I just want to dance and sing.'
best_topic(text)

('topic3', 0.4474368543715951)


In [65]:
text='Here you can see my house next to the Brandenburg Gate in Berlin.'
best_topic(text)

('topic3', 0.7982270809755947)


### Vectorizer Tuning: Bag of words & TfidfVectorizer & MultinomialNB

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Create Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Set parameters to search
parameters = {
    'tfidf__ngram_range': ((1,1), (2,2)),
    'tfidf__min_df': (0.05,0.1),
    'tfidf__max_df': (0.75,1),
    'nb__alpha': (0.01,0.1,1,10),}

# Perform grid search on pipeline
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, 
                           verbose=1, scoring = "accuracy", 
                           refit=True, cv=5)

grid_search.fit(data.clean_reviews,data.target)

In [66]:
#Bag of words for top 100 features
vectorizer = CountVectorizer(stop_words="english",max_features=100)

X1 = vectorizer.fit_transform(quotes["clean_quotes"])
vectorizer.get_feature_names()



['art',
 'away',
 'beautiful',
 'beauty',
 'believe',
 'best',
 'better',
 'body',
 'book',
 'change',
 'child',
 'come',
 'day',
 'death',
 'different',
 'dream',
 'earth',
 'end',
 'experience',
 'eye',
 'face',
 'fact',
 'faith',
 'fear',
 'feel',
 'feeling',
 'free',
 'friend',
 'god',
 'going',
 'good',
 'great',
 'hand',
 'happiness',
 'happy',
 'hard',
 'heart',
 'hope',
 'human',
 'idea',
 'joy',
 'kind',
 'know',
 'knowledge',
 'learn',
 'let',
 'life',
 'light',
 'like',
 'little',
 'live',
 'living',
 'long',
 'look',
 'love',
 'make',
 'man',
 'matter',
 'mean',
 'men',
 'mind',
 'moment',
 'nature',
 'need',
 'new',
 'night',
 'old',
 'pain',
 'peace',
 'people',
 'person',
 'place',
 'power',
 'read',
 'real',
 'reality',
 'really',
 'reason',
 'right',
 'said',
 'say',
 'self',
 'soul',
 'story',
 'tell',
 'thing',
 'think',
 'thought',
 'time',
 'true',
 'truth',
 'understand',
 'want',
 'war',
 'way',
 'woman',
 'word',
 'work',
 'world',
 'year']

In [67]:
#TfidfVectorizer for 100 features
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf_vectorizer = TfidfVectorizer(max_features = 100,stop_words="english")

X2 = tf_idf_vectorizer.fit_transform(quotes["clean_quotes"])


In [68]:
tf_idf_vectorizer.get_feature_names()

['art',
 'away',
 'beautiful',
 'beauty',
 'believe',
 'best',
 'better',
 'body',
 'book',
 'change',
 'child',
 'come',
 'day',
 'death',
 'different',
 'dream',
 'earth',
 'end',
 'experience',
 'eye',
 'face',
 'fact',
 'faith',
 'fear',
 'feel',
 'feeling',
 'free',
 'friend',
 'god',
 'going',
 'good',
 'great',
 'hand',
 'happiness',
 'happy',
 'hard',
 'heart',
 'hope',
 'human',
 'idea',
 'joy',
 'kind',
 'know',
 'knowledge',
 'learn',
 'let',
 'life',
 'light',
 'like',
 'little',
 'live',
 'living',
 'long',
 'look',
 'love',
 'make',
 'man',
 'matter',
 'mean',
 'men',
 'mind',
 'moment',
 'nature',
 'need',
 'new',
 'night',
 'old',
 'pain',
 'peace',
 'people',
 'person',
 'place',
 'power',
 'read',
 'real',
 'reality',
 'really',
 'reason',
 'right',
 'said',
 'say',
 'self',
 'soul',
 'story',
 'tell',
 'thing',
 'think',
 'thought',
 'time',
 'true',
 'truth',
 'understand',
 'want',
 'war',
 'way',
 'woman',
 'word',
 'work',
 'world',
 'year']

In [69]:
#TfidfVectorizer for 100 features of combination words

tf_idf_vectorizer = TfidfVectorizer(max_features = 100,ngram_range=(2,2))

X3 = tf_idf_vectorizer.fit_transform(quotes["clean_quotes"])


In [70]:
tf_idf_vectorizer.get_feature_names()

['another person',
 'anyone else',
 'anything else',
 'best friend',
 'best way',
 'change life',
 'change world',
 'come back',
 'could never',
 'could see',
 'day day',
 'even though',
 'every day',
 'every man',
 'every moment',
 'every single',
 'every time',
 'everyone else',
 'everything else',
 'fall love',
 'feel like',
 'felt like',
 'find way',
 'first time',
 'go back',
 'god love',
 'gon na',
 'good bad',
 'good thing',
 'hard work',
 'human being',
 'human life',
 'important thing',
 'jesus christ',
 'know know',
 'know love',
 'let go',
 'life life',
 'life like',
 'life love',
 'life one',
 'life without',
 'live life',
 'long time',
 'look back',
 'look like',
 'love god',
 'love life',
 'love like',
 'love love',
 'love one',
 'love someone',
 'make feel',
 'make life',
 'make sense',
 'make sure',
 'many people',
 'many thing',
 'many time',
 'men woman',
 'never forget',
 'never know',
 'nothing else',
 'one another',
 'one day',
 'one know',
 'one life',
 'one must'