In [1]:
import pandas as pd

from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 

In [19]:
file = "../raw_data/quotes - reduced.csv"
quotes = pd.read_csv(file, decimal=",")
quotes.head(3)

Unnamed: 0,quote,author,category
0,"I'm selfish, impatient and a little insecure. ...",Marilyn Monroe,"attributed-no-source, best, life, love, mistak..."
1,You've gotta dance like there's nobody watchin...,William W. Purkey,"dance, heaven, hurt, inspirational, life, love..."
2,You know you're in love when you can't fall as...,Dr. Seuss,"attributed-no-source, dreams, love, reality, s..."


In [20]:
quotes.shape


(156656, 3)

In [21]:
quotes["quote"]

0         I'm selfish, impatient and a little insecure. ...
1         You've gotta dance like there's nobody watchin...
2         You know you're in love when you can't fall as...
3         A friend is someone who knows all about you an...
4         Darkness cannot drive out darkness: only light...
                                ...                        
156651    The harassed look is that of a desperately tir...
156652    …In this way that he sought to control the ver...
156653    No matter how we choose to live, we both die a...
156654    The goal that you hope you will one day arrive...
156655    I've spent years living safely to secure a lon...
Name: quote, Length: 156656, dtype: object

In [22]:
quotes['quote'] = quotes["quote"].astype('str')

### Data Cleaning


In [23]:
from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 

def clean (text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('english')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    return lemmatized



In [24]:
# Apply to all quotes
quotes['clean_quotes'] = quotes.quote.apply(clean)


In [25]:
quotes["clean_quotes"] = quotes["clean_quotes"].astype('str')

In [26]:
quotes["clean_quotes"]

0         ['selfish', 'impatient', 'little', 'insecure',...
1         ['got', 'ta', 'dance', 'like', 'nobody', 'watc...
2         ['know', 'love', 'fall', 'asleep', 'reality', ...
3            ['friend', 'someone', 'know', 'still', 'love']
4         ['darkness', 'drive', 'darkness', 'light', 'ha...
                                ...                        
156651    ['harassed', 'look', 'desperately', 'tired', '...
156652    ['way', 'sought', 'control', 'passage', 'life'...
156653           ['matter', 'choose', 'live', 'die', 'end']
156654    ['goal', 'hope', 'one', 'day', 'arrive', 'long...
156655    ['spent', 'year', 'living', 'safely', 'secure'...
Name: clean_quotes, Length: 156656, dtype: object

### Latent Dirichlet Allocation model

In [27]:
#Train an LDA model to extract potential topics.

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words="english")

data_vectorized = vectorizer.fit_transform(quotes['clean_quotes'])

lda_model = LatentDirichletAllocation(n_components=5)

lda_vectors = lda_model.fit_transform(data_vectorized)


In [28]:
lda_model.components_

array([[3.59326327, 0.34699743, 0.20003217, ..., 0.20000486, 1.19712089,
        1.68206954],
       [0.21407794, 1.99162189, 1.19897809, ..., 0.20483451, 0.20485768,
        0.20384361],
       [5.7849797 , 0.20000675, 1.17963265, ..., 0.20000273, 0.20703014,
        0.20395634],
       [0.203582  , 0.20000668, 0.2047127 , ..., 2.19515438, 1.19066664,
        1.7093748 ],
       [0.20409709, 0.26136724, 0.21664439, ..., 0.20000352, 0.20032465,
        0.20075571]])

In [31]:
#function to print potential topics

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        

print_topics(lda_model, vectorizer)

Topic 0:
[('woman', 6991.357279013261), ('said', 3647.1041550022883), ('like', 3038.973763734751), ('men', 2984.1012339801123), ('child', 2310.7438039529115), ('man', 2073.4589737791985), ('sex', 1854.4062632514404), ('year', 1540.188378276173), ('girl', 1500.8982335913943), ('mother', 1398.6305062257554)]
Topic 1:
[('life', 11247.243609455589), ('people', 2977.664212865179), ('happiness', 2776.1126657758114), ('mind', 2715.010856449198), ('time', 2654.0115951983703), ('great', 2400.455789573284), ('make', 2388.05899421583), ('work', 2333.671322852402), ('war', 2217.676207473787), ('success', 2216.191595933482)]
Topic 2:
[('life', 11048.858126884956), ('thing', 10315.308848430703), ('time', 10100.664794006188), ('people', 9953.869019604164), ('know', 9940.737753334213), ('want', 9125.284671022135), ('like', 7962.043647068456), ('make', 7063.428218425889), ('think', 7050.802070918503), ('book', 5806.13743735092)]
Topic 3:
[('love', 14380.654440410828), ('god', 13663.864864538524), ('wor

In [39]:
#Test the model
example =["I am so sad and I want to cry."]

example_vectorized = vectorizer.transform(example)

lda_vectors_cry = lda_model.transform(example_vectorized)

print("topic 0 :", lda_vectors_cry[0][0])
print("topic 1 :", lda_vectors_cry[0][1])
print("topic 2 :", lda_vectors_cry[0][2])
print("topic 3 :", lda_vectors_cry[0][3])
print("topic 4 :", lda_vectors_cry[0][4])

topic 0 : 0.06676896484288439
topic 1 : 0.06667520503362068
topic 2 : 0.7326275579747601
topic 3 : 0.06681882686100665
topic 4 : 0.06710944528772816


In [37]:
def best_topic(text):
    #text=[str(text)]
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words="english")
    ev=vectorizer.transform(text)
    lda_ev=lda_model.transform(ev)
    topic_score={}
    for i in range(5):
        topic_score.update({f"topic{i}": lda_ev[0][i]})
    return topic_score

        
        
        

In [38]:
text =["I am so sad and I want to cry."]
best_topic(text)

NotFittedError: Vocabulary not fitted or provided

### Vectorizer Tuning: Bag of words & TfidfVectorizer & MultinomialNB

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Create Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Set parameters to search
parameters = {
    'tfidf__ngram_range': ((1,1), (2,2)),
    'tfidf__min_df': (0.05,0.1),
    'tfidf__max_df': (0.75,1),
    'nb__alpha': (0.01,0.1,1,10),}

# Perform grid search on pipeline
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, 
                           verbose=1, scoring = "accuracy", 
                           refit=True, cv=5)

grid_search.fit(data.clean_reviews,data.target)

In [None]:
#Bag of words for top 100 features
vectorizer = CountVectorizer(stop_words="english",max_features=100)

X1 = vectorizer.fit_transform(quotes["clean_quotes"])
vectorizer.get_feature_names()

In [None]:
#TfidfVectorizer for 100 features
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf_vectorizer = TfidfVectorizer(max_features = 100,stop_words="english")

X2 = tf_idf_vectorizer.fit_transform(quotes["clean_quotes"])


In [None]:
tf_idf_vectorizer.get_feature_names()

In [None]:
#TfidfVectorizer for 100 features of combination words

tf_idf_vectorizer = TfidfVectorizer(max_features = 100,ngram_range=(2,2))

X3 = tf_idf_vectorizer.fit_transform(quotes["clean_quotes"])


In [None]:
tf_idf_vectorizer.get_feature_names()