In [31]:
import pandas as pd

from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 

In [32]:
file = "../raw_data/quotes.csv"
quotes = pd.read_csv(file, decimal=",")
quotes.head(3)

Unnamed: 0,quote,author,category
0,"I'm selfish, impatient and a little insecure. ...",Marilyn Monroe,"attributed-no-source, best, life, love, mistak..."
1,You've gotta dance like there's nobody watchin...,William W. Purkey,"dance, heaven, hurt, inspirational, life, love..."
2,You know you're in love when you can't fall as...,Dr. Seuss,"attributed-no-source, dreams, love, reality, s..."


In [39]:
quotes.shape


(499709, 3)

In [45]:
quotes["quote"]

0         I'm selfish, impatient and a little insecure. ...
1         You've gotta dance like there's nobody watchin...
2         You know you're in love when you can't fall as...
3         A friend is someone who knows all about you an...
4         Darkness cannot drive out darkness: only light...
                                ...                        
499704    I do believe the most important thing I can do...
499705    I'd say I'm a bit antimadridista although I do...
499706                                   The future is now.
499707    In all my life and in the future, I will alway...
499708      The future is as bright as the promises of God.
Name: quote, Length: 499709, dtype: object

In [47]:
quotes['quote'] = quotes["quote"].astype('str')

### Data Cleaning


In [48]:
from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 

def clean (text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('english')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    return lemmatized



In [49]:
# Apply to all quotes
quotes['clean_quotes'] = quotes.quote.apply(clean)


In [53]:
quotes["clean_quotes"] = quotes["clean_quotes"].astype('str')

In [54]:
quotes["clean_quotes"]

0         ['selfish', 'impatient', 'little', 'insecure',...
1         ['got', 'ta', 'dance', 'like', 'nobody', 'watc...
2         ['know', 'love', 'fall', 'asleep', 'reality', ...
3            ['friend', 'someone', 'know', 'still', 'love']
4         ['darkness', 'drive', 'darkness', 'light', 'ha...
                                ...                        
499704    ['believe', 'important', 'thing', 'help', 'you...
499705    ['say', 'bit', 'antimadridista', 'although', '...
499706                                           ['future']
499707    ['life', 'future', 'always', 'faithful', 'loya...
499708               ['future', 'bright', 'promise', 'god']
Name: clean_quotes, Length: 499709, dtype: object

### Latent Dirichlet Allocation model

In [69]:
#Train an LDA model to extract potential topics.

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

data_vectorized = vectorizer.fit_transform(quotes['clean_quotes'])

lda_model = LatentDirichletAllocation(n_components=5)

lda_vectors = lda_model.fit_transform(data_vectorized)


In [57]:
lda_model.components_

array([[24.84701682,  6.39591812,  0.50194978, ...,  0.50403234,
         0.50317524,  0.52154638],
       [10.15298318,  0.60408188,  1.49805022, ...,  1.49596766,
         1.49682475,  2.47845362]])

In [70]:
#function to print potential topics

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        

print_topics(lda_model, vectorizer)

Topic 0:




[('like', 29882.191088209598), ('know', 22798.17746946296), ('said', 22061.22797446991), ('get', 18451.050082623293), ('would', 17262.68789726628), ('want', 16627.84613922117), ('one', 14612.866313438763), ('say', 14486.818064221916), ('think', 14447.881970374992), ('time', 13640.231956193033)]
Topic 1:
[('love', 52671.204452057944), ('life', 48549.03256128378), ('one', 28180.57874740819), ('thing', 25249.007702776722), ('god', 25029.04770806585), ('never', 16753.044346147068), ('make', 16632.29248432716), ('time', 15801.693745430932), ('know', 15622.73078421346), ('people', 14978.6522497944)]
Topic 2:
[('people', 29761.479071480728), ('one', 9836.047653654647), ('world', 8338.110937271962), ('make', 7757.670741665258), ('human', 7281.362039390888), ('think', 7266.280308760845), ('right', 6150.1592636973155), ('problem', 5866.5126777233245), ('society', 5836.689424590782), ('money', 5787.881581774315)]
Topic 3:
[('like', 21126.150138647325), ('light', 10054.661800125366), ('one', 9335.

In [72]:
#Test the model
example =["I am so sad and I want to cry."]

example_vectorized = vectorizer.transform(example)

lda_vectors = lda_model.transform(example_vectorized)

print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])
print("topic 2 :", lda_vectors[0][2])
print("topic 3 :", lda_vectors[0][3])
print("topic 4 :", lda_vectors[0][4])

topic 0 : 0.645584306306476
topic 1 : 0.03401577570857187
topic 2 : 0.03344023144359796
topic 3 : 0.2528053131374051
topic 4 : 0.03415437340394907


### Vectorizer Tuning: Bag of words & TfidfVectorizer & MultinomialNB

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Create Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Set parameters to search
parameters = {
    'tfidf__ngram_range': ((1,1), (2,2)),
    'tfidf__min_df': (0.05,0.1),
    'tfidf__max_df': (0.75,1),
    'nb__alpha': (0.01,0.1,1,10),}

# Perform grid search on pipeline
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, 
                           verbose=1, scoring = "accuracy", 
                           refit=True, cv=5)

grid_search.fit(data.clean_reviews,data.target)

In [90]:
#Bag of words for top 100 features
vectorizer = CountVectorizer(stop_words="english",max_features=100)

X1 = vectorizer.fit_transform(quotes["clean_quotes"])
vectorizer.get_feature_names()

['away',
 'bad',
 'beautiful',
 'believe',
 'best',
 'better',
 'body',
 'book',
 'change',
 'child',
 'come',
 'day',
 'death',
 'different',
 'dream',
 'end',
 'experience',
 'eye',
 'face',
 'fact',
 'fear',
 'feel',
 'feeling',
 'friend',
 'future',
 'god',
 'going',
 'good',
 'got',
 'great',
 'hand',
 'happy',
 'hard',
 'head',
 'heart',
 'help',
 'home',
 'hope',
 'human',
 'idea',
 'important',
 'kind',
 'know',
 'learn',
 'let',
 'life',
 'light',
 'like',
 'little',
 'live',
 'living',
 'long',
 'look',
 'lot',
 'love',
 'make',
 'man',
 'matter',
 'mean',
 'men',
 'mind',
 'moment',
 'nature',
 'need',
 'new',
 'night',
 'old',
 'pain',
 'past',
 'people',
 'person',
 'place',
 'power',
 'real',
 'really',
 'reason',
 'right',
 'said',
 'say',
 'self',
 'sense',
 'soul',
 'story',
 'tell',
 'thing',
 'think',
 'thought',
 'time',
 'true',
 'truth',
 'try',
 'understand',
 'want',
 'wanted',
 'way',
 'woman',
 'word',
 'work',
 'world',
 'year']

In [98]:
#TfidfVectorizer for 100 features
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf_vectorizer = TfidfVectorizer(max_features = 100,stop_words="english")

X2 = tf_idf_vectorizer.fit_transform(quotes["clean_quotes"])


In [99]:
tf_idf_vectorizer.get_feature_names()

['away',
 'bad',
 'beautiful',
 'believe',
 'best',
 'better',
 'body',
 'book',
 'change',
 'child',
 'come',
 'day',
 'death',
 'different',
 'dream',
 'end',
 'experience',
 'eye',
 'face',
 'fact',
 'fear',
 'feel',
 'feeling',
 'friend',
 'future',
 'god',
 'going',
 'good',
 'got',
 'great',
 'hand',
 'happy',
 'hard',
 'head',
 'heart',
 'help',
 'home',
 'hope',
 'human',
 'idea',
 'important',
 'kind',
 'know',
 'learn',
 'let',
 'life',
 'light',
 'like',
 'little',
 'live',
 'living',
 'long',
 'look',
 'lot',
 'love',
 'make',
 'man',
 'matter',
 'mean',
 'men',
 'mind',
 'moment',
 'nature',
 'need',
 'new',
 'night',
 'old',
 'pain',
 'past',
 'people',
 'person',
 'place',
 'power',
 'real',
 'really',
 'reason',
 'right',
 'said',
 'say',
 'self',
 'sense',
 'soul',
 'story',
 'tell',
 'thing',
 'think',
 'thought',
 'time',
 'true',
 'truth',
 'try',
 'understand',
 'want',
 'wanted',
 'way',
 'woman',
 'word',
 'work',
 'world',
 'year']

In [102]:
#TfidfVectorizer for 100 features of combination words

tf_idf_vectorizer = TfidfVectorizer(max_features = 100,ngram_range=(2,2))

X3 = tf_idf_vectorizer.fit_transform(quotes["clean_quotes"])


In [103]:
tf_idf_vectorizer.get_feature_names()

['another person',
 'anyone else',
 'anything else',
 'best friend',
 'best way',
 'come back',
 'could never',
 'could see',
 'day day',
 'even know',
 'even though',
 'every day',
 'every man',
 'every moment',
 'every single',
 'every time',
 'everyone else',
 'everything else',
 'fall love',
 'feel like',
 'felt like',
 'find way',
 'first time',
 'go back',
 'god love',
 'gon na',
 'good bad',
 'good thing',
 'hard work',
 'high school',
 'human being',
 'human life',
 'important thing',
 'jesus christ',
 'know know',
 'let go',
 'life life',
 'life like',
 'life one',
 'life without',
 'little bit',
 'live life',
 'long time',
 'look back',
 'look like',
 'lot people',
 'love life',
 'love love',
 'make difference',
 'make feel',
 'make life',
 'make sense',
 'make sure',
 'many people',
 'many thing',
 'many time',
 'men woman',
 'never know',
 'new york',
 'nothing else',
 'one another',
 'one day',
 'one ever',
 'one know',
 'one life',
 'one must',
 'one one',
 'one person',
