In [69]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [70]:
part1 = """We are gathered here today on this joyous occasion to celebrate the special love that Monica and Chandler share. It is a love based on giving and receiving as well as having and sharing. And the love that they give and have is shared and received. And
through this having and giving and sharing and receiving, we too can share and love and have... and receive."""

part2 = """When I think of the love these two givers and receivers share I cannot help but envy the lifetime ahead of having and loving and giving and receiving."""

In [71]:
X_train = pd.DataFrame([part1, part2], columns=['speech'])

X_train

Unnamed: 0,speech
0,We are gathered here today on this joyous occa...
1,When I think of the love these two givers and ...


In [72]:
def preprocess_text(text):
    # Tokenise words while ignoring punctuation
    tokeniser = RegexpTokenizer(r'\w+')
    tokens = tokeniser.tokenize(text)
    
    # Lowercase and lemmatise 
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]
    
    # Remove stopwords
    keywords= [lemma for lemma in lemmas if lemma not in stopwords.words('english')]
    return keywords

In [75]:
X_train = pd.DataFrame.sparse.from_spmatrix(X_train)
col_map = {v:k for k, v in vectoriser.vocabulary_.items()}
for col in X_train.columns:
    X_train.rename(columns={col: col_map[col]}, inplace=True)
X_train

AttributeError: 'DataFrame' object has no attribute 'tocsc'

In [74]:
tokeniser = RegexpTokenizer(r'\w+')
tokens = tokeniser.tokenize(part1)
print(tokens)

['We', 'are', 'gathered', 'here', 'today', 'on', 'this', 'joyous', 'occasion', 'to', 'celebrate', 'the', 'special', 'love', 'that', 'Monica', 'and', 'Chandler', 'share', 'It', 'is', 'a', 'love', 'based', 'on', 'giving', 'and', 'receiving', 'as', 'well', 'as', 'having', 'and', 'sharing', 'And', 'the', 'love', 'that', 'they', 'give', 'and', 'have', 'is', 'shared', 'and', 'received', 'And', 'through', 'this', 'having', 'and', 'giving', 'and', 'sharing', 'and', 'receiving', 'we', 'too', 'can', 'share', 'and', 'love', 'and', 'have', 'and', 'receive']


In [50]:
lemmatiser = WordNetLemmatizer()
lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]
print(lemmas)

['we', 'be', 'gather', 'here', 'today', 'on', 'this', 'joyous', 'occasion', 'to', 'celebrate', 'the', 'special', 'love', 'that', 'monica', 'and', 'chandler', 'share', 'it', 'be', 'a', 'love', 'base', 'on', 'give', 'and', 'receive', 'as', 'well', 'as', 'have', 'and', 'share', 'and', 'the', 'love', 'that', 'they', 'give', 'and', 'have', 'be', 'share', 'and', 'receive', 'and', 'through', 'this', 'have', 'and', 'give', 'and', 'share', 'and', 'receive', 'we', 'too', 'can', 'share', 'and', 'love', 'and', 'have', 'and', 'receive']


In [51]:
print(len(stopwords.words('english')))
stopwords.words('english')[:5]

179


['i', 'me', 'my', 'myself', 'we']

In [52]:
keywords = [lemma for lemma in lemmas if lemma not in stopwords.words('english')]
print(keywords)

['gather', 'today', 'joyous', 'occasion', 'celebrate', 'special', 'love', 'monica', 'chandler', 'share', 'love', 'base', 'give', 'receive', 'well', 'share', 'love', 'give', 'share', 'receive', 'give', 'share', 'receive', 'share', 'love', 'receive']


In [53]:
{word: keywords.count(word) for word in set(keywords)}

{'monica': 1,
 'joyous': 1,
 'well': 1,
 'share': 5,
 'gather': 1,
 'occasion': 1,
 'chandler': 1,
 'receive': 4,
 'special': 1,
 'love': 4,
 'give': 3,
 'celebrate': 1,
 'today': 1,
 'base': 1}

In [57]:
vectoriser = CountVectorizer(analyzer=preprocess_text)
X_train = vectoriser.fit_transform(X_train['speech'])

In [58]:
# Convert sparse matrix to dataframe
X_train = pd.DataFrame.sparse.from_spmatrix(X_train)

# Save mapping on which index refers to which terms
col_map = {v:k for k, v in vectoriser.vocabulary_.items()}

# Rename each column using the mapping
for col in X_train.columns:
    X_train.rename(columns={col: col_map[col]}, inplace=True)
X_train

Unnamed: 0,ahead,base,cannot,celebrate,chandler,envy,gather,give,givers,help,...,monica,occasion,receive,receivers,share,special,think,today,two,well
0,0,1,0,1,1,0,1,3,0,0,...,1,1,4,0,5,1,0,1,0,1
1,1,0,1,0,0,1,0,1,1,1,...,0,0,1,1,1,0,1,0,1,0
