# Text Mining  pipeline

### let's import a few free-open source tools to our convenience 

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize.punkt import PunktSentenceTokenizer as PST
import string

# nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

### Read data

In [2]:
import pandas as pd
data = pd.read_json("stupidstuff.json")
data[0:10]

Unnamed: 0,body,category,id,rating
0,A blackjack dealer and a player with a thirtee...,Children,1,2.63
1,"At a dinner party, several of the guests were...",Blonde Jokes,2,2.57
2,One day this cop pulls over a blonde for spee...,Blonde Jokes,3,3.09
3,Three women are about to be executed for crim...,Blonde Jokes,4,4.1
4,A girl came skipping home FROM school one day...,Blonde Jokes,5,4.3
5,An airline captain was helping a new blonde f...,Military,6,3.23
6,A blonde and a brunette decided to rob a bank...,Blonde Jokes,7,4.0
7,"A brunette, a redhead and a blonde walk into ...",Blonde Jokes,8,2.33
8,A blonde suspects her boyfriend of cheating o...,Blonde Jokes,9,3.77
9,One day a blonde comes out of the tanning sal...,Blonde Jokes,10,3.64


In [7]:
data['body'] = data['body'].str.strip()

### Feature eng.

- number of words
- number of characters
- number of sentences
- number of punctuations
- number of all CAPS-LOCK words
- ratio of all CAPS-LOCK words to total words
- number of stopwords
- mean length of words
- number of syntax errors
- number of new-lines? /isn't sentence
- specific punctuations: ?, !
- repetition of words? / BoW-TfIdf

In [8]:
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [9]:
data['body_no_punct'] = data['body'].apply(remove_punctuation)
data.head()

Unnamed: 0,body,category,id,rating,body_no_punct,words_count
0,A blackjack dealer and a player with a thirtee...,Children,1,2.63,A blackjack dealer and a player with a thirtee...,102
1,"At a dinner party, several of the guests were ...",Blonde Jokes,2,2.57,At a dinner party several of the guests were a...,80
2,One day this cop pulls over a blonde for speed...,Blonde Jokes,3,3.09,One day this cop pulls over a blonde for speed...,47
3,Three women are about to be executed for crime...,Blonde Jokes,4,4.1,Three women are about to be executed for crime...,144
4,A girl came skipping home FROM school one day....,Blonde Jokes,5,4.3,A girl came skipping home FROM school one day ...,185


In [10]:
def words_counter(text):
    words = nltk.word_tokenize(text)
    return len(words)
data['words_count'] = data['body_no_punct'].apply(words_counter)
data.head()

Unnamed: 0,body,category,id,rating,body_no_punct,words_count
0,A blackjack dealer and a player with a thirtee...,Children,1,2.63,A blackjack dealer and a player with a thirtee...,102
1,"At a dinner party, several of the guests were ...",Blonde Jokes,2,2.57,At a dinner party several of the guests were a...,80
2,One day this cop pulls over a blonde for speed...,Blonde Jokes,3,3.09,One day this cop pulls over a blonde for speed...,47
3,Three women are about to be executed for crime...,Blonde Jokes,4,4.1,Three women are about to be executed for crime...,144
4,A girl came skipping home FROM school one day....,Blonde Jokes,5,4.3,A girl came skipping home FROM school one day ...,185


In [11]:
def characters_counter(text):
    return len(text)
data['characters_count'] = data['body'].apply(characters_counter)
data.head()

Unnamed: 0,body,category,id,rating,body_no_punct,words_count,characters_count
0,A blackjack dealer and a player with a thirtee...,Children,1,2.63,A blackjack dealer and a player with a thirtee...,102,539
1,"At a dinner party, several of the guests were ...",Blonde Jokes,2,2.57,At a dinner party several of the guests were a...,80,449
2,One day this cop pulls over a blonde for speed...,Blonde Jokes,3,3.09,One day this cop pulls over a blonde for speed...,47,225
3,Three women are about to be executed for crime...,Blonde Jokes,4,4.1,Three women are about to be executed for crime...,144,898
4,A girl came skipping home FROM school one day....,Blonde Jokes,5,4.3,A girl came skipping home FROM school one day ...,185,998


In [12]:
def sentences_counter(text, pst): 
    sentences = [sentence for sentence in pst.sentences_from_text(text, False) if not sentence in string.punctuation]
    return len(sentences)

pst = PST()
data['sentences_count'] = data['body'].apply(sentences_counter, args = (pst,))
data.head()

Unnamed: 0,body,category,id,rating,body_no_punct,words_count,characters_count,sentences_count
0,A blackjack dealer and a player with a thirtee...,Children,1,2.63,A blackjack dealer and a player with a thirtee...,102,539,8
1,"At a dinner party, several of the guests were ...",Blonde Jokes,2,2.57,At a dinner party several of the guests were a...,80,449,8
2,One day this cop pulls over a blonde for speed...,Blonde Jokes,3,3.09,One day this cop pulls over a blonde for speed...,47,225,5
3,Three women are about to be executed for crime...,Blonde Jokes,4,4.1,Three women are about to be executed for crime...,144,898,19
4,A girl came skipping home FROM school one day....,Blonde Jokes,5,4.3,A girl came skipping home FROM school one day ...,185,998,19


In [13]:
def punct_counter(text):
    puncts = [c for c in text if c in string.punctuation]
    return len(puncts)

data['punct_count'] = data['body'].apply(punct_counter)
data.head()

Unnamed: 0,body,category,id,rating,body_no_punct,words_count,characters_count,sentences_count,punct_count
0,A blackjack dealer and a player with a thirtee...,Children,1,2.63,A blackjack dealer and a player with a thirtee...,102,539,8,33
1,"At a dinner party, several of the guests were ...",Blonde Jokes,2,2.57,At a dinner party several of the guests were a...,80,449,8,33
2,One day this cop pulls over a blonde for speed...,Blonde Jokes,3,3.09,One day this cop pulls over a blonde for speed...,47,225,5,8
3,Three women are about to be executed for crime...,Blonde Jokes,4,4.1,Three women are about to be executed for crime...,144,898,19,64
4,A girl came skipping home FROM school one day....,Blonde Jokes,5,4.3,A girl came skipping home FROM school one day ...,185,998,19,102


In [16]:
def all_caps_counter(text):
    all_caps_words = [word for word in nltk.word_tokenize(text) if (word.isupper() and len(remove_punctuation(word)) > 1 and not bool(re.search('(24:00|2[0-3]:[0-5][0-9]|[0-1][0-9]:[0-5][0-9])', word)))]
    return len(all_caps_words)
data['all_caps_count'] = data['body_no_punct'].apply(all_caps_counter)
data.head()

Unnamed: 0,body,category,id,rating,body_no_punct,words_count,characters_count,sentences_count,punct_count,all_caps_count,stopwords_count
0,A blackjack dealer and a player with a thirtee...,Children,1,2.63,A blackjack dealer and a player with a thirtee...,102,539,8,33,0,41
1,"At a dinner party, several of the guests were ...",Blonde Jokes,2,2.57,At a dinner party several of the guests were a...,80,449,8,33,0,27
2,One day this cop pulls over a blonde for speed...,Blonde Jokes,3,3.09,One day this cop pulls over a blonde for speed...,47,225,5,8,0,20
3,Three women are about to be executed for crime...,Blonde Jokes,4,4.1,Three women are about to be executed for crime...,144,898,19,64,0,51
4,A girl came skipping home FROM school one day....,Blonde Jokes,5,4.3,A girl came skipping home FROM school one day ...,185,998,19,102,3,59


In [17]:
def stopwords_counter(text):
    stops = [word for word in nltk.word_tokenize(text) if word in stopwords]
    return len(stops)
data['stopwords_count'] = data['body'].apply(stopwords_counter)
data.head()

Unnamed: 0,body,category,id,rating,body_no_punct,words_count,characters_count,sentences_count,punct_count,all_caps_count,stopwords_count
0,A blackjack dealer and a player with a thirtee...,Children,1,2.63,A blackjack dealer and a player with a thirtee...,102,539,8,33,0,47
1,"At a dinner party, several of the guests were ...",Blonde Jokes,2,2.57,At a dinner party several of the guests were a...,80,449,8,33,0,29
2,One day this cop pulls over a blonde for speed...,Blonde Jokes,3,3.09,One day this cop pulls over a blonde for speed...,47,225,5,8,0,21
3,Three women are about to be executed for crime...,Blonde Jokes,4,4.1,Three women are about to be executed for crime...,144,898,19,64,0,56
4,A girl came skipping home FROM school one day....,Blonde Jokes,5,4.3,A girl came skipping home FROM school one day ...,185,998,19,102,3,64


In [19]:
def mean_length(text):
    text = nltk.word_tokenize(text)
    return (sum( map(len, text) ) / len(text))

data['mean_len_word'] = data['body_no_punct'].apply(lambda x: mean_length(x))
data.head()

Unnamed: 0,body,category,id,rating,body_no_punct,words_count,characters_count,sentences_count,punct_count,all_caps_count,stopwords_count,mean_len_word
0,A blackjack dealer and a player with a thirtee...,Children,1,2.63,A blackjack dealer and a player with a thirtee...,102,539,8,33,0,47,3.872549
1,"At a dinner party, several of the guests were ...",Blonde Jokes,2,2.57,At a dinner party several of the guests were a...,80,449,8,33,0,29,4.175
2,One day this cop pulls over a blonde for speed...,Blonde Jokes,3,3.09,One day this cop pulls over a blonde for speed...,47,225,5,8,0,21,3.638298
3,Three women are about to be executed for crime...,Blonde Jokes,4,4.1,Three women are about to be executed for crime...,144,898,19,64,0,56,4.673611
4,A girl came skipping home FROM school one day....,Blonde Jokes,5,4.3,A girl came skipping home FROM school one day ...,185,998,19,102,3,64,3.789189


In [10]:
#def dot_counter(text): 
#    return text.count(".")
#data['dot_count'] = data['body'].apply(dot_counter)

In [21]:
def comma_counter(text): 
    return text.count(",")
data['comma_count'] = data['body'].apply(comma_counter)
data.head()

Unnamed: 0,body,category,id,rating,body_no_punct,words_count,characters_count,sentences_count,punct_count,all_caps_count,stopwords_count,mean_len_word,comma_count
0,A blackjack dealer and a player with a thirtee...,Children,1,2.63,A blackjack dealer and a player with a thirtee...,102,539,8,33,0,47,3.872549,11
1,"At a dinner party, several of the guests were ...",Blonde Jokes,2,2.57,At a dinner party several of the guests were a...,80,449,8,33,0,29,4.175,7
2,One day this cop pulls over a blonde for speed...,Blonde Jokes,3,3.09,One day this cop pulls over a blonde for speed...,47,225,5,8,0,21,3.638298,0
3,Three women are about to be executed for crime...,Blonde Jokes,4,4.1,Three women are about to be executed for crime...,144,898,19,64,0,56,4.673611,16
4,A girl came skipping home FROM school one day....,Blonde Jokes,5,4.3,A girl came skipping home FROM school one day ...,185,998,19,102,3,64,3.789189,43


In [22]:
def qmark_counter(text): 
    return text.count("?")
data['qmark_count'] = data['body'].apply(qmark_counter)
data.head()

Unnamed: 0,body,category,id,rating,body_no_punct,words_count,characters_count,sentences_count,punct_count,all_caps_count,stopwords_count,mean_len_word,comma_count,qmark_count
0,A blackjack dealer and a player with a thirtee...,Children,1,2.63,A blackjack dealer and a player with a thirtee...,102,539,8,33,0,47,3.872549,11,2
1,"At a dinner party, several of the guests were ...",Blonde Jokes,2,2.57,At a dinner party several of the guests were a...,80,449,8,33,0,29,4.175,7,0
2,One day this cop pulls over a blonde for speed...,Blonde Jokes,3,3.09,One day this cop pulls over a blonde for speed...,47,225,5,8,0,21,3.638298,0,0
3,Three women are about to be executed for crime...,Blonde Jokes,4,4.1,Three women are about to be executed for crime...,144,898,19,64,0,56,4.673611,16,0
4,A girl came skipping home FROM school one day....,Blonde Jokes,5,4.3,A girl came skipping home FROM school one day ...,185,998,19,102,3,64,3.789189,43,5


In [23]:
def excmark_counter(text): 
    return text.count("!")
data['excmark_count'] = data['body'].apply(excmark_counter)
data.head()

Unnamed: 0,body,category,id,rating,body_no_punct,words_count,characters_count,sentences_count,punct_count,all_caps_count,stopwords_count,mean_len_word,comma_count,qmark_count,excmark_count
0,A blackjack dealer and a player with a thirtee...,Children,1,2.63,A blackjack dealer and a player with a thirtee...,102,539,8,33,0,47,3.872549,11,2,0
1,"At a dinner party, several of the guests were ...",Blonde Jokes,2,2.57,At a dinner party several of the guests were a...,80,449,8,33,0,29,4.175,7,0,1
2,One day this cop pulls over a blonde for speed...,Blonde Jokes,3,3.09,One day this cop pulls over a blonde for speed...,47,225,5,8,0,21,3.638298,0,0,0
3,Three women are about to be executed for crime...,Blonde Jokes,4,4.1,Three women are about to be executed for crime...,144,898,19,64,0,56,4.673611,16,0,6
4,A girl came skipping home FROM school one day....,Blonde Jokes,5,4.3,A girl came skipping home FROM school one day ...,185,998,19,102,3,64,3.789189,43,5,3


In [27]:
def stopwords_cleaner(text):
    words = nltk.word_tokenize(text)
    text = [word.lower() for word in words if word not in stopwords]
    return text

data['text_prepared'] = data['body_no_punct'].apply(stopwords_cleaner)
data.head()

Unnamed: 0,body,category,id,rating,body_no_punct,words_count,characters_count,sentences_count,punct_count,all_caps_count,stopwords_count,mean_len_word,comma_count,qmark_count,excmark_count,text_prepared
0,A blackjack dealer and a player with a thirtee...,Children,1,2.63,A blackjack dealer and a player with a thirtee...,102,539,8,33,0,47,3.872549,11,2,0,"[a, blackjack, dealer, player, thirteen, count..."
1,"At a dinner party, several of the guests were ...",Blonde Jokes,2,2.57,At a dinner party several of the guests were a...,80,449,8,33,0,29,4.175,7,0,1,"[at, dinner, party, several, guests, arguing, ..."
2,One day this cop pulls over a blonde for speed...,Blonde Jokes,3,3.09,One day this cop pulls over a blonde for speed...,47,225,5,8,0,21,3.638298,0,0,0,"[one, day, cop, pulls, blonde, speeding, the, ..."
3,Three women are about to be executed for crime...,Blonde Jokes,4,4.1,Three women are about to be executed for crime...,144,898,19,64,0,56,4.673611,16,0,6,"[three, women, executed, crimes, ones, brunett..."
4,A girl came skipping home FROM school one day....,Blonde Jokes,5,4.3,A girl came skipping home FROM school one day ...,185,998,19,102,3,64,3.789189,43,5,3,"[a, girl, came, skipping, home, from, school, ..."


In [15]:
def stem_text(text):
    text = [ps.stem(word) for word in text]
    return text

Bow = CountVectorizer(analyzer=stem_text)
X_Bow = Bow.fit_transform(data['text_prepared'])

In [16]:
X_Data = data.copy()

In [17]:
X_Data = pd.get_dummies(X_Data, columns=['category'])

In [18]:
#list(X_Data.columns.values)

In [19]:
del X_Data['body']
del X_Data['id']
del X_Data['text_prepared']

In [25]:
X_features_ = pd.concat([X_Data, pd.DataFrame(X_Bow.toarray())], axis=1)
X_features_.head()

Unnamed: 0,rating,words_count,characters_count,sentences_count,punct_count,capitals_count,stopwords_counter,mean_len_word,comma_count,qmark_count,...,16913,16914,16915,16916,16917,16918,16919,16920,16921,16922
0,2.63,102,541,8,33,7,181,3.244444,11,2,...,0,0,0,0,0,0,0,0,0,0
1,2.57,80,451,8,33,0,145,3.429907,7,0,...,0,0,0,0,0,0,0,0,0,0
2,3.09,47,226,4,8,0,80,3.442308,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4.1,162,901,31,64,-9,301,3.596154,16,0,...,0,0,0,0,0,0,0,0,0,0
4,4.3,185,999,19,102,30,314,2.912587,43,5,...,0,0,0,0,0,0,0,0,0,0


In [None]:
for i, j in X_features_.iterrows():
    t = 0
    try:
        t = X_features_[j][i]
    except:
        print(i, j)
    if (type(t) is not 'numpy.float64' and type(t) is not 'numpy.int64'):
        print(i, j)
        print(type(t))
        print(t)
        break

### ML TIME!

In [26]:
from sklearn.linear_model import Lasso
#from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_features_, data['rating'], test_size=0.3)

In [28]:
param_grid = { 
    'alpha': [0.05, 0.1, 0.2],
    'fit_intercept': [True, False],
    'normalize' : [True, False],
    'max_iter' :[500, 1000],
    #'class_weight': [{1: 10, 0:1}, {1: 9, 0:1}]
}

In [29]:
model=Lasso()
CV_model = GridSearchCV(estimator=model, param_grid=param_grid, cv= 5, scoring= 'neg_mean_absolute_error', n_jobs=-1)
CV_model.fit(X_train, y_train)
CV_model.best_params_

{'alpha': 0.05, 'fit_intercept': False, 'max_iter': 500, 'normalize': True}

In [33]:
model1=Lasso(alpha=CV_model.best_params_['alpha'],
                fit_intercept= CV_model.best_params_['fit_intercept'],
                normalize=CV_model.best_params_['normalize'],
                max_iter=CV_model.best_params_['max_iter'])

In [36]:
model1.fit(X_train, y_train)
pred=model1.predict(X_test)

In [37]:
mean_absolute_error(y_test,pred)

0.0162604961161306

## Eval train set as well

In [41]:
X_Subset_ = X_train[0::17]
Y_Subset_ = y_train[0::17]

pred_sub=model1.predict(X_Subset_)
mean_absolute_error(Y_Subset_,pred_sub)

0.01632060395828205

### Feature Evaluation TBD

In [None]:
bins = np.linspace(0,50,40) #y amount of samples, x len of each
pp.hist(data['body_len'], bins)
pp.show()

In [None]:
for i in [1,2,3]:
    bins = np.linspace(0,50/(i**i),40)
    pp.hist((data[data['label'] == 'Valid']['body_len'])**(1/i), bins, label = 'VALID', alpha = 0.5)
    pp.hist((data[data['label'] == 'Spam']['body_len'])**(1/i), bins, label = 'SPAM', alpha = 0.5)
    pp.legend(loc='upper left')
    pp.title('transformation 1/{}'.format(str(i)))
    pp.show()