# Text Mining Pipeline

### Let's import a few free, open-source tools for our convenience

In [None]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize
import string

# The stop-word corpus is distributed separately from the nltk package; on a
# fresh environment the lookup raises LookupError until it has been fetched.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')

# Single Porter stemmer instance reused by the stemming step.
ps = nltk.PorterStemmer()

### Read data

In [None]:
import pandas as pd

# Load the raw records into a DataFrame and preview the first ten rows.
data = pd.read_json("stupidstuff.json")
data.head(10)

In [None]:
# Display one raw 'body' value (the entry at key 2) to eyeball the text.
data['body'][2]

### Feature engineering

- number of words
- number of characters
- number of sentences
- number of punctuation marks
- number of all CAPS-LOCK words
- ratio of all CAPS-LOCK words to total words
- number of stopwords
- mean length of words
- number of syntax errors
- number of newlines? (or is that just the sentence count?)
- counts of specific punctuation marks: ?, !
- repetition of words? (bag-of-words / TF-IDF)

In [None]:
def words_counter(text):
    """Return how many whitespace-separated tokens ``text`` contains."""
    tokens = text.split()
    return len(tokens)

# One word count per document body.
data['words_count'] = data['body'].map(words_counter)

In [None]:
def characters_counter(text):
    """Return the length of ``text`` in characters, whitespace included."""
    n_chars = len(text)
    return n_chars
# One character count per document body.
data['characters_count'] = data['body'].map(characters_counter)

In [None]:
# NOTE(review): the original author left an open question ("is it?") on this
# feature -- confirm that the tokenizer's sentence count is what is wanted.
def sentences_counter(text):
    """Return the number of sentences ``sent_tokenize`` finds in ``text``."""
    sentences = sent_tokenize(text)
    return len(sentences)
# One sentence count per document body.
data['sentences_count'] = data['body'].map(sentences_counter)

In [None]:
def count(l1, l2):
    """Return how many items of ``l1`` are members of ``l2``."""
    return sum(x in l2 for x in l1)


def punct_counter(text):
    """Return the number of ASCII punctuation characters in ``text``."""
    punctuation_marks = set(string.punctuation)
    return count(text, punctuation_marks)
# One punctuation count per document body.
data['punct_count'] = data['body'].map(punct_counter)

In [None]:
# NOTE(review): the original author left an open question ("is it?") here.
# This counts uppercase *characters*, whereas the feature list above talks
# about all-caps *words* -- confirm which one is intended.
def caps_counter(text):
    """Return the number of uppercase characters in ``text``."""
    return sum(char.isupper() for char in text)
# Uppercase characters per body, minus one per detected sentence (presumably
# to discount sentence-initial capitals -- confirm).
data['capitals_count'] = data['body'].map(caps_counter) - data['sentences_count']

In [None]:
def stopwords_counter(text, stop_words=None):
    """Return the number of stop words that occur in ``text``.

    Args:
        text: Raw document string.
        stop_words: Optional iterable of lowercase stop words. When omitted,
            the module-level ``stopwords`` list is used.

    Returns:
        The count of whitespace-separated tokens that are stop words once
        lowercased and stripped of surrounding punctuation.
    """
    vocabulary = set(stopwords if stop_words is None else stop_words)
    # Iterate over words, not over the string itself: looping over a str
    # yields single characters, so only one-letter stop words could match.
    tokens = (token.strip(string.punctuation).lower() for token in text.split())
    return sum(token in vocabulary for token in tokens)
# One stop-word count per document body.
data['stopwords_counter'] = data['body'].map(stopwords_counter)

In [None]:
def mean_length(text):
    """Return the mean length of the tokens in ``text``.

    Tokens are produced by ``nltk.word_tokenize``.

    Args:
        text: Raw document string.

    Returns:
        The average token length as a float, or ``0.0`` when ``text`` holds
        no tokens (empty or whitespace-only input), which would otherwise
        raise ZeroDivisionError.
    """
    if not text.strip():
        return 0.0
    tokens = nltk.word_tokenize(text)
    if not tokens:
        return 0.0
    return sum(map(len, tokens)) / len(tokens)

# Mean token length per document body.
data['mean_len_word'] = data['body'].map(mean_length)

In [None]:
def dot_counter(text):
    """Return how many "." characters appear in ``text``."""
    return sum(1 for char in text if char == ".")
# One full-stop count per document body.
data['dot_counter'] = data['body'].map(dot_counter)

In [None]:
def comma_counter(text):
    """Return how many "," characters appear in ``text``."""
    # Splitting on the comma yields one more piece than there are commas.
    pieces = text.split(",")
    return len(pieces) - 1
# One comma count per document body.
data['comma_counter'] = data['body'].map(comma_counter)

In [None]:
def stopwords_cleaner(text, stop_words=None):
    """Tokenize ``text`` and drop stop words.

    Args:
        text: Raw document string.
        stop_words: Optional iterable of lowercase stop words. When omitted,
            the module-level ``stopwords`` list is used.

    Returns:
        A list of lowercased tokens, in original order, with stop words and
        empty tokens removed.
    """
    vocabulary = set(stopwords if stop_words is None else stop_words)
    # Lowercase *before* the membership test so capitalised stop words such
    # as "The" are removed too. The raw-string pattern avoids the invalid
    # escape sequence warning that '\W+' triggers.
    lowered = (token.lower() for token in re.split(r'\W+', text))
    # re.split leaves '' at the edges when text starts or ends with a
    # non-word character; those are not real tokens.
    return [token for token in lowered if token and token not in vocabulary]

# Token list per document body, then a preview of the first five rows.
data['text_prepared'] = data['body'].map(stopwords_cleaner)
data.head(5)

In [None]:
def stem_text(text):
    """Return the ``ps.stem`` result for every token in ``text``, in order."""
    stems = []
    for token in text:
        stems.append(ps.stem(token))
    return stems

# Bag-of-words counts over stemmed tokens; stem_text is the analyzer, so it is
# called on each entry of data['text_prepared'].
Bow = CountVectorizer(analyzer=stem_text)
# The original called an undefined `tfidf_vect`; the vectorizer built above
# is `Bow`.
X_Bow = Bow.fit_transform(data['text_prepared'])

# Numeric columns created by the feature-engineering cells. The original
# referenced a 'body_len' column that no cell creates.
feature_columns = [
    'words_count', 'characters_count', 'sentences_count', 'punct_count',
    'capitals_count', 'stopwords_counter', 'mean_len_word',
    'dot_counter', 'comma_counter',
]
# Reset the index so rows line up positionally with the BoW matrix, whose
# DataFrame gets a fresh 0..n-1 index.
engineered = data[feature_columns].reset_index(drop=True)
bow_counts = pd.DataFrame(X_Bow.toarray())
# Give the BoW columns string names: recent scikit-learn versions reject
# frames whose column names mix str and int.
bow_counts.columns = ['bow_{}'.format(i) for i in range(bow_counts.shape[1])]

X_features = pd.concat([engineered, bow_counts], axis=1)
X_features.head()

### ML TIME!

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
# NOTE(review): the original statement stopped after `import`, which is a
# SyntaxError. The scorer helpers below are presumably what was intended for
# the grid search -- confirm and adjust.
from sklearn.metrics import f1_score, make_scorer

In [None]:
# Hold out 20% of the rows for testing. The seed makes the split repeatable
# between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, data['label'], test_size=0.2, random_state=42
)

In [None]:
# Hyperparameter candidates for the random-forest grid search.
param_grid = dict(
    n_estimators=[20, 50, 100, 200, 500],
    max_features=[None, 'sqrt', 'log2'],
    max_depth=[2, 4, 8, 16],
    criterion=['gini', 'entropy'],
    # Class 1 is weighted ten or nine times as heavily as class 0.
    class_weight=[{1: weight, 0: 1} for weight in (10, 9)],
)

In [None]:
rfc = RandomForestClassifier(random_state=42)
# The original passed `scoreFunction`, a name that is never defined.
# NOTE(review): 'f1_macro' is a built-in scorer chosen as a stand-in because
# it needs no extra import -- swap in the intended metric if it differs.
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5,
                      scoring='f1_macro', n_jobs=-1)
CV_rfc.fit(X_train, y_train)
CV_rfc.best_params_

In [None]:
# Build a forest configured with the hyperparameters the grid search selected.
best = CV_rfc.best_params_
tuned = {
    name: best[name]
    for name in ('max_features', 'n_estimators', 'max_depth', 'criterion', 'class_weight')
}
rfc1 = RandomForestClassifier(random_state=42, n_jobs=-1, **tuned)

In [None]:
# Train on the training split and predict the held-out rows. The split is
# stored as X_train / X_test; the original lowercase x_train / x_test names
# do not exist and raised NameError.
rfc1.fit(X_train, y_train)
pred = rfc1.predict(X_test)

### Feature Evaluation

In [None]:
# Histogram of the 'body_len' column over 40 evenly spaced bin edges in [0, 50].
# NOTE(review): `np` and `pp` are not imported in any visible cell (presumably
# numpy and matplotlib.pyplot), and no visible cell creates a 'body_len'
# column -- confirm both before running.
bins = np.linspace(0,50,40) # y-axis: number of samples; x-axis: length of each
pp.hist(data['body_len'], bins)
pp.show()

In [None]:
# For i = 1, 2, 3: overlay histograms of body_len ** (1/i) for rows labelled
# 'Valid' and rows labelled 'Spam', one figure per i.
# NOTE(review): `np` and `pp` are not imported in any visible cell, and no
# visible cell creates a 'body_len' column -- confirm before running.
for i in [1,2,3]:
    # Upper bin edge shrinks as 50 / i**i (50, 12.5, ~1.85).
    # NOTE(review): the data is transformed with the i-th root, whose range is
    # 50 ** (1/i) -- check that these bins still cover the transformed values.
    bins = np.linspace(0,50/(i**i),40)
    pp.hist((data[data['label'] == 'Valid']['body_len'])**(1/i), bins, label = 'VALID', alpha = 0.5)
    pp.hist((data[data['label'] == 'Spam']['body_len'])**(1/i), bins, label = 'SPAM', alpha = 0.5)
    pp.legend(loc='upper left')
    pp.title('transformation 1/{}'.format(str(i)))
    pp.show()