In [1]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import unicodedata
import string
import pattern3
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import LatentDirichletAllocation
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

class text_mining_opt(object):
    def __init__(self, text_data):
        self._comments = text_data
        self.CONTRACTION = {
            "ain't": "is not",
            "aren't": "are not",
            "can't": "cannot",
            "can't've": "cannot have",
            "'cause": "because",
            "could've": "could have",
            "couldn't": "could not",
            "couldn't've": "could not have",
            "didn't": "did not",
            "doesn't": "does not",
            "don't": "do not",
            "hadn't": "had not",
            "hadn't've": "had not have",
            "hasn't": "has not",
            "haven't": "have not",
            "he'd": "he would",
            "he'd've": "he would have",
            "he'll": "he will",
            "he'll've": "he he will have",
            "he's": "he is",
            "how'd": "how did",
            "how'd'y": "how do you",
            "how'll": "how will",
            "how's": "how is",
            "I'd": "I would",
            "I'd've": "I would have",
            "I'll": "I will",
            "I'll've": "I will have",
            "I'm": "I am",
            "I've": "I have",
            "i'd": "i would",
            "i'd've": "i would have",
            "i'll": "i will",
            "i'll've": "i will have",
            "i'm": "i am",
            "i've": "i have",
            "isn't": "is not",
            "it'd": "it would",
            "it'd've": "it would have",
            "it'll": "it will",
            "it'll've": "it will have",
            "it's": "it is",
            "let's": "let us",
            "ma'am": "madam",
            "mayn't": "may not",
            "might've": "might have",
            "mightn't": "might not",
            "mightn't've": "might not have",
            "must've": "must have",
            "mustn't": "must not",
            "mustn't've": "must not have",
            "needn't": "need not",
            "needn't've": "need not have",
            "o'clock": "of the clock",
            "oughtn't": "ought not",
            "oughtn't've": "ought not have",
            "shan't": "shall not",
            "sha'n't": "shall not",
            "shan't've": "shall not have",
            "she'd": "she would",
            "she'd've": "she would have",
            "she'll": "she will",
            "she'll've": "she will have",
            "she's": "she is",
            "should've": "should have",
            "shouldn't": "should not",
            "shouldn't've": "should not have",
            "so've": "so have",
            "so's": "so as",
            "that'd": "that would",
            "that'd've": "that would have",
            "that's": "that is",
            "there'd": "there would",
            "there'd've": "there would have",
            "there's": "there is",
            "they'd": "they would",
            "they'd've": "they would have",
            "they'll": "they will",
            "they'll've": "they will have",
            "they're": "they are",
            "they've": "they have",
            "to've": "to have",
            "wasn't": "was not",
            "we'd": "we would",
            "we'd've": "we would have",
            "we'll": "we will",
            "we'll've": "we will have",
            "we're": "we are",
            "we've": "we have",
            "weren't": "were not",
            "what'll": "what will",
            "what'll've": "what will have",
            "what're": "what are",
            "what's": "what is",
            "what've": "what have",
            "when's": "when is",
            "when've": "when have",
            "where'd": "where did",
            "where's": "where is",
            "where've": "where have",
            "who'll": "who will",
            "who'll've": "who will have",
            "who's": "who is",
            "who've": "who have",
            "why's": "why is",
            "why've": "why have",
            "will've": "will have",
            "won't": "will not",
            "won't've": "will not have",
            "would've": "would have",
            "wouldn't": "would not",
            "wouldn't've": "would not have",
            "y'all": "you all",
            "y'all'd": "you all would",
            "y'all'd've": "you all would have",
            "y'all're": "you all are",
            "y'all've": "you all have",
            "you'd": "you would",
            "you'd've": "you would have",
            "you'll": "you will",
            "you'll've": "you will have",
            "you're": "you are",
            "you've": "you have"
            }
        
    def normalize_accented_characters(self, text):
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf8')
        return text 
    
    def expand_contractions(self, text):
        contractions_pattern = re.compile('({})'.format('|'.join(self.CONTRACTION.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
        
        def expand_match(contraction):
            match = contraction.group(0)
            first_char = match[0]
            expanded_contraction = self.CONTRACTION.get(match)\
                                    if self.CONTRACTION.get(match)\
                                    else self.CONTRACTION.get(match.lower())                       
            expanded_contraction = first_char+expanded_contraction[1:]
            return expanded_contraction
        
        expanded_text = contractions_pattern.sub(expand_match, text)
        expanded_text = re.sub("'", "", expanded_text)
        return expanded_text
    
    def tokenize_text(self, text):
        tokens = nltk.word_tokenize(text) 
        tokens = [token.strip() for token in tokens]
        return tokens
    
    def pos_tag_text(self, text_tokens):
        def penn_to_wn_tags(pos_tag):
            if pos_tag.startswith('J'):
                return wn.ADJ
            elif pos_tag.startswith('V'):
                return wn.VERB
            elif pos_tag.startswith('N'):
                return wn.NOUN
            elif pos_tag.startswith('R'):
                return wn.ADV
            else:
                return None  
        tagged_text = nltk.pos_tag(text_tokens)
        tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                             for word, pos_tag in
                             tagged_text]
        return tagged_lower_text

    def lemmatize_text(self, text):
        wnl = WordNetLemmatizer()
        pos_tagged_text = self.pos_tag_text(text)
        lemmatized_tokens = [wnl.lemmatize(word, pos_tag) if pos_tag
                             else word                     
                             for word, pos_tag in pos_tagged_text]
        lemmatized_text = ' '.join(lemmatized_tokens)
        return lemmatized_text
    
    def remove_special_characters(self, text):
        tokens = self.tokenize_text(text)
        pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
        filtered_tokens = filter(None, [pattern.sub(' ', token) for token in tokens])
        filtered_text = ' '.join(filtered_tokens)
        return filtered_text 
    
    def remove_stopwords(self, text):
        stopword_list = nltk.corpus.stopwords.words('english')
        stopword_list = stopword_list + ['mr', 'mrs', 'come', 'go', 'get',
                                 'tell', 'listen', 'one', 'two', 'three',
                                 'four', 'five', 'six', 'seven', 'eight',
                                 'nine', 'zero', 'join', 'find', 'make',
                                 'say', 'ask', 'tell', 'see', 'try', 'back',
                                 'also','would']
        tokens = self.tokenize_text(text)
        filtered_tokens = [token for token in tokens if token not in stopword_list]
        filtered_text = ' '.join(filtered_tokens)    
        return filtered_text
    
    def keep_text_characters(self,text):
        filtered_tokens = []
        tokens = self.tokenize_text(text)
        for token in tokens:
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)
        filtered_text = ' '.join(filtered_tokens)
        return filtered_text
    
    def normalize(self, only_text_chars=True):
        """
        df: dataframe
        comment_col: str, the target comments column
        to_value: str, add a new column that is cleanned and standardized

        1.clean data
        2.tokenization
        3.lowercase
        4.removing stpwrds
        5.stemming
        6.calc. wrd freq
        """
        normalized_comments = list()
        
        for index, text in enumerate(self._comments):
            text = self.normalize_accented_characters(text)
            text = self.expand_contractions(text)
            text = self.tokenize_text(text)
            text = self.lemmatize_text(text)
            text = self.remove_special_characters(text)
            text = self.remove_stopwords(text)
            if only_text_chars:
                text = self.keep_text_characters(text)
            normalized_comments.append(text) 
        
        return normalized_comments
    
    '''Feature Engineering''' 
    # Feature Extraction 1: Bag-Of-Words
    # 1: find out all unique tokens, excluding those that are shown in all comments
    def selKBest(score_func = chi2, k = 10000):
        chi2_kbest = SelectKBest(score_func = score_func, k = k)
        return chi2_kbest
    
    def fe(self, normalized_comments, version='BOW'):
        print("please build a feature selector first!")
        
        if version=='BOW':
            bow_vectorizer = CountVectorizer()
            matrix = bow_vectorizer.fit_transform(normalized_comments).toarray()
            feature_names = bow_vectorizer.get_feature_names()

        if version=='TF-IDF':
            TF_IDF_vectorizer = TfidfVectorizer(norm = None, smooth_idf = True)
            matrix = TF_IDF_vectorizer.fit_transform(normalized_comments).toarray()
            feature_names = TF_IDF_vectorizer.get_feature_names()

            
        if version=='Tri-Gram':
            vectorizer_Tri_Grams = CountVectorizer(max_features=1000,ngram_range=(3,3),stop_words="english")
            matrix = vectorizer_Tri_Grams.fit_transform(normalized_comments).toarray()
            feature_names = vectorizer_Tri_Grams.get_feature_names()
        
        if version=='Topic':
            lda_bow_vectorizer = CountVectorizer()
            matrix = lda_bow_vectorizer.fit_transform(normalized_comments)
            ld_bow = LatentDirichletAllocation(n_components=2, max_iter=10,
                                           doc_topic_prior = 0.5,
                                           topic_word_prior = 0.5).fit(matrix)
            pred_topic_idx = ld_bow.transform(matrix).argmax(axis=1)
            return pred_topic_idx
            
        return matrix, feature_names

[nltk_data] Downloading package punkt to C:\Users\Hugo
[nltk_data]     Xue\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Hugo Xue\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\Hugo
[nltk_data]     Xue\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Hugo
[nltk_data]     Xue\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
# Step 2: Buidling a selector using machine learning algorithms
# option 1:
# L1-based feature selection Classification: With SVMs and logistic-regression, the parameter C controls the sparsity: the smaller C the fewer features selected. Regression: With Lasso, the higher the alpha parameter, the fewer features selected.
#machine learning feature selection model
class featureSele:
    def __init__(self,names, x, y, c=0.01, plty="l1", dl=False, mode="svm"):
        self.names = names
        self.x = x
        self.y = y 
        self.c = c
        self.plty = plty
        self.dl = dl
        self.mode = mode
        print("mode=svm, use linearsvc to reduce features/dimensionality;\
               mode=log, use logistic to reduce features/dimensionality\
               default params for log: multi_cls = 'ovr', mx_iter=1e4, slvr='liblinear'")
    
    def featureSele_svm(self):
        # Regularization
        lsvc = LinearSVC(C=self.c, penalty=self.plty, dual=self.dl) 
        lsvc_model = SelectFromModel(lsvc).fit(self.x,self.y) 

        count = 0
        support_vec = list()
        for x in lsvc_model.get_support():
            if x == True: 
                support_vec.append(count)
                count+=1

        new_features_names = list()
        for idx, col in enumerate(self.names):
            if idx in support_vec: new_features_names.append(col)

        return new_features_names

    def featureSele_logit(self, multi_cls,mx_iter,slvr):
        # Regularization
        logit = LogisticRegression(C=self.c, penalty=self.plty, dual=self.dl, multi_class=multi_cls,max_iter=mx_iter,solver=slvr)
        logit_model = SelectFromModel(logit).fit(self.x,self.y) 

        count = 0
        support_vec = list()
        for x in logit_model.get_support():
            if x == True: 
                support_vec.append(count)
                count+=1

        new_features_names = list()
        for idx, col in enumerate(self.names):
            if idx in support_vec: new_features_names.append(col)

        return new_features_names
  
    def run(self, multi_cls='ovr', mx_iter=1e4, slvr='liblinear'):
        if self.mode == "svm":
            return self.featureSele_svm()
        if self.mode == "log":
            return self.featureSele_logit(multi_cls=multi_cls, mx_iter=mx_iter,slvr=slvr)

In [3]:
# Use ML to test if the additional features help us to find the false positive case
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier 
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
# Draw ROC curve packages
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import roc_curve, auc

class ML_Val:
    def __init__(self,x,y,test_size=0.2,random_state=456,max_iter=1e4, n_neighbors=3, nn_hid = (6, 2),solver = "adam"):
        self.x = x
        self.y = y
        self.test_size = test_size
        self.random_state=random_state
        self.max_iter=max_iter
        self.n_neighbors = n_neighbors
        self.nn_hid = nn_hid
        self.solver = solver
    
    def split_data(self):
        # Split into train/test
        x_train, x_test, y_train, y_test = train_test_split(    
            self.x, self.y, test_size=self.test_size, random_state=self.random_state)
        return x_train, x_test, y_train, y_test
  
    def logit(self,x_train,y_train,x_test):
        #logisitic classification
        clf_logit = SGDClassifier(loss='log', random_state=self.random_state)
        clf_logit.fit(x_train,y_train)
        clf_pred_logit = clf_logit.predict(x_test)

        y_score_logit = clf_logit.decision_function(x_test)
        return clf_pred_logit, y_score_logit
    
    def sgd(self,x_train,y_train,x_test):
        # Stochastic Gradient Descent
        clf_sgd = SGDClassifier(random_state=self.random_state)
        clf_sgd.fit(x_train,y_train)
        clf_pred_sgd = clf_sgd.predict(x_test)

        y_score_sgd = clf_sgd.decision_function(x_test)
        return clf_pred_sgd, y_score_sgd
  
    def svc(self,x_train,y_train,x_test):
        # Linear Support Vector Machine
        clf_linearsvc = LinearSVC(random_state=self.random_state,max_iter=self.max_iter)
        clf_linearsvc.fit(x_train,y_train)
        clf_pred_linearsvc = clf_linearsvc.predict(x_test)

        y_score_svc = clf_linearsvc.decision_function(x_test)
        return clf_pred_linearsvc, y_score_svc
  
    def nb(self,x_train,y_train,x_test):
        # Multinomial Naive Bayes
        clf_mnb = MultinomialNB()
        clf_mnb.fit(x_train,y_train)
        clf_pred_mnb = clf_mnb.predict(x_test)

        y_score_mnb = clf_mnb.predict_proba(x_test)
        mnb_y_score = list()
        for yscore in y_score_mnb:
            mnb_y_score.append(yscore[1])
        return clf_pred_mnb, mnb_y_score
  
    def knn(self,x_train,y_train,x_test):
        # KNearestNeighbors
        clf_neigh = KNeighborsClassifier(n_neighbors=self.n_neighbors) # 3 is good for classification
        clf_neigh.fit(x_train,y_train)
        clf_pred_neigh = clf_neigh.predict(x_test)

        y_score_knn = clf_neigh.predict_proba(x_test)
        knn_y_score = list()
        for yscore in y_score_knn:
            knn_y_score.append(yscore[1])
        return clf_pred_neigh, knn_y_score
  
    def dt(self,x_train,y_train,x_test):
        # Decision Tree
        clf_dtc = DecisionTreeClassifier(random_state=self.random_state)
        clf_dtc.fit(x_train,y_train)
        clf_pred_dtc = clf_dtc.predict(x_test)

        y_score_dtc = clf_dtc.predict_proba(x_test)
        dtc_y_score = list()
        for yscore in y_score_dtc:
            dtc_y_score.append(yscore[1])
        return clf_pred_dtc, dtc_y_score
    
    def rfc(self,x_train,y_train,x_test):
        # Random Forest
        clf_rfc = RandomForestClassifier(random_state=self.random_state)
        clf_rfc.fit(x_train,y_train)
        clf_pred_rfc = clf_rfc.predict(x_test)

        y_score_rfc = clf_rfc.predict_proba(x_test)
        rfc_y_score = list()
        for yscore in y_score_rfc:
            rfc_y_score.append(yscore[1])
        return clf_pred_rfc, rfc_y_score
    
    def gbc(self,x_train,y_train,x_test):
        # Gradient Boosting
        clf_gbc = RandomForestClassifier(random_state=self.random_state)
        clf_gbc.fit(x_train,y_train)
        clf_pred_gbc = clf_gbc.predict(x_test)

        y_score_gbc = clf_gbc.predict_proba(x_test)
        gbc_y_score = list()
        for yscore in y_score_gbc:
            gbc_y_score.append(yscore[1])
        return clf_pred_gbc, gbc_y_score
  
    def svbc(self,x_train,y_train,x_test):
        # Bagging based on SVC
        clf_svbc = BaggingClassifier(base_estimator=SVC(),random_state=self.random_state)
        clf_svbc.fit(x_train,y_train)
        clf_pred_svbc = clf_svbc.predict(x_test)

        y_score_bc = clf_svbc.predict_proba(x_test)
        bc_y_score = list()
        for yscore in y_score_bc:
            bc_y_score.append(yscore[1])
        return clf_pred_svbc, bc_y_score
  
    def nn(self,x_train,y_train,x_test):
        # Neural Network
        clf_nn = MLPClassifier(
          activation="relu", 
          solver=self.solver, 
          alpha=1e-5, 
          hidden_layer_sizes=self.nn_hid, 
          random_state=self.random_state, 
          max_iter=self.max_iter)
        clf_nn.fit(x_train, y_train)
        clf_pred_nn = clf_nn.predict(x_test)

        y_score_nn = clf_nn.predict_proba(x_test)
        nn_y_score = list()
        for yscore in y_score_nn:
            nn_y_score.append(yscore[1])
        return clf_pred_nn, nn_y_score
  
    def train(self, roc=False, w=15,h=10,ft_size=18):
        x_train, x_test, y_train, y_test = self.split_data()

        clf_pred_logit, y_score_logit = self.logit(x_train,y_train,x_test)
        print("===============Logistic======================")
        print(classification_report(y_test,clf_pred_logit))
        print("===============Logistic======================")
        print(" ")

        clf_pred_sgd, y_score_sgd = self.sgd(x_train,y_train,x_test)
        print("===============Stochastic Gradient Descent======================")
        print(classification_report(y_test,clf_pred_sgd))
        print("===============Stochastic Gradient Descent======================")
        print(" ")

        clf_pred_linearsvc, y_score_svc = self.svc(x_train,y_train,x_test)
        print("===============Linear Support Vector Machine======================")
        print(classification_report(y_test,clf_pred_linearsvc))
        print("===============Linear Support Vector Machine======================")
        print(" ")

        clf_pred_mnb, mnb_y_score = self.nb(x_train,y_train,x_test)
        print("===============Multinomial Naive Bayes======================")
        print(classification_report(y_test,clf_pred_mnb))
        print("===============Multinomial Naive Bayes======================")
        print(" ")

        clf_pred_neigh, knn_y_score = self.knn(x_train,y_train,x_test)
        print("===============KNearestNeighbors======================")
        print(classification_report(y_test,clf_pred_neigh))
        print("===============KNearestNeighbors======================")
        print(" ")

        clf_pred_dtc, dtc_y_score = self.dt(x_train,y_train,x_test)
        print("===============Decision Tree======================")
        print(classification_report(y_test,clf_pred_dtc))
        print("===============Decision Tree======================")
        print(" ")

        clf_pred_rfc, rfc_y_score = self.rfc(x_train,y_train,x_test)
        print("===============Random Forest======================")
        print(classification_report(y_test,clf_pred_rfc))
        print("===============Random Forest======================")
        print(" ")

        clf_pred_gbc, gbc_y_score = self.gbc(x_train,y_train,x_test)
        print("===============Gradient Boosting======================")
        print(classification_report(y_test,clf_pred_gbc))
        print("===============Gradient Boosting======================")
        print(" ")

        clf_pred_svbc, bc_y_score = self.svbc(x_train,y_train,x_test)
        print("===============Bagging based on SVC======================")
        print(classification_report(y_test,clf_pred_svbc))
        print("===============Bagging based on SVC======================")
        print(" ")

        clf_pred_nn, nn_y_score = self.nn(x_train,y_train,x_test)
        print("===============Neural Network======================")
        print(classification_report(y_test,clf_pred_nn))
    
        if roc:
            fpr_logit, tpr_logit, _ = roc_curve(y_test,y_score_logit)
            fpr_sgd, tpr_sgd, _ = roc_curve(y_test,y_score_sgd)
            fpr_svc, tpr_svc, _ = roc_curve(y_test,y_score_svc)
            fpr_mnb, tpr_mnb, _ = roc_curve(y_test,mnb_y_score)
            fpr_knn, tpr_knn, _ = roc_curve(y_test,knn_y_score)
            fpr_dtc, tpr_dtc, _ = roc_curve(y_test,dtc_y_score)
            fpr_rfc, tpr_rfc, _ = roc_curve(y_test,rfc_y_score)
            fpr_gbc, tpr_gbc, _ = roc_curve(y_test,gbc_y_score)
            fpr_bc, tpr_bc, _ = roc_curve(y_test,bc_y_score)
            fpr_nn, tpr_nn, _ = roc_curve(y_test, nn_y_score)

            ax = plt.gca()
            fig = plt.gcf()
            fig.set_size_inches(w,h)
            ax.set_xlim(0,1)
            ax.set_ylim(0,1)
            ax.set_title("Receiver operating characteristic",{"fontsize":ft_size})
            ax.plot(fpr_logit,tpr_logit,color="blue",lw=1, label="logit")
            ax.plot(fpr_sgd,tpr_sgd,color="pink", lw=1, label="sgd")
            ax.plot(fpr_svc,tpr_svc,color="yellow", lw=1,label="svc")
            ax.plot(fpr_mnb,tpr_mnb,color="red", lw=1,label="mnb")
            ax.plot(fpr_knn,tpr_knn,color="orange", lw=1,label="knn")
            ax.plot(fpr_dtc,tpr_dtc,color="gray", lw=1,label="dtc")
            ax.plot(fpr_rfc,tpr_rfc,color="cyan", lw=1,label="rfc")
            ax.plot(fpr_gbc,tpr_gbc,color="black", lw=1,label="gbc")
            ax.plot(fpr_bc,tpr_bc,color="olive", lw=1,label="bc")
            ax.plot(fpr_nn,tpr_nn,color="green", lw=1,label="nn")
            ax.legend(loc="best")

In [4]:
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
corpus_train = fetch_20newsgroups(categories = categories,
                                  subset = 'train', 
                                  remove = ('headers', 'footers', 'quotes')) 

corpus_test = fetch_20newsgroups(categories = categories,
                                 subset='test', 
                                 remove=('headers', 'footers', 'quotes')) 

In [5]:
if __name__=='__main__':
    #     normalization
    tm = text_mining_opt(text_data=corpus_train.data)
    normalized_corpus = tm.normalize()
    #     feature engineering
    matrix, feature_names = tm.fe(normalized_corpus)
    #     feature selection
    fs = featureSele(names = feature_names, x=matrix, y=corpus_train.target, c=1, plty="l1", dl=False, mode="log")
    new_features_names = fs.run()
    
    # results
    print(normalized_corpus, "\n====>", feature_names, "\n=====>", new_features_names)
    print(matrix)
    print(feature_names)
    
    # validation
    try:
        ml = ML_Val(x=matrix,y=corpus_train.target,solver='lbfgs')
        ml.train(roc=False)
    except:
        print("some models are invalid")

please build a feature selector first!
mode=svm, use linearsvc to reduce features/dimensionality;               mode=log, use logistic to reduce features/dimensionality               default params for log: multi_cls = 'ovr', mx_iter=1e4, slvr='liblinear'




=====> ['000062david42', '000100255pixel', '000usd', '001200201pixel', '00index', '00pm', '018b', '01a', '023b', '04g', '054589e', '0856e16', '0a', '0b', '0e9', '0km', '0mph', '0w', '0x', '0x00', '0x100', '0x1f', '0x3d4', '0xc010', '0xc018', '1000r', '100c', '100k', '100km', '100megs', '100nm', '100th', '101h', '1024x1024', '1024x512', '1024x728', '1024x768', '1024x768x24', '1024x768x65000', '1024x786x24', '105m', '10bps', '10cm', '10fps', '10h', '10k', '10km', '10kw', '10m', '10mhz', '110m', '110mbytes', '111s', '115m', '119th', '11dec89', '11x17', '1200mi', '1200x900', '122nd', '128m', '12km', '12m', '133866082767180880e', '133941270127999174e', '135x180', '138p', '13e19', '13h', '1400s', '140m', '145mm', '14m', '14th', '1500kg', '1500s', '150a', '150miles', '153847166458030088e', '154m', '15bit', '15e10', '15m', '15mhz', '15rpm', '15th', '160x2xx', '167290000000000000e', '167317532658774153e', '16bit', '16m', '16th', '17apr199316423628', '17f', '17th', '1800s', '18084tm', '1830s', '

              precision    recall  f1-score   support

           0       0.72      0.70      0.71        92
           1       0.92      0.90      0.91       134
           2       0.82      0.85      0.83       117
           3       0.62      0.64      0.63        64

    accuracy                           0.80       407
   macro avg       0.77      0.77      0.77       407
weighted avg       0.80      0.80      0.80       407

 
              precision    recall  f1-score   support

           0       0.78      0.74      0.76        92
           1       0.92      0.89      0.90       134
           2       0.79      0.85      0.82       117
           3       0.65      0.64      0.65        64

    accuracy                           0.81       407
   macro avg       0.78      0.78      0.78       407
weighted avg       0.81      0.81      0.81       407

 
              precision    recall  f1-score   support

           0       0.72      0.73      0.72        92
           1     

In [6]:
new_features_names

['000062david42',
 '000100255pixel',
 '000usd',
 '001200201pixel',
 '00index',
 '00pm',
 '018b',
 '01a',
 '023b',
 '04g',
 '054589e',
 '0856e16',
 '0a',
 '0b',
 '0e9',
 '0km',
 '0mph',
 '0w',
 '0x',
 '0x00',
 '0x100',
 '0x1f',
 '0x3d4',
 '0xc010',
 '0xc018',
 '1000r',
 '100c',
 '100k',
 '100km',
 '100megs',
 '100nm',
 '100th',
 '101h',
 '1024x1024',
 '1024x512',
 '1024x728',
 '1024x768',
 '1024x768x24',
 '1024x768x65000',
 '1024x786x24',
 '105m',
 '10bps',
 '10cm',
 '10fps',
 '10h',
 '10k',
 '10km',
 '10kw',
 '10m',
 '10mhz',
 '110m',
 '110mbytes',
 '111s',
 '115m',
 '119th',
 '11dec89',
 '11x17',
 '1200mi',
 '1200x900',
 '122nd',
 '128m',
 '12km',
 '12m',
 '133866082767180880e',
 '133941270127999174e',
 '135x180',
 '138p',
 '13e19',
 '13h',
 '1400s',
 '140m',
 '145mm',
 '14m',
 '14th',
 '1500kg',
 '1500s',
 '150a',
 '150miles',
 '153847166458030088e',
 '154m',
 '15bit',
 '15e10',
 '15m',
 '15mhz',
 '15rpm',
 '15th',
 '160x2xx',
 '167290000000000000e',
 '167317532658774153e',
 '16bit',