In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import re
import warnings
from tqdm import tqdm

from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score, f1_score


# Hold-out fraction and seed used for the train/test split; RANDOM_STATE is
# also passed to the seeded classifiers further down.
TEST_SIZE = .25
RANDOM_STATE = 42

# Load the tweets under our own column names (header=0 replaces the file's
# header row), discard the `product` column, then discard incomplete rows.
df = (
    pd.read_csv(
        'data/tweet_tweet.csv',
        header=0,
        names=['body', 'product', 'target'],
    )
    .drop(columns='product')
    .dropna()
)

train, test = train_test_split(
    df,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

In [2]:
class ExperimentalTransformer(BaseEstimator, TransformerMixin):
    '''
    Text-to-matrix transformer meant to be the first step of a Pipeline.

    For every raw document it:
        1. casts to str (missing values become '') and lowercases
        2. tokenizes with ``toker``
        3. removes the tokens listed in ``sw``
        4. POS-tags the remaining tokens and maps the tags to wordnet tags
        5. lemmatizes each token with ``lemming``
        6. joins the lemmas back into one space-separated string
        7. vectorizes the cleaned documents with ``vectorvictor``

    Parameters
    ----------
    toker : object with ``tokenize(str) -> list of str``, optional
        Defaults to ``RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")``.
        NOTE(review): that pattern only keeps contractions together when they
        are written with a typographic apostrophe (’); with a straight quote
        the two halves become separate tokens. Confirm this is intended.
    lemming : object with ``lemmatize(word, pos)``, optional
        Defaults to ``WordNetLemmatizer()``.
    vectorvictor : vectorizer with ``fit`` / ``transform`` / ``fit_transform``, optional
        Defaults to a new ``TfidfVectorizer()``.
    sw : collection of str, optional
        Stop words to drop. Defaults to ``stopwords.words('english')``.
    '''

    def __init__(self, toker=None, lemming=None, vectorvictor=None, sw=None):
        # The defaults are built here, per instance, instead of in the
        # signature: a default written in the signature is evaluated once and
        # would make every default-constructed transformer share (and refit)
        # the same vectorizer object.
        self._fitted = False
        self.toker = (
            RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)") if toker is None else toker
        )
        self.lemming = WordNetLemmatizer() if lemming is None else lemming
        self.sw = stopwords.words('english') if sw is None else sw
        self.vectorvictor = TfidfVectorizer() if vectorvictor is None else vectorvictor

    def _clean_me(self, words, toker, lemming):
        '''
        Run steps 1-6 of the class docstring.

        Parameters
        ----------
        words : pandas Series or other 1-D collection of documents
        toker, lemming : the tokenizer and lemmatizer to apply

        Returns
        -------
        pandas Series holding one cleaned, space-joined string per document.
        The input is not modified.
        '''
        # pd.Series() lets lists/arrays through as well; astype() returns a new
        # object, so no explicit defensive copy is required.
        docs = pd.Series(words).astype('string').fillna('').str.lower()
        # Set membership is O(1) per token; the default stop-word list is a list.
        stop_words = set(self.sw)

        def _lemmatize_doc(doc):
            tokens = [tok for tok in toker.tokenize(doc) if tok not in stop_words]
            return ' '.join(
                lemming.lemmatize(tok, self.nltk_to_wordnet(tag))
                for tok, tag in pos_tag(tokens)
            )

        return docs.apply(_lemmatize_doc)

    def fit(self, raw_doc, y=None):
        '''
        Clean ``raw_doc`` and fit ``vectorvictor`` on the result.

        ``y`` is accepted for Pipeline compatibility and ignored.
        Returns ``self``.
        '''
        cleaned = self._clean_me(raw_doc, self.toker, self.lemming)
        self.vectorvictor.fit(cleaned)
        self._fitted = True
        return self

    def fit_transform(self, X, y=None, **fit_params):
        '''
        Fit on ``X`` and return its vectorized form, cleaning ``X`` only once.

        The fit_transform inherited from TransformerMixin would call fit() and
        then transform(), running the tagging/lemmatizing pass twice over the
        same documents.
        '''
        cleaned = self._clean_me(X, self.toker, self.lemming)
        vectors = self.vectorvictor.fit_transform(cleaned)
        self._fitted = True
        return vectors

    def transform(self, X, y=None):
        '''
        Clean ``X`` and vectorize it with the already fitted ``vectorvictor``.

        Raises
        ------
        KeyError
            If neither fit() nor fit_transform() has been called on this instance.
        '''
        if not self._fitted:
            # KeyError is kept as the exception type for existing callers.
            raise KeyError(
                'ExperimentalTransformer is not fitted yet; '
                'call fit() before transform().'
            )
        cleaned = self._clean_me(X, self.toker, self.lemming)
        return self.vectorvictor.transform(cleaned)

    def nltk_to_wordnet(self, treebank_tag):
        '''
        Translate an nltk POS tag to the matching wordnet tag.

        Tags starting with J / V / N / R map to ADJ / VERB / NOUN / ADV;
        anything else falls back to NOUN.
        '''
        by_initial = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'N': wordnet.NOUN,
            'R': wordnet.ADV,
        }
        return by_initial.get(treebank_tag[:1], wordnet.NOUN)

In [3]:
# One (display name, unfitted estimator) pair per model in the comparison.
# Keeping each label next to its estimator keeps `names` and `classifiers`
# aligned by construction.
MODEL_ZOO = [
    ("Dummy Classifier", DummyClassifier(strategy="most_frequent")),
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("Linear SVM", SVC(kernel="linear", C=0.025, random_state=RANDOM_STATE)),
    ("RBF SVM", SVC(gamma=2, C=1, random_state=RANDOM_STATE)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, random_state=RANDOM_STATE)),
    ("Bagging", BaggingClassifier(random_state=RANDOM_STATE)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000, random_state=RANDOM_STATE)),
    ("AdaBoost", AdaBoostClassifier(random_state=RANDOM_STATE)),
    ("MultiNomial Naive Bayes", MultinomialNB()),
    # Currently switched off:
    # ("QDA", QuadraticDiscriminantAnalysis()),
    # ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0))),
    # ("Gaussian Naive Bayes", GaussianNB()),
]

names = [label for label, _ in MODEL_ZOO]
classifiers = [model for _, model in MODEL_ZOO]


In [4]:
warnings.filterwarnings('ignore')

# zip() stops at the shorter list, so a missing name or model would silently
# drop a result; fail loudly instead.
assert len(names) == len(classifiers), 'names and classifiers are out of sync'

# The text cleaning + vectorizing step does not depend on the classifier, so it
# is computed once here instead of once per model inside the loop.
featurizer = ExperimentalTransformer()
train_vectors = featurizer.fit_transform(train.body)
test_vectors = featurizer.transform(test.body)

# reg_stats maps model name -> list of (label, value) tuples. The leading
# (' ', ' ') tuple is a blank spacer kept for the existing display format.
reg_stats = {}
# total= lets tqdm draw a real progress bar; a bare zip() has no length.
for name, classifier in tqdm(zip(names, classifiers), total=len(names)):
    classifier.fit(train_vectors, train.target)
    preds = classifier.predict(test_vectors)

    reg_stats[name] = [
        (' ', ' '),
        ('acc', accuracy_score(test.target, preds)),
        ('pre', precision_score(test.target, preds, average='weighted')),
        ('f1', f1_score(test.target, preds, average='weighted'))
    ]

    

11it [02:15, 12.30s/it]


In [5]:
reg_stats

{'Dummy Classifier': [(' ', ' '),
  ('acc', 0.5807303123625165),
  ('pre', 0.337247695696666),
  ('f1', 0.4266985874302932)],
 'Nearest Neighbors': [(' ', ' '),
  ('acc', 0.5983282006159261),
  ('pre', 0.5822768177975369),
  ('f1', 0.5840653106969752)],
 'Linear SVM': [(' ', ' '),
  ('acc', 0.5807303123625165),
  ('pre', 0.337247695696666),
  ('f1', 0.4266985874302932)],
 'RBF SVM': [(' ', ' '),
  ('acc', 0.6647602287725473),
  ('pre', 0.6620337952180668),
  ('f1', 0.6223984301531289)],
 'Decision Tree': [(' ', ' '),
  ('acc', 0.6062472503299604),
  ('pre', 0.6188293384544441),
  ('f1', 0.48963059798442554)],
 'Random Forest': [(' ', ' '),
  ('acc', 0.5807303123625165),
  ('pre', 0.337247695696666),
  ('f1', 0.4266985874302932)],
 'Bagging': [(' ', ' '),
  ('acc', 0.6396832380114387),
  ('pre', 0.6162306572415772),
  ('f1', 0.6105050697228794)],
 'Gradient Boosting': [(' ', ' '),
  ('acc', 0.6423229212494501),
  ('pre', 0.649254492358994),
  ('f1', 0.591081363080999)],
 'Neural Net': [

In [None]:
# Smoke-test the transformer on its own, using the split created at the top of
# the notebook (`X_test` is never defined here): learn the vocabulary from the
# training tweets, then vectorize the held-out tweets with it.
exp = ExperimentalTransformer()
exp.fit(train.body)
exp.transform(test.body)

In [None]:
# Feature-side preprocessing: vectorize the tweet text in `body`.
# LabelEncoder is deliberately NOT a ColumnTransformer step: it encodes targets
# (its fit/transform take a single `y` argument), while ColumnTransformer calls
# each step with (X, y). The label also must not end up among the features, so
# any column other than `body` is dropped rather than passed through.
ct = ColumnTransformer(
    [('exp', ExperimentalTransformer(), 'body')],
    remainder='drop'
)

# Encode the labels separately, e.g. target_encoder.fit_transform(train.target).
target_encoder = LabelEncoder()

In [None]:
# Baseline pipeline: text features -> always predict the most frequent class.
dum_steps = [
    ('exp', ExperimentalTransformer()),
    ('dum', DummyClassifier(strategy='most_frequent')),
]
dum_pipe = Pipeline(dum_steps)
dum_pipe.fit(train.body, train.target)
print('fitted')

preds = dum_pipe.predict(test.body)

# Accuracy first, weighted precision second, one value per line.
print(
    accuracy_score(test.target, preds),
    precision_score(test.target, preds, average='weighted'),
    sep='\n',
)

In [None]:
# Multinomial naive Bayes on the vectorized tweet text.
mnb_steps = [
    ('exp', ExperimentalTransformer()),
    ('mnb', MultinomialNB()),
]
mnb_pipe = Pipeline(mnb_steps)
mnb_pipe.fit(train.body, train.target)
print('fitted')

preds = mnb_pipe.predict(test.body)

# Accuracy first, weighted precision second, one value per line.
print(
    accuracy_score(test.target, preds),
    precision_score(test.target, preds, average='weighted'),
    sep='\n',
)

In [None]:
# Random forest on the vectorized tweet text.
# The forest is seeded with the notebook-wide RANDOM_STATE so the scores below
# are the same on every run.
rfc_pipe = Pipeline([
    ('exp', ExperimentalTransformer()),
    ('rfc', RandomForestClassifier(random_state=RANDOM_STATE))
])


rfc_pipe.fit(train.body, train.target)
print('fitted')
preds = rfc_pipe.predict(test.body)

print(accuracy_score(test.target, preds))
print(precision_score(test.target, preds, average='weighted'))

In [None]:
# Bagging ensemble on the vectorized tweet text.
# Seeded with the notebook-wide RANDOM_STATE so the scores below are the same
# on every run.
bag_pipe = Pipeline([
    ('exp', ExperimentalTransformer()),
    ('bag', BaggingClassifier(random_state=RANDOM_STATE))
])


bag_pipe.fit(train.body, train.target)
print('fitted')
preds = bag_pipe.predict(test.body)

print(accuracy_score(test.target, preds))
print(precision_score(test.target, preds, average='weighted'))

In [None]:
# Gradient boosting on the vectorized tweet text.
# The 'gbc' step must hold a GradientBoostingClassifier; with a
# BaggingClassifier in that slot this cell only repeats the bagging experiment.
# Seeded with the notebook-wide RANDOM_STATE for repeatable scores.
gbc_pipe = Pipeline([
    ('exp', ExperimentalTransformer()),
    ('gbc', GradientBoostingClassifier(random_state=RANDOM_STATE))
])


gbc_pipe.fit(train.body, train.target)
print('fitted')
preds = gbc_pipe.predict(test.body)

print(accuracy_score(test.target, preds))
print(precision_score(test.target, preds, average='weighted'))

In [None]:
# AdaBoost on the vectorized tweet text.
# Seeded with the notebook-wide RANDOM_STATE so the scores below are the same
# on every run.
ada_pipe = Pipeline([
    ('exp', ExperimentalTransformer()),
    ('ada', AdaBoostClassifier(random_state=RANDOM_STATE))
])


ada_pipe.fit(train.body, train.target)
print('fitted')
preds = ada_pipe.predict(test.body)

print(accuracy_score(test.target, preds))
print(precision_score(test.target, preds, average='weighted'))

In [None]:
 # NOTE(review): `self`, `X` and `y` are not defined at notebook scope in any
 # cell visible here, so this line presumably raises NameError on a fresh
 # kernel. It looks like a fragment pasted from inside a method - confirm
 # whether this cell is still needed.
 X, y = self._check_X_y(X, y)

In [None]:
# dtype of the raw tweet text in the training split (`X_train` is never defined
# in this notebook; the split made at the top is named `train`).
train.body.dtypes

In [None]:
# Same cast ExperimentalTransformer applies before cleaning, run on the training
# text (`X_train` is never defined in this notebook; the split is named `train`).
X_ab = train.body.astype('string')
X_ab.dtypes

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class CompanyMentionDetector(BaseEstimator, TransformerMixin):
    '''
    Add a 'keyword' column saying which company a row's text mentions.

    For every column named in ``feature_names`` the text is:
        0. copied (the input frame is never modified)
        1. converted to str
        2. lowercased
        3. tokenized with a RegexpTokenizer
        4. checked, token by token, against the Apple and Google keyword lists
        5. reduced to a single 'keyword' value: 'apple', 'google', or missing

    Apple takes precedence over Google within a column; when several columns
    are given, the first column that yields a match wins.

    Parameters
    ----------
    feature_names : str or list of str, default 'body'
        Text column(s) of the DataFrame to scan.
    '''

    def __init__(self, feature_names='body'):
        # Stored exactly as passed; it is normalized to a list in transform().
        self.feature_names = feature_names
        self.apple_words = ['apple', 'ipad', 'iphone', 'mac', 'ios']
        self.google_words = ['google', 'android', 'pixel']

    def fit(self, X, y=None):
        '''Nothing is learned from the data; returns ``self``.'''
        return self

    def transform(self, X, y=None):
        '''
        Return a copy of ``X`` with the 'keyword' column added.

        Parameters
        ----------
        X : pandas DataFrame containing every column in ``feature_names``
        y : ignored

        Returns
        -------
        pandas DataFrame: the copy of ``X`` plus 'keyword', without any
        'product' column.
        '''
        features = self.feature_names
        if not isinstance(features, (list, tuple)):
            features = [features]

        X_copy = X.copy()
        # One tokenizer serves every column.
        tokenizer = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")

        keyword = None
        for feat in features:
            tokens = X_copy[feat].astype('str').str.lower().apply(tokenizer.tokenize)
            # combine_first keeps 'apple' where present, otherwise takes 'google'.
            found = tokens.apply(self.is_apple).combine_first(
                tokens.apply(self.is_google)
            )
            # Accumulate across columns rather than overwriting, so a match in
            # an earlier column is not lost to a later column without one.
            keyword = found if keyword is None else keyword.combine_first(found)

        if keyword is not None:
            X_copy['keyword'] = keyword

        if 'product' in X_copy:
            X_copy = X_copy.drop(columns='product')

        return X_copy

    def is_apple(self, tweet_text):
        '''Return 'apple' if any Apple keyword is in ``tweet_text``, else None.'''
        if any(keyword.lower() in tweet_text for keyword in self.apple_words):
            return 'apple'
        return None

    def is_google(self, tweet_text):
        '''Return 'google' if any Google keyword is in ``tweet_text``, else None.'''
        if any(keyword.lower() in tweet_text for keyword in self.google_words):
            return 'google'
        return None
