In [75]:
%load_ext autoreload
%autoreload 2

import os
import sys
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk import pos_tag
from nltk.corpus import wordnet
import matplotlib.pyplot as plt
import string
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
corpus = pd.read_csv('data/cleaned.csv')
corpus.drop(columns='Unnamed: 0', inplace=True)
corpus

Unnamed: 0,body,target,company
0,"['wesley', 'i', 'have', 'a', 'g', 'iphone', 'a...",negative emotion,apple
1,"['jessedee', 'know', 'about', 'fludapp', 'awes...",positive emotion,apple
2,"['swonderlin', 'can', 'not', 'wait', 'for', 'i...",positive emotion,apple
3,"['sxsw', 'i', 'hope', 'this', 'year', 'festiva...",negative emotion,apple
4,"['sxtxstate', 'great', 'stuff', 'on', 'fri', '...",positive emotion,google
...,...,...,...
8158,"['ipad', 'everywhere', 'sxsw', 'link']",positive emotion,apple
8159,"['wave', 'buzz', 'rt', 'mention', 'we', 'inter...",no emotion toward brand or product,google
8160,"['google', 'zeiger', 'a', 'physician', 'never'...",no emotion toward brand or product,google
8161,"['some', 'verizon', 'iphone', 'customer', 'com...",no emotion toward brand or product,apple


In [76]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint
from sklearn.preprocessing import LabelEncoder, StandardScaler

TEST_SIZE = .25
RANDOM_STATE = 42
df = pd.read_csv(
    'data/tweet_tweet.csv', 
    names=['body', 'product', 'target'],
    header=0
)
X = df.drop(['target'], axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)

In [27]:
X_train.body

AttributeError: 'Series' object has no attribute 'body'

In [77]:
from sklearn.base import BaseEstimator, TransformerMixin

class CompanyMentionDetector(BaseEstimator, TransformerMixin):
    # List of features in 'feature_names' and the 'power' of the exponent transformation
    def __init__(self, feature_names=None):
        '''
        Scan body of tweet for keywords:
            0. copy for safety
            1. convert to str
            2. lowercase it all
            3. RegexTokenize
            4. create individual keyword columns
            5. combine columns into 'keyword' column
            6. drop indiviual keywrd columns

        '''
        if type(feature_names) != list:
            feature_names = [feature_names]
            
        self.feature_names = feature_names
        self.apple_words = ['apple', 'ipad', 'iphone', 'mac', 'ios']
        self.google_words = ['google', 'android', 'pixel']
        
        
    def fit(self, X, y=None):
        return self
    
    
    def transform(self, X, y=None):
#         return 'something'
        X_copy = X.copy()
        
        for feat in self.feature_names:
            X_copy[feat + '_string'] = X_copy[feat].astype('str')
            X_copy[feat + '_string'] = X_copy[feat + '_string'].str.lower()
            
            X_copy[feat + '_token'] = X_copy[feat + '_string']. \
            apply(RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)").tokenize)
            
            X_copy[feat + '_aapl'] = X_copy[feat + '_token'].apply(self.is_apple)
            X_copy[feat + '_goog'] = X_copy[feat + '_token'].apply(self.is_google)
            X_copy['keyword'] = X_copy[feat + '_aapl']
            X_copy['keyword'] = X_copy['keyword']. \
            combine_first(X_copy[feat + '_goog'])
            
            X_copy.drop(inplace=True, 
                        columns=[
                            feat + '_string',
                            feat + '_token', 
                            feat + '_aapl',
                            feat + '_goog'
            ])
            
            if 'product' in X_copy: X_copy.drop(inplace=True, columns='product')
                
        return X_copy


    def is_apple(self, tweet_text):
        for keyword in self.apple_words:
                if keyword.lower() in tweet_text:
                    return 'apple'
                else:
                    continue


    def is_google(self, tweet_text):
        for keyword in self.google_words:
                if keyword.lower() in tweet_text:
                    return 'google'
                else:
                    continue


In [78]:
class TweetPreprocessor(BaseEstimator, TransformerMixin):
    # List of features in 'feature_names' and the 'power' of the exponent transformation
    def __init__(self, feature_names=None, sw=stopwords.words('english')):
        '''
        Another custom function that:
            0. makes a copy for safety reasons
            1. makes str
            2. lowercase
            3. tokenize
            4. POS
            5. POS conversion
            6. Lemmatizes
            7. drops irrelevant columns
            7. returns "feat" + '_lemmed' column
        '''
        if type(feature_names) != list:
            feature_names = [feature_names]
        self.feature_names = feature_names
        self.sw = sw
        
        
    def fit(self, X, y=None):
        return self
    
    
    def transform(self, X, y=None):
        toker = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")
        lemming = nltk.stem.WordNetLemmatizer()
        X_copy = X.copy()
        
        for feat in self.feature_names:
            X_copy[feat + '_string'] = X_copy[feat].astype('str')
            X_copy[feat + '_string'] = X_copy[feat + '_string'].str.lower()

            X_copy[feat + '_token'] = X_copy[feat + '_string'] \
            .apply(toker.tokenize)

            X_copy[feat + '_no_safewords'] = X_copy[feat + '_token'] \
            .apply(lambda row: [word for word in row if word not in self.sw])

            X_copy[feat + '_tagged'] = X_copy[feat + '_no_safewords'] \
            .apply((lambda word: pos_tag(word)))

            X_copy[feat + '_tagged'] = X_copy[feat + '_tagged'] \
            .apply(lambda row: [(word[0], self.nltk_to_wordnet(word[1])) for word in row])

            X_copy[feat + '_lemmed'] = X_copy[feat + '_tagged'] \
            .apply(lambda row: [lemming.lemmatize(word[0], word[1]) for word in row])

            X_copy.drop(inplace=True, 
                        columns=[
                            feat,
                            feat + '_string',
                            feat + '_token', 
                            feat + '_no_safewords',
                            feat + '_tagged'
            ])

        return X_copy
    
    
    def nltk_to_wordnet(self, treebank_tag):
        '''
        Translate nltk POS to wordnet tags
        '''
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

In [79]:
cmd = CompanyMentionDetector('body')
xtcmd = cmd.transform(X_train)
xtcmd

Unnamed: 0,body,keyword
990,Attending panel &quot;better living through cl...,google
25,RT @LaurieShook: I'm looking forward to the #S...,apple
8583,Google hotpot brings Netflix-style functionali...,google
4236,4 Most Valuable Apple iPad Apps; Top Critical ...,apple
4524,Domo iPhone &amp; Android App: Share with Face...,apple
...,...,...
8244,Sitting at ihop drooling over the @mention iPh...,apple
9045,@mention you are my favorite-- thanks for comi...,apple
4255,Scepticism expressed about iPad newspapers at ...,apple
6872,RT @mention Watching a guy simultaneously use ...,apple


In [80]:
tp = TweetPreprocessor('body')
xtcmdtp = tp.transform(xtcmd)
xtcmdtp

Unnamed: 0,keyword,body_lemmed
990,google,"[attend, panel, quot, well, live, cloud, compu..."
25,apple,"[rt, laurieshook, look, forward, smcdallas, pr..."
8583,google,"[google, hotpot, bring, netflix, style, functi..."
4236,apple,"[valuable, apple, ipad, apps, top, critical, t..."
4524,apple,"[domo, iphone, amp, android, app, share, faceb..."
...,...,...
8244,apple,"[sit, ihop, drool, mention, iphone, app, every..."
9045,apple,"[mention, favorite, thanks, come, mention, get..."
4255,apple,"[scepticism, express, ipad, newspaper, sxsw, l..."
6872,apple,"[rt, mention, watch, guy, simultaneously, use,..."


In [70]:
vect = TfidfVectorizer()
corpus = []

for row in xtcmdtp.body_lemmed:
    corpus.append(' '.join(row))



xtcmdtpvect = vect.fit_transform(corpus)

# making the X_train into a sparse matrix
X_sparse = pd.DataFrame.sparse.from_spmatrix(xtcmdtpvect)
X_sparse.columns = sorted(vect.vocabulary_)
X_sparse.set_index(y_train.index, inplace=True)

# making the x_test into a sparse matrix
xtestvec = vect.transform(X_test)
X_test_sparse = pd.DataFrame.sparse.from_spmatrix(xtestvec)
X_test_sparse.columns = sorted(vect.vocabulary_)
X_test_sparse.set_index(y_test.index, inplace=True)

# mnb = MultinomialNB()

# mnb.fit(X_sparse, y_train)
# y_hat = mnb.predict(X_test_sparse)

# precision_score(y_test, y_hat)
# accuracy_score(y_test, y_hat)

ValueError: Length mismatch: Expected 2 rows, received array of length 2274

In [84]:
body_prepper = Pipeline([
    ('cmd', CompanyMentionDetector('body')),
    ('tpp', TweetPreprocessor('body'))
])
target_prepper = Pipeline([
    ('label_me', LabelEncoder()),
    ('sclaer', StandardScaler())
])
prepper = ColumnTransformer([
    ('body', body_prepper, 'body')#,
#     ('target', target_prepper, None)
])

prepper.fit_transform(X_train)

# pipe = Pipeline([
#     ('prepper', prepper),
#     ('model', MultinomialNB())
# ])

# pipe.fit_transform(X_train)

NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [44]:
 cust_pipe = Pipeline([('aapl_or_goog', CompanyMentionDetector())])



# cleaning_pipe = Pipeline(steps=[
#     ('aapl_or_goog', CompanyMentionDetector(), 'body'),
#     ('label', LabelEncoder(), 'target'), 
#     ('toke', RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)"), 'body'),
#     ('model', LogisticRegression())
# ])

tweet_pipe = Pipeline([
    ('toke', RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")),
    
])

preprocessor = ColumnTransformer([
    ('cust', CompanyMentionDetector(), 'body'),
    ('toke', tweet_pipe, 'body'),
    ('sent', sentiment_pipe, 'target')
])

# fit the pipe with transformers and basic model
pipe = Pipeline([
    ('prepper', preprocessor()),
    ('model', LogisticRegression())
])

pipe.fit(X_train, y_train)


NameError: name 'sentiment_pipe' is not defined

In [97]:
cmd = CompanyMentionDetector('body')
xtcmd = cmd.transform(X_train)
dubs = xtcmd.loc[(xtcmd.body_aapl == 'apple') & (xtcmd.body_goog == 'google')]
dubs

Unnamed: 0,body,product,body_string,body_token,body_aapl,body_goog,company
4524,Domo iPhone &amp; Android App: Share with Face...,iPad or iPhone App,domo iphone &amp; android app: share with face...,"[domo, iphone, amp, android, app, share, with,...",apple,google,apple
4022,#playhopskoch is in the apple app store (as we...,iPad or iPhone App,#playhopskoch is in the apple app store (as we...,"[playhopskoch, is, in, the, apple, app, store,...",apple,google,apple
2743,The Google Tv to IPad App: the connected tv ex...,iPad or iPhone App,the google tv to ipad app: the connected tv ex...,"[the, google, tv, to, ipad, app, the, connecte...",apple,google,apple
377,HootSuite blog ‰ЫТ Social Media Dashboard еИ H...,,hootsuite blog ‰ыт social media dashboard еи h...,"[hootsuite, blog, social, media, dashboard, ho...",apple,google,apple
3854,"#SXSW GO is available on 5 platforms - iPhone,...",,"#sxsw go is available on 5 platforms - iphone,...","[sxsw, go, is, available, on, platforms, iphon...",apple,google,apple
...,...,...,...,...,...,...,...
5127,RT @mention @mention has their Google Analytic...,,rt @mention @mention has their google analytic...,"[rt, mention, mention, has, their, google, ana...",apple,google,apple
3932,One Day Without Shoes (Thoms Shoes) New App fo...,,one day without shoes (thoms shoes) new app fo...,"[one, day, without, shoes, thoms, shoes, new, ...",apple,google,apple
2699,Every game or app ad in the #SXSW Interactive ...,,every game or app ad in the #sxsw interactive ...,"[every, game, or, app, ad, in, the, sxsw, inte...",apple,google,apple
894,HootSuite Mobile for #SXSW ~ Updates for iPhon...,,hootsuite mobile for #sxsw ~ updates for iphon...,"[hootsuite, mobile, for, sxsw, updates, for, i...",apple,google,apple


In [74]:
c = 'corey'
[c]

['corey']

In [73]:
apple_filter = binary_df['company'] == 'apple'
pos_filter = binary_df['target'] == 'positive emotion'

apple_pos_df = binary_df[apple_filter & pos_filter]
apple_pos_df

NameError: name 'binary_df' is not defined