In [38]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer, make_column_transformer
from nltk.corpus import stopwords
from sklearn.base import BaseEstimator, TransformerMixin

In [39]:
df = pd.read_csv('FData/SPYHeadGrouped.csv')
df.head(2)

Unnamed: 0,Time,Headlines,Volume,DayDiff,Target
0,2020-07-16,Canary Wharf traders and landlord bank on retu...,54433414.0,1.0,1
1,2020-07-15,'Incredible' Boohoo denying knowledge of facto...,86921534.0,-0.56,0


In [40]:
X = df[['Headlines', 'Volume', 'DayDiff']]
Y = df[['Target']]

x_train, x_test, y_train, y_test = train_test_split(X,Y, stratify = Y.Target.values, random_state = 10, train_size = .70)

print(f'Train:\t{len(x_train)}\n{y_train.Target.value_counts()}\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print(f'Test:\t{len(x_test)}\n{y_test.Target.value_counts()}\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

Train:	452
0    257
1    195
Name: Target, dtype: int64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Test:	194
0    110
1     84
Name: Target, dtype: int64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


In [41]:
#PIPELINE FUNCTIONS
def preprocess_headline(headline): 
    reg_token = RegexpTokenizer("([a-zA-Z&]+(?:'[a-z]+)?)")

    new_headline = ' '.join([i for i in headline.lower().split() if i != 'rt' and i.endswith('…') == False])
    new_headline  = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",new_headline .lower()).split())
    new_headline  = reg_token.tokenize(new_headline .lower())
    
#     word_stem = PorterStemmer()
#     word_lem = WordNetLemmatizer()
#     new_tweet= ' '.join([word_lem.lemmatize(i) for i in new_tweet])
#     new_tweet= ' '.join([word_stem.stem(i) for i in new_tweet if len(i) > 1])

#     print(f'{tweet}\n')
#     print(f'{new_tweet}\n')
#     print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
#     return new_tweet
    
    return ' '.join(new_headline)

def new_headline(df): 
    df['Headlines'] = df.Headlines.map(preprocess_headline)
    return df[['Headlines']]

In [48]:
headline_transformer = FunctionTransformer(new_headline)
stop = stopwords
count_vec = CountVectorizer(stop_words = 'english', max_features = 5000, ngram_range = (1,3)).fit(x_train)
ss_transformer = ColumnTransformer(transformers = [('ss', StandardScaler(), ['Volume', 'DayDiff'])], 
                                   n_jobs = -1)
cv_transformer = make_column_transformer((count_vec, ['Headlines']))



pipe = Pipeline([('feats', FeatureUnion([('cv', cv_transformer), ('ss', ss_transformer)]))])



test = pipe.fit_transform(x_train)

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1 and the array at index 1 has size 452

In [49]:
test

array([['ryanair demands same tax holiday amid flybe rescue deal backlash senate approves revamped north american trade deal blackrock gets praise for coal divestment what it really needs is regulation business live doubts linger over us china trade deal but dow hits record high as it happened betfred owners make millions from company treating gambling addicts lost in showbiz do not adjust your set gwyneth paltrow is spreading goop all over tv how britain got the gambling bug it s a war between technology and a donkey how ai is shaking up hollywood amazon plans bn investment in india despite trader backlash the case for truly taking back control by reversing the privatisation of our cities halfords rides bicycle sales boom race and money black owned firms are twice as likely to be rejected for loans is this discrimination how carillion collapse stymied two state of the art hospitals uk electric van maker arrival secures m from kia and hyundai stockton on tees rues bloody online as debe

In [None]:
assert False

In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV

class TextExtractor(BaseEstimator, TransformerMixin):
    """Adapted from code by @zacstewart
       https://github.com/zacstewart/kaggle_seeclickfix/blob/master/estimator.py
       Also see Zac Stewart's excellent blogpost on pipelines:
       http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
       """

    def __init__(self, column_name):
        self.column_name = column_name

    def transform(self, df):
        # Select the relevant column and return it as a numpy array.
        # Set the array type to be string.
        return np.asarray(df[self.column_name]).astype(str)

    def fit(self, *_):
        return self

class Apply(BaseEstimator, TransformerMixin):
    """Takes in a function and applies it element-wise to every element in the numpy array it's supplied with."""

    def __init__(self, fn):
        self.fn = np.vectorize(fn)

    def transform(self, data):
        # Note: reshaping is necessary because otherwise sklearn
        # interprets the 1-d array as a single sample.
        return self.fn(data.reshape(data.size, 1))

    def fit(self, *_):
        return self

class AverageWordLengthExtractor(BaseEstimator, TransformerMixin):
    """Takes in dataframe, extracts last name column, outputs average word length"""

    def __init__(self):
        pass

    def average_word_length(self, name):
        """Helper code to compute average word length of a name"""
        return np.mean([len(word) for word in name.split()])

    def transform(self, df, y=None):
        """The workhorse of this feature extractor"""
        return df['LAST_NAME'].apply(self.average_word_length)

    def fit(self, df, y=None):
        """Returns self unless something different happens in train and test"""
        return self

# Let's pick the same random 10% of the data to train with.
random.seed(1965)
train_test_set = df.loc[random.sample(list(df.index.values), int(len(df) / 10))]

# X = train_test_set[['road_name', 'has_malay_road_tag']]
X = train_test_set[['LAST_NAME']]
y = train_test_set['RACE']

vect = CountVectorizer(ngram_range=(1,4), analyzer='char')
clf = LinearSVC()

pipeline = Pipeline([
    ('name_extractor', TextExtractor('LAST_NAME')),    # Extract names from df.
    ('text_features', FeatureUnion([
        ('vect', vect),    # Extract ngrams from names.
        ('num_words', Apply(lambda s: len(s.split()))),    # Number of words.
        ('ave_word_length', Apply(lambda s: np.mean([len(w) for w in s.split()]))), # Average word length.
    ])),
    ('clf' , clf),     # Feed the output through a classifier.
])

def run_experiment(X, y, pipeline, num_expts=100):
    scores = list()
    for i in range(num_expts):
        X_train, X_test, y_train, y_true = train_test_split(X, y)
        model = pipeline.fit(X_train, y_train)  # Train the classifier.
        y_test = model.predict(X_test)          # Apply the model to the test data.
        score = accuracy_score(y_test, y_true)  # Compare the results to the gold standard.
        scores.append(score)

    print(sum(scores) / num_expts)

# Run x times (num_expts) and get the average accuracy.
run_experiment(X, y, pipeline, 100)

In [None]:
print(test)

In [None]:
assert False

In [None]:
# test = df.iloc[1].Headlines
# print(test)

def preprocess_headline(headline): 
    reg_token = RegexpTokenizer("([a-zA-Z&]+(?:'[a-z]+)?)")

    new_headline = ' '.join([i for i in headline.lower().split() if i != 'rt' and i.endswith('…') == False])
    new_headline  = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",new_headline .lower()).split())
    new_headline  = reg_token.tokenize(new_headline .lower())
    
#     word_stem = PorterStemmer()
#     word_lem = WordNetLemmatizer()
#     new_tweet= ' '.join([word_lem.lemmatize(i) for i in new_tweet])
#     new_tweet= ' '.join([word_stem.stem(i) for i in new_tweet if len(i) > 1])

#     print(f'{tweet}\n')
#     print(f'{new_tweet}\n')
#     print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
#     return new_tweet
    
    return ' '.join(new_headline)
    
# new_test= preprocess_headline(test)
df['NewHeadline'] = df.Headlines.map(preprocess_headline)

In [None]:
df

## Train Test Split & Count Vectorizer

In [None]:
X = df['NewHeadline', 'Volume', 'DayDiff']