In [None]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer, make_column_transformer
from nltk.corpus import stopwords
from sklearn.base import BaseEstimator, TransformerMixin
import pickle

### Pipeline

1. Clean each headline with regex filters and convert it to lower case
2. Apply a Standard Scaler to Volume and DayDiff
3. Apply a Count Vectorizer to Headlines

In [None]:
# Numeric tag that later cells interpolate into the pickle file names they write.
model_num = 2
# The path is relative to the notebook's working directory.
df = pd.read_csv('FData/SPYHeadGrouped.csv')
# Peek at the first two rows to check the load.
df.head(2)

In [None]:
#PIPELINE FUNCTIONS
# Matches, in order of preference at each position: an @mention, any single
# character that is not an ASCII letter/digit/space/tab, or a URL (scheme://...).
_HEADLINE_NOISE_RE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")
# Same token pattern the NLTK RegexpTokenizer was built with; in non-gap mode
# that tokenizer returns the pattern's findall() result.
_HEADLINE_TOKEN_RE = re.compile(r"([a-zA-Z&]+(?:'[a-z]+)?)")


def preprocess_headline(headline):
    """Normalise one headline string into space-separated lowercase words.

    Steps, in order:
      1. lowercase and split on whitespace, dropping the retweet marker 'rt'
         and any word that ends with an ellipsis character (a truncated word);
      2. replace @mentions, URLs and every character that is not an ASCII
         letter, digit, space or tab with a space;
      3. keep only the alphabetic tokens (digits are discarded here).

    Parameters
    ----------
    headline : str
        Raw headline text. Any non-string value (for example NaN coming from
        an empty CSV cell) is treated as having no text.

    Returns
    -------
    str
        The cleaned words joined by single spaces; '' when nothing is left.
    """
    # Non-string cells have no .lower(); return empty text instead of raising.
    if not isinstance(headline, str):
        return ''

    kept_words = [
        word for word in headline.lower().split()
        if word != 'rt' and not word.endswith('…')
    ]
    # split()/join collapses the runs of spaces introduced by the substitution.
    stripped = ' '.join(_HEADLINE_NOISE_RE.sub(' ', ' '.join(kept_words)).split())
    tokens = _HEADLINE_TOKEN_RE.findall(stripped)

    return ' '.join(tokens)

def new_headline(df):
    """Return a copy of ``df`` whose 'Headlines' column has been cleaned.

    Each value of ``df['Headlines']`` is passed through ``preprocess_headline``.
    The input frame is left untouched, so the function can be applied to a
    slice of another frame (or re-run) without modifying the caller's data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Headlines' column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, in the same order, and the cleaned
        'Headlines' values.
    """
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(Headlines=df['Headlines'].map(preprocess_headline))

# Rebind df to the frame with cleaned Headlines; every later cell works on the cleaned text.
df = new_headline(df)

In [None]:
# Model inputs: the headline text plus the two numeric columns.
X = df[['Headlines', 'Volume', 'DayDiff']]
# Double brackets keep Y a one-column DataFrame, hence the Y.Target accesses below.
Y = df[['Target']]

# 70% of the rows go to training; the split is stratified on Target and seeded so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(X,Y, stratify = Y.Target.values, random_state = 10, train_size = .70)

# Report the row count and the Target class counts of each split.
print(f'Train:\t{len(x_train)}\n{y_train.Target.value_counts()}\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print(f'Test:\t{len(x_test)}\n{y_test.Target.value_counts()}\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

In [None]:
# For the original configuration, change max_features back to 3000.

def get_preprocessing_pickles(pick_name, x_train, x_test, y_train, y_test, new = True):
    """Fit (or reload) the scalers and the headline vectorizer.

    Parameters
    ----------
    pick_name : str or int
        Tag interpolated into the pickle path
        ``Pickles/PreprocessingDict_{pick_name}.p``.
    x_train : pandas.DataFrame
        Training features with 'Volume', 'DayDiff' and 'Headlines' columns.
        Only read when ``new`` is truthy.
    x_test, y_train, y_test
        Accepted for call-site symmetry; not used by this function.
    new : bool, default True
        When truthy, fit fresh transformers on ``x_train`` and write them to
        the pickle path (the 'Pickles' directory must already exist).
        Otherwise load the previously saved dictionary from that path.

    Returns
    -------
    dict
        Keys 'ss_volume' and 'ss_daydiff' (fitted StandardScaler objects) and
        'headlines' (fitted CountVectorizer).
    """
    pickle_path = f'Pickles/PreprocessingDict_{pick_name}.p'

    if new:
        # Everything is fitted on the training split only.
        preprocessing_dict = {
            'ss_volume': StandardScaler().fit(x_train['Volume'].values.reshape(-1,1)),
            'ss_daydiff': StandardScaler().fit(x_train['DayDiff'].values.reshape(-1,1)),
            'headlines': CountVectorizer(stop_words = 'english', max_features = 5000,
                                         ngram_range = (1,1), min_df = 2).fit(x_train['Headlines']),
        }
        # Context manager guarantees the file is flushed and closed.
        with open(pickle_path, 'wb') as handle:
            pickle.dump(preprocessing_dict, handle)
    else:
        # NOTE: unpickling can execute arbitrary code; only load files this
        # notebook produced itself.
        with open(pickle_path, 'rb') as handle:
            preprocessing_dict = pickle.load(handle)

    return preprocessing_dict

# new = True refits the transformers on x_train and writes the pickle; pass new = False to load the saved one instead.
preprocessing_dict = get_preprocessing_pickles(model_num, x_train, x_test, y_train, y_test, new = True)

In [None]:
def _vectorizer_feature_names(vectorizer):
    """Return the vectorizer's vocabulary terms in column order, on old and new scikit-learn."""
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        # Older scikit-learn releases only provide the legacy accessor.
        getter = vectorizer.get_feature_names
    return list(getter())


def preprocess_steps(preprocessing_dict, x_train, x_test, y_train, y_test, pick_name = None):
    """Turn the raw train/test splits into numeric feature frames.

    For each split the 'Volume' and 'DayDiff' columns are scaled with the
    fitted scalers, the 'Headlines' column is converted to a dense term-count
    matrix, and the pieces are placed side by side: 'Volume', 'DayDiff', then
    one column per vocabulary term.

    Parameters
    ----------
    preprocessing_dict : dict
        Fitted transformers under the keys 'ss_volume', 'ss_daydiff' and
        'headlines'.
    x_train, x_test : pandas.DataFrame
        Splits containing 'Volume', 'DayDiff' and 'Headlines' columns.
    y_train, y_test
        Targets; passed through unchanged.
    pick_name : str or int, optional
        When truthy, the resulting 4-tuple is also pickled to
        ``Pickles/TTS_{pick_name}.p`` (None, 0 or '' skip the save).

    Returns
    -------
    tuple
        ``(x_train_new, x_test_new, y_train, y_test)``. The feature frames are
        built from plain arrays, so they carry a fresh 0..n-1 index rather than
        the index of ``x_train`` / ``x_test``.
    """
    cv_vec = preprocessing_dict['headlines']
    term_columns = _vectorizer_feature_names(cv_vec)

    def _assemble(split):
        # ravel() flattens the (n, 1) scaler output into a plain column.
        scaled = pd.DataFrame({
            'Volume': preprocessing_dict['ss_volume'].transform(
                split['Volume'].values.reshape(-1,1)).ravel(),
            'DayDiff': preprocessing_dict['ss_daydiff'].transform(
                split['DayDiff'].values.reshape(-1,1)).ravel(),
        })
        counts = pd.DataFrame(cv_vec.transform(split['Headlines']).toarray(),
                              columns = term_columns)
        return pd.concat([scaled, counts], axis = 1)

    x_train_new = _assemble(x_train)
    x_test_new = _assemble(x_test)

    if pick_name:
        tts = (x_train_new, x_test_new, y_train, y_test)
        # Context manager guarantees the file is flushed and closed.
        with open(f'Pickles/TTS_{pick_name}.p', 'wb') as handle:
            pickle.dump(tts, handle)
    return x_train_new, x_test_new, y_train, y_test

# Build the model-ready frames; passing pick_name also pickles the 4-tuple under this model number.
tts = preprocess_steps(preprocessing_dict, x_train, x_test, y_train, y_test, pick_name = model_num)

In [None]:
# NOTE(review): presumably a deliberate stop so that "Run All" halts here — confirm before removing.
assert False

In [None]:
# Experimental: the same preprocessing expressed as a single scikit-learn Pipeline.
headline_transformer = FunctionTransformer(new_headline)
stop = stopwords
count_vec = CountVectorizer(stop_words = 'english', max_features = 5000, ngram_range = (1,3))
# A scalar column name (not a list) hands the vectorizer the 1-D text column it
# expects; this transformer was referenced below but never defined.
cv_transformer = ColumnTransformer(transformers = [('cv', count_vec, 'Headlines')])
ss_transformer = ColumnTransformer(transformers = [('ss', StandardScaler(), ['Volume', 'DayDiff'])],
                                   n_jobs = -1)

# Term counts and scaled numeric columns are computed independently and stacked side by side.
pipe = Pipeline([('feats', FeatureUnion([('cv', cv_transformer), ('ss', ss_transformer)]))])

test = pipe.fit_transform(x_train)

In [None]:
# Display the value assigned to `test` by pipe.fit_transform(x_train).
test

In [None]:
# NOTE(review): presumably a deliberate stop so that "Run All" halts here — confirm before removing.
assert False

In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV

class TextExtractor(BaseEstimator, TransformerMixin):
    """Pipeline step that pulls one DataFrame column out as an array of strings.

    Adapted from code by @zacstewart
    https://github.com/zacstewart/kaggle_seeclickfix/blob/master/estimator.py
    Also see Zac Stewart's blogpost on pipelines:
    http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
    """

    def __init__(self, column_name):
        # Name of the column that transform() will extract.
        self.column_name = column_name

    def fit(self, *_):
        # Nothing is learned from the data; any arguments are ignored.
        return self

    def transform(self, df):
        # Take the configured column, convert it to a numpy array and cast
        # every element to str.
        selected = df[self.column_name]
        return np.asarray(selected).astype(str)

class Apply(BaseEstimator, TransformerMixin):
    """Applies a function element-wise to every element of the numpy array it is given.

    Parameters
    ----------
    fn : callable
        Function of one element. It is stored exactly as passed, because
        scikit-learn's clone() (used by GridSearchCV and friends) rejects
        estimators whose __init__ replaces a constructor argument with a
        different object.
    """

    def __init__(self, fn):
        self.fn = fn

    def transform(self, data):
        # Wrap at call time so the stored parameter stays the caller's object.
        elementwise = np.vectorize(self.fn)
        # Note: reshaping is necessary because otherwise sklearn
        # interprets the 1-d array as a single sample.
        return elementwise(data.reshape(data.size, 1))

    def fit(self, *_):
        # Nothing is learned from the data; any arguments are ignored.
        return self

class AverageWordLengthExtractor(BaseEstimator, TransformerMixin):
    """Takes in a dataframe and outputs the average word length of one text column.

    Parameters
    ----------
    column_name : str, default 'LAST_NAME'
        Column whose entries are measured. The default keeps the behaviour of
        the earlier version, which always read 'LAST_NAME'.
    """

    def __init__(self, column_name='LAST_NAME'):
        self.column_name = column_name

    def average_word_length(self, name):
        """Return the mean length of the whitespace-separated words in ``name``.

        An entry with no words yields nan, since that is numpy's mean of an
        empty list.
        """
        return np.mean([len(word) for word in name.split()])

    def transform(self, df, y=None):
        """Return a Series with the average word length of each row of the configured column."""
        return df[self.column_name].apply(self.average_word_length)

    def fit(self, df, y=None):
        """Nothing is learned from the data; returns self."""
        return self

# Let's pick the same random 10% of the data to train with.
# Seeding immediately before sampling is what makes the selection repeatable.
random.seed(1965)
train_test_set = df.loc[random.sample(list(df.index.values), int(len(df) / 10))]

# NOTE(review): the other cells only use Headlines/Volume/DayDiff/Target on `df`;
# LAST_NAME and RACE are not seen there — confirm which frame this cell expects.
# X = train_test_set[['road_name', 'has_malay_road_tag']]
# This rebinds X, replacing the feature frame defined in the earlier split cell.
X = train_test_set[['LAST_NAME']]
y = train_test_set['RACE']

# Character n-grams of length 1 to 4.
vect = CountVectorizer(ngram_range=(1,4), analyzer='char')
clf = LinearSVC()

pipeline = Pipeline([
    ('name_extractor', TextExtractor('LAST_NAME')),    # Extract names from df.
    ('text_features', FeatureUnion([
        ('vect', vect),    # Extract ngrams from names.
        ('num_words', Apply(lambda s: len(s.split()))),    # Number of words.
        ('ave_word_length', Apply(lambda s: np.mean([len(w) for w in s.split()]))), # Average word length.
    ])),
    ('clf' , clf),     # Feed the output through a classifier.
])

def run_experiment(X, y, pipeline, num_expts=100):
    """Print the mean accuracy of ``pipeline`` over repeated random splits.

    Each of the ``num_expts`` rounds draws a new train/test split of ``X`` and
    ``y``, refits ``pipeline`` on the training part and scores its predictions
    on the held-out part. Nothing is returned; the average is printed.
    """
    def _one_split_accuracy():
        X_fit, X_eval, y_fit, y_gold = train_test_split(X, y)
        fitted = pipeline.fit(X_fit, y_fit)      # Train on this round's split.
        predicted = fitted.predict(X_eval)       # Predict the held-out rows.
        return accuracy_score(predicted, y_gold) # Fraction of matching labels.

    accuracies = [_one_split_accuracy() for _ in range(num_expts)]

    print(sum(accuracies) / num_expts)

# Repeat the split/fit/score cycle 100 times (num_expts) and print the average accuracy.
run_experiment(X, y, pipeline, 100)

In [None]:
# Plain-text dump of the value assigned to `test` by pipe.fit_transform(x_train).
print(test)

In [None]:
# NOTE(review): presumably a deliberate stop so that "Run All" halts here — confirm before removing.
assert False