In [9]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer, make_column_transformer
from nltk.corpus import stopwords
from sklearn.base import BaseEstimator, TransformerMixin
import pickle
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from tqdm import tqdm

### Pipeline 

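1. Clean each headline with regex, lower-case it, and stem the tokens
2. Apply `StandardScaler` to `Volume` and `DayDiff`
3. Apply `CountVectorizer` to `Headlines`
4. Add a KMeans cluster label (`KCluster`) computed from the combined features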

In [10]:
# Identifier for this model variant; it is passed further down as the pickle-name suffix.
model_num = 3
# Load the grouped SPY headlines table (headline text, price/volume columns and the Target label).
df = pd.read_csv('../FData/Headlines/New/SPYHeadlinesGrouped1.csv')
# Preview the first two rows.
df.head(2)

Unnamed: 0,Time,Headlines,1. open,2. high,3. low,4. close,Volume,CloseDiff,CloseDiffNew,Target,DayDiff
0,2017-12-19,House prices to fall in London and south-east ...,268.48,268.53,267.09,267.17,82382876.0,-1.03,-0.14,0,-1.31
1,2017-12-20,Hedge funds fail to stop 'billion-dollar brain...,268.27,268.33,266.69,267.03,76751500.0,-0.14,0.55,1,-1.24


In [11]:
#PIPELINE FUNCTIONS
def preprocess_headline(headline):
    """Normalise one raw headline string into space-separated stemmed tokens.

    Steps, in order:
      1. lower-case the text and drop the token 'rt' plus any token that
         ends in '…';
      2. replace @mentions, URLs and every character that is not an ASCII
         letter, digit, space or tab with a space, then collapse whitespace;
      3. tokenise into alphabetic words (digits are discarded at this step);
      4. drop one-character tokens and Porter-stem the rest.

    Parameters
    ----------
    headline : str
        Raw headline text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (may be an empty string).
    """
    # Lower-case once up front; every later step operates on the lowered text.
    kept_words = [word for word in headline.lower().split()
                  if word != 'rt' and not word.endswith('…')]
    text = ' '.join(kept_words)

    # Raw string: "\w", "\/" and "\S" are not valid Python string escapes and
    # raise an invalid-escape warning when written in a normal literal.
    noise_pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"
    text = ' '.join(re.sub(noise_pattern, " ", text).split())

    tokens = RegexpTokenizer("([a-zA-Z&]+(?:'[a-z]+)?)").tokenize(text)

    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in tokens if len(token) > 1)

def new_headline(df):
    """Overwrite ``df['Headlines']`` with its preprocessed text and return ``df``.

    The frame passed in is modified in place; the returned object is that
    same frame, not a copy.
    """
    cleaned = df['Headlines'].map(preprocess_headline)
    df['Headlines'] = cleaned
    return df

# Clean the Headlines column. new_headline rewrites the column on the frame it
# is given, so re-running this cell preprocesses already-preprocessed text.
df = new_headline(df)

In [12]:
# Model inputs: the headline text plus the two numeric columns. The label is
# kept as a one-column frame so `.Target` is still available after the split.
X = df[['Headlines', 'Volume', 'DayDiff']]
Y = df[['Target']]

# 70/30 split, stratified on the label, with a fixed seed.
# NOTE(review): the table has a Time column and this split is not
# chronological — confirm that mixing dates across train/test is intended.
split_options = dict(train_size = .70, random_state = 10, stratify = Y.Target.values)
x_train, x_test, y_train, y_test = train_test_split(X, Y, **split_options)

# Report the size and class balance of each split.
print(f'Train:\t{len(x_train)}\n{y_train.Target.value_counts()}\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print(f'Test:\t{len(x_test)}\n{y_test.Target.value_counts()}\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

Train:	451
1    254
0    197
Name: Target, dtype: int64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Test:	194
1    109
0     85
Name: Target, dtype: int64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


In [13]:
def preprocess_steps(preprocessing_dict, x_train, x_test, y_train, y_test, pick_name = None,
                     n_clusters = 3, random_state = 10):
    """Build the model-ready train/test frames and fit the KMeans cluster feature.

    Each output frame has a fresh 0..n-1 index and the columns, in order:
    scaled 'Volume', scaled 'DayDiff', one count column per vocabulary term
    of the fitted vectorizer, and 'KCluster' (the KMeans label).

    Parameters
    ----------
    preprocessing_dict : dict
        Must hold already-fitted transformers under 'ss_volume', 'ss_daydiff'
        and 'headlines'. It is modified in place: the fitted KMeans model is
        stored under 'k_cluster'.
    x_train, x_test : pandas.DataFrame
        Frames with 'Volume', 'DayDiff' and 'Headlines' columns.
    y_train, y_test
        Labels; passed through (and pickled) unchanged.
    pick_name : optional
        When truthy, the result tuple is pickled to
        '../Pickles/TTS_{pick_name}.p'. Falsy values (None, '', 0) skip it.
    n_clusters : int, default 3
        Number of KMeans clusters.
    random_state : int or None, default 10
        Seed for KMeans so the cluster labels are reproducible between runs;
        pass None for unseeded initialisation.

    Returns
    -------
    tuple
        (x_train_new, x_test_new, y_train, y_test, preprocessing_dict)
    """
    volume_scaler = preprocessing_dict['ss_volume']
    daydiff_scaler = preprocessing_dict['ss_daydiff']
    cv_vec = preprocessing_dict['headlines']

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back so older installs keep working.
    if hasattr(cv_vec, 'get_feature_names_out'):
        vocabulary = list(cv_vec.get_feature_names_out())
    else:
        vocabulary = list(cv_vec.get_feature_names())

    def _assemble(raw):
        """Return scaled Volume/DayDiff followed by the headline term counts."""
        # ravel() both scaler outputs so each column is assigned a 1-D array.
        numeric = pd.DataFrame({
            'Volume': volume_scaler.transform(raw.Volume.values.reshape(-1,1)).ravel(),
            'DayDiff': daydiff_scaler.transform(raw.DayDiff.values.reshape(-1,1)).ravel(),
        })
        counts = pd.DataFrame(cv_vec.transform(raw['Headlines']).toarray(), columns = vocabulary)
        return pd.concat([numeric, counts], axis = 1)

    x_train_new = _assemble(x_train)
    x_test_new = _assemble(x_test)

    # Fit on the training matrix only; the test split is only ever predicted.
    kmeans = KMeans(n_clusters = n_clusters, max_iter = 1000, tol = 1e-3,
                    random_state = random_state).fit(x_train_new.values)
    preprocessing_dict['k_cluster'] = kmeans

    # Compute both label vectors before adding the column, so the matrices
    # handed to predict() have exactly the columns the model was fitted on.
    train_clusters = kmeans.predict(x_train_new.values)
    test_clusters = kmeans.predict(x_test_new.values)
    x_train_new['KCluster'] = train_clusters
    x_test_new['KCluster'] = test_clusters

    if pick_name:
        tts = (x_train_new, x_test_new, y_train, y_test, preprocessing_dict)
        # Context manager so the file handle is closed even if dump() fails.
        with open(f'../Pickles/TTS_{pick_name}.p', 'wb') as pickle_file:
            pickle.dump(tts, pickle_file)

    return x_train_new, x_test_new, y_train, y_test, preprocessing_dict


def get_preprocessing_pickles(pick_name, x_train, x_test, y_train, y_test):
    """Fit the preprocessing objects on the training split and transform both splits.

    Two StandardScalers (for 'Volume' and 'DayDiff') and a CountVectorizer
    (for 'Headlines') are fitted on ``x_train`` only, collected in a dict and
    handed to ``preprocess_steps`` together with ``pick_name``.

    Returns the five values produced by ``preprocess_steps``:
    (x_train_new, x_test_new, y_train, y_test, preprocessing_dict).
    """
    def _as_column(series):
        # The scalers expect a 2-D (n, 1) array.
        return series.values.reshape(-1,1)

    fitted = {
        'ss_volume': StandardScaler().fit(_as_column(x_train['Volume'])),
        'ss_daydiff': StandardScaler().fit(_as_column(x_train['DayDiff'])),
        'headlines': CountVectorizer(
            stop_words = 'english',
            max_features = 5000,
            ngram_range = (1,1),
            min_df = 1,
        ).fit(x_train['Headlines']),
    }

    outputs = preprocess_steps(fitted, x_train, x_test, y_train, y_test, pick_name = pick_name)
    x_train_new, x_test_new, y_train, y_test, preprocessing_dict = outputs

    return x_train_new, x_test_new, y_train, y_test, preprocessing_dict
    


# Fit the preprocessing objects on the training split and build both feature
# frames. model_num is passed as pick_name, so the result is also pickled.
x_train_new, x_test_new, y_train, y_test, preprocessing_dict = get_preprocessing_pickles(model_num, x_train, x_test, y_train, y_test)

In [6]:
# Always raises AssertionError, which halts a "Run All" at this cell.
# NOTE(review): presumably a deliberate stop before the exploratory cells
# below — confirm before removing.
assert False

AssertionError: 

In [None]:
# Elbow diagnostic: fit KMeans for k = 1..14 and record the inertia of each fit.
ssd = []
K = range(1,15)

# x_train_new already carries the 'KCluster' label column added during
# preprocessing; leave it out so the curve is computed on the same feature
# matrix the clusterer was fitted on, not on features plus its own output.
elbow_features = x_train_new.drop(columns = ['KCluster'], errors = 'ignore').values

for k in tqdm(K):
    # Fixed seed so the curve is reproducible between runs.
    km = KMeans(n_clusters = k, max_iter = 1000, tol = 1e-2, random_state = 10)
    km = km.fit(elbow_features)
    ssd.append(km.inertia_)

plt.figure(figsize = (15,5))
plt.plot(K, ssd, 'bx-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Sum of squared distances (inertia)')
plt.title('KMeans elbow plot')
plt.show()

In [None]:
def preprocess_steps(preprocessing_dict, x_train, x_test, y_train, y_test, pick_name = None):
    """Apply already-fitted preprocessing, including a stored KMeans, to both splits.

    This definition shadows the earlier ``preprocess_steps`` in this notebook.
    It does not fit anything: the KMeans model is read from
    ``preprocessing_dict['k_cluster']``, and four values are returned (no
    dict).

    Each output frame has a fresh 0..n-1 index and the columns, in order:
    scaled 'Volume', scaled 'DayDiff', one count column per vocabulary term
    of the fitted vectorizer, and 'KCluster' (the KMeans label).

    Parameters
    ----------
    preprocessing_dict : dict
        Fitted objects under 'ss_volume', 'ss_daydiff', 'headlines' and
        'k_cluster'. A missing key raises KeyError.
    x_train, x_test : pandas.DataFrame
        Frames with 'Volume', 'DayDiff' and 'Headlines' columns.
    y_train, y_test
        Labels; passed through (and pickled) unchanged.
    pick_name : optional
        When truthy, the result tuple is pickled to
        'Pickles/TTS_{pick_name}.p'. Falsy values (None, '', 0) skip it.

    Returns
    -------
    tuple
        (x_train_new, x_test_new, y_train, y_test)
    """
    volume_scaler = preprocessing_dict['ss_volume']
    daydiff_scaler = preprocessing_dict['ss_daydiff']
    cv_vec = preprocessing_dict['headlines']
    kmeans = preprocessing_dict['k_cluster']

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back so older installs keep working.
    if hasattr(cv_vec, 'get_feature_names_out'):
        vocabulary = list(cv_vec.get_feature_names_out())
    else:
        vocabulary = list(cv_vec.get_feature_names())

    def _assemble(raw):
        """Return scaled Volume/DayDiff followed by the headline term counts."""
        # ravel() both scaler outputs so each column is assigned a 1-D array.
        numeric = pd.DataFrame({
            'Volume': volume_scaler.transform(raw.Volume.values.reshape(-1,1)).ravel(),
            'DayDiff': daydiff_scaler.transform(raw.DayDiff.values.reshape(-1,1)).ravel(),
        })
        counts = pd.DataFrame(cv_vec.transform(raw['Headlines']).toarray(), columns = vocabulary)
        return pd.concat([numeric, counts], axis = 1)

    x_train_new = _assemble(x_train)
    x_test_new = _assemble(x_test)

    # predict() returns one cluster label per row. transform() would return an
    # (n_rows, n_clusters) distance matrix, which cannot be stored in a single
    # column. Plain arrays are passed, matching the earlier definition, which
    # fits the model on `.values`.
    x_test_new['KCluster'] = kmeans.predict(x_test_new.values)
    x_train_new['KCluster'] = kmeans.predict(x_train_new.values)

    if pick_name:
        tts = (x_train_new, x_test_new, y_train, y_test)
        # NOTE(review): the earlier definition writes to '../Pickles/'; this
        # one writes to 'Pickles/' relative to the working directory —
        # confirm which location is intended.
        with open(f'Pickles/TTS_{pick_name}.p', 'wb') as pickle_file:
            pickle.dump(tts, pickle_file)
    return x_train_new, x_test_new, y_train, y_test

# Re-apply the fitted preprocessing objects held in preprocessing_dict and
# pickle the result under model_num.
# NOTE(review): this rebinds x_train/x_test to the transformed frames, so the
# raw split is no longer available afterwards and re-running the cell would
# feed transformed frames back in — confirm this is intended.
x_train, x_test, y_train, y_test = preprocess_steps(preprocessing_dict, x_train, x_test, y_train, y_test, pick_name = model_num)