# Stock Market Prediction - Starter Kernel


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error # wouldn't recommend since we're not being evaluated on MSE

### Importing Dataframes

In [None]:
from kaggle.competitions import twosigmanews
# You can only call make_env() once, so don't lose it!
# `env` is reused further down to fetch the training data, to iterate over the
# prediction days and to write the submission file.
env = twosigmanews.make_env()
print('Done!')

In [None]:
# Fetch the two training frames (market data and news data) from the environment.
(market_train_df, news_train_df) = env.get_training_data()

In [None]:
# decide the length of the dataset (you can just get a handful if you just need it for testing)
# DataFrame.sample raises ValueError when asked for more rows than the frame
# holds (it samples without replacement), so cap the request at the row count.
market_train_df = market_train_df.sample(min(1_000_000, len(market_train_df)))

In [None]:
# Preview the first rows of the (sampled) market frame.
market_train_df.head()

In [None]:
# Preview the first rows of the news frame.
news_train_df.head()

In [None]:
# Correlation of the numeric/boolean news columns with 'relevance'.
# The columns are selected explicitly because DataFrame.corr() stopped
# dropping non-numeric columns on its own in pandas 2.0 and raises on them.
news_train_df.select_dtypes(include=['number', 'bool']).corr()['relevance'].head(10)

In [None]:
# Correlation of the numeric/boolean market columns with the prediction target.
# The columns are selected explicitly because DataFrame.corr() stopped
# dropping non-numeric columns on its own in pandas 2.0 and raises on them.
market_train_df.select_dtypes(include=['number', 'bool']).corr()['returnsOpenNextMktres10'].head(10)

### Text Processing (WIP)

In [None]:
def get_headline(target, input_df):
    """Turn a column of headlines into a bag-of-words count matrix.

    Parameters
    ----------
    target
        Not used by this function; kept so existing callers keep working.
    input_df
        A pandas Series with one headline per row (a single column, not a
        whole DataFrame). Missing entries are treated as empty headlines.

    Returns
    -------
    The document-term count matrix produced by
    ``CountVectorizer.fit_transform``, with one row per headline.
    """
    # Pull all values out in one call instead of appending row by row through
    # .iloc. Missing headlines become empty strings because the vectorizer
    # expects text documents and fails on NaN/None.
    headlines_lst = ['' if pd.isna(text) else str(text)
                     for text in input_df.tolist()]

    # split headlines into separate words and count them
    basicvectorizer = CountVectorizer()
    headlines_vectored = basicvectorizer.fit_transform(headlines_lst)

    return headlines_vectored

def headline_modelling(target, headlines_vectored):
    """Fit a random-forest regressor on vectorised headlines.

    Parameters
    ----------
    target
        Values the regressor is trained to predict, one per headline row.
    headlines_vectored
        Feature matrix for the headlines (e.g. the output of get_headline).

    Returns
    -------
    The fitted ``RandomForestRegressor``.
    """
    # A random forest with default settings; fit() hands back the fitted
    # estimator itself, so it can be returned directly.
    forest = RandomForestRegressor()
    return forest.fit(headlines_vectored, target)

### One Hot Encoding 

Note: Will use after processing the string data.

In [None]:
def one_hot_encoder(data, nan_as_category=True):
    """One-hot encode every non-numeric column of `data`, modifying it in place.

    For each non-numeric column ``c`` and each distinct value ``v`` in it, a
    uint8 indicator column named ``"<c>_<v>"`` is added; the source columns
    are then dropped.

    Parameters
    ----------
    data
        DataFrame to encode. It is mutated in place (and also returned).
    nan_as_category
        When True, missing values are replaced by the string ``'NaN'`` and
        get their own indicator column. When False, missing values get no
        indicator column and are 0 in every indicator of that source column.

    Returns
    -------
    tuple
        ``(data, new_columns)`` where ``new_columns`` lists the names of the
        indicator columns that were added.
    """
    original_columns = list(data.columns)
    categorical_columns = [col for col in data.columns
                           if not pd.api.types.is_numeric_dtype(data[col].dtype)]  # ignore error in eclipse
    for c in categorical_columns:
        if nan_as_category:
            # Assign the filled column back: a chained
            # `data[c].fillna(..., inplace=True)` only updates a temporary
            # under pandas Copy-on-Write and leaves `data` untouched.
            data[c] = data[c].fillna('NaN')
        # dropna() keeps a remaining NaN from producing an indicator column:
        # `data[c] == NaN` is False everywhere, so it would be all zeros.
        for v in data[c].dropna().unique():
            data[str(c) + '_' + str(v)] = (data[c] == v).astype(np.uint8)
    data.drop(categorical_columns, axis=1, inplace=True)
    new_columns = [c for c in data.columns if c not in original_columns]
    return data, new_columns

In [None]:
# one_hot_encoder(market_train_df)
# one_hot_encoder(news_train_df)

### Merging dataframes

Will use later when working on both dataframes.

### Using Sklearn for Modelling

In [None]:
# prepare x dataframes for modelling/prediction
def convert_to_X(df):
    """Build the model's feature frame from a market DataFrame.

    Keeps every integer or floating-point column except the target
    ('returnsOpenNextMktres10') and 'universe', and replaces missing values
    with 0. The input frame is not modified.

    Parameters
    ----------
    df
        DataFrame holding the raw columns; the two excluded columns may or
        may not be present.

    Returns
    -------
    A new DataFrame containing only the selected feature columns.
    """
    excluded = ('returnsOpenNextMktres10', 'universe')
    # Test the dtype kind rather than comparing against the strings
    # 'int'/'float': those only match the platform default widths, so
    # int32/float32 columns would be silently left out of the features.
    feature_columns = [
        f for f in df.columns
        if f not in excluded
        and (pd.api.types.is_integer_dtype(df[f].dtype)
             or pd.api.types.is_float_dtype(df[f].dtype))
    ]
    return df[feature_columns].fillna(0)

In [None]:
# target: the column the model is trained to predict
y_train = market_train_df['returnsOpenNextMktres10']
# features: the columns selected by convert_to_X (target and 'universe' excluded, NaN filled with 0)
X_train = convert_to_X(market_train_df)

In [None]:
%%time

# set model and parameters
# (no arguments are passed, so the estimator's defaults are used)
model = DecisionTreeRegressor()

# fit on the training set, then predict on that same training set;
# the error of these in-sample predictions is printed in the next cell
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)

In [None]:
# Root-mean-squared error on the training rows themselves (the model was
# fitted on the same X_train), so this is an in-sample figure.
print(np.sqrt(mean_squared_error(y_train,y_train_pred)))

In [None]:
def set_ceiling(item):
    """Clamp a single predicted value into the range [-1, 1].

    Parameters
    ----------
    item
        A number, or None / NaN for a missing prediction.

    Returns
    -------
    -1 if `item` is below -1, 1 if it is above 1, 0 if it is missing
    (None or NaN), otherwise `item` unchanged.
    """
    # Missing values must be handled before any ordering comparison:
    # `None < -1` raises TypeError, and every comparison with NaN is False
    # (NaN is the only value that is not equal to itself).
    if item is None or item != item:
        return 0
    if item < -1:
        return -1
    if item > 1:
        return 1
    return item

def make_predictions(market_obs_df, news_obs_df):
    """Return one clamped prediction per row of `market_obs_df`.

    The module-level `model` does the predicting. `news_obs_df` is accepted
    for signature compatibility but is not used.

    Returns
    -------
    list
        The model outputs, each passed through `set_ceiling`.
    """
    features = convert_to_X(market_obs_df)
    raw_scores = model.predict(features).tolist()
    return list(map(set_ceiling, raw_scores))

### Making Predictions

The test data differs from the training data in two columns, ``['returnsOpenNextMktres10', 'universe']``. We predict ``returnsOpenNextMktres10`` and submit that prediction as the ``confidenceValue``.

In [None]:
# Each iteration yields that day's market frame, news frame and a template
# frame whose 'confidenceValue' column is filled in and handed back.
for (market_obs_df, news_obs_df, predictions_template_df) in env.get_prediction_days(): # Looping over days from start of 2017 to 2019-07-15
    
    # make predictions (only the market frame is used by make_predictions)
    predictions_template_df['confidenceValue'] = make_predictions(market_obs_df, news_obs_df)
    
    # save predictions
    env.predict(predictions_template_df)


### Export Submission

In [None]:
# NOTE(review): presumably this has to run after the prediction loop has
# submitted every day via env.predict — confirm against the competition API.
env.write_submission_file() # Writes your submission file

### Sources:
* [Getting Started - DJ Sterling](https://www.kaggle.com/dster/two-sigma-news-official-getting-started-kernel)
* [Bare bones script - William Cukierski](https://www.kaggle.com/wcukierski/bare-bones-script-loop-with-comments)
* [Extra data - aaron7sun](https://www.kaggle.com/aaron7sun/stocknews)
* [Text Preprocessing - Andrew Gelé](https://www.kaggle.com/ndrewgele/omg-nlp-with-the-djia-and-reddit)
* [fake news - SamLloyd](https://www.kaggle.com/sjdlloyd/it-s-fake-news-this-is-top-of-the-leaderboard)