# Machine Learning Model - First Segment Project Deliverable

## Model Plan

1. Prepare the dataframe with columns: tweet text, price previous day, price next day, price_diff
2. Preprocess the tweet text into features (countVectorier, tfidf)
    1. Classification: predict if it goes up or down (Binomial Naive Bayes)
    2. Regress the actual price difference (RandomForests, if time allows XGBoost or lightgbm)
3. Evaluate algorithms and discuss results

### 1. Prepare the dataframe with columns: tweet text, price previous day, price next day, price_diff

In [1]:
# Setting up libraries:

import requests
import pandas as pd

In [2]:
# Import data
tweets = pd.read_csv('https://raw.githubusercontent.com/angkohtenko/twitter_vs_stocks/kimberly_branch/Data/elon_tweets.csv')
tweets = tweets[['date', 'text']]

tesla_df = pd.read_csv('./Data/tesla_stocks.csv')
tesla_df['date'] = pd.to_datetime(tesla_df['date'])
tesla_df.head()

Unnamed: 0,date,open,high,low,close,adjclose,volume
0,2010-06-29,3.8,5.0,3.508,4.778,4.778,93831500
1,2010-06-30,5.158,6.084,4.66,4.766,4.766,85935500
2,2010-07-01,5.0,5.184,4.054,4.392,4.392,41094000
3,2010-07-02,4.6,4.62,3.742,3.84,3.84,25699000
4,2010-07-06,4.0,4.0,3.166,3.222,3.222,34334500


In [6]:
# Reformat Date and time types
from datetime import timedelta

tweets['prev_date'] = pd.to_datetime(tweets.date) - timedelta(days=1)
tweets['next_date'] = pd.to_datetime(tweets.date) + timedelta(days=1)

In [7]:
# Check for NaN
tweets.dropna().shape

(849, 4)

In [8]:
# Merge Dataframes 
tweets_price = pd.merge(tweets, tesla_df, how='left', left_on='prev_date', right_on='date', suffixes=('', '_prev'))
tweets_price = pd.merge(tweets_price, tesla_df, how='left', left_on='next_date', right_on='date', suffixes=('', '_next'))

In [9]:
# Rename Columns 
tweets_price = tweets_price.rename(columns={'close': 'close_prev'})
tweets_price = tweets_price[['date', 'text', 'close_prev', 'close_next']]
tweets_price['close_price_diff'] = tweets_price['close_next'] - tweets_price['close_prev']
tweets_price.dropna(inplace=True)
tweets_price.shape
tweets_price.head()

Unnamed: 0,date,text,close_prev,close_next,close_price_diff
4,2021-07-08,Maybe that movie gaslit us all,644.650024,656.950012,12.299988
5,2021-07-08,Listening to History of English podcast,644.650024,656.950012,12.299988
6,2021-07-08,Tropic Thunder is so good,644.650024,656.950012,12.299988
7,2021-07-08,Gas lit by gas light 💡,644.650024,656.950012,12.299988
8,2021-07-08,"Also, the British legal system in 2021 https:/...",644.650024,656.950012,12.299988


## A - Classification: Which tweets increase stock price vs decrease
    -Preprocess the tweet text into features (countVectorier, tfidf)
    -Classification: predict if it goes up or down (Binomial Naive Bayes)

In [10]:
# Setting up libraries for model

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [11]:
# Setting up variables
X = tweets_price.text.tolist()
y = (tweets_price['close_price_diff'] > 0).astype(int).values

In [12]:
# Separating data from training vs testing data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [13]:
# Classify text data
text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [14]:
# Testing predicted probability
predicted_proba_test = text_clf.predict_proba(X_test)[:, 1]

In [15]:
# Adding results into DataFrame
results_test = pd.DataFrame({
    'proba_positive_tweet': predicted_proba_test,
    'tweet': X_test,
    'label': y_test
}).sort_values('proba_positive_tweet', ascending=False)
pd.set_option('display.max_colwidth', None)
results_test.head()

Unnamed: 0,proba_positive_tweet,tweet,label
65,0.600099,🌍🌎🌏 Happy Earth Day 🌍🌎🌏,0
53,0.549012,.@ERCOT_ISO is not earning that R,0
14,0.539118,RT @SpaceX: The Dragon spacecraft supporting SpaceX’s 21st cargo resupply mission to the @space_station departed our Hawthorne factory last…,1
23,0.512737,Such a privilege to work with people I like &amp; respect so much. I feel blessed.,0
72,0.511028,Elevator music is underrated,1


## B. Regression - Predict the stock price difference

In [16]:
# Setting up libraries for regression model

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
text_reg = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('reg', RandomForestRegressor()),
])

In [17]:
# Setting up variables
X = tweets_price.text.tolist()
y = tweets_price['close_price_diff']

In [18]:
# Separating data from training vs testing data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [19]:
# Training
text_reg.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('reg', RandomForestRegressor())])

In [20]:
# Sklearn - Determine accuracy
text_reg.score(X_test, y_test)

-0.28381349779666865

## Multiclass classification: Positive, Neutral or Negative

In [21]:
# Setting up libraries for multiclass classification model of text data

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [22]:
#Setting up variables for classification
X = tweets_price.text.tolist()
y = pd.qcut(tweets_price['close_price_diff'], 4)
print(y.dtype.categories.tolist())
y = pd.qcut(tweets_price['close_price_diff'], 4, labels=False)

[Interval(-70.631, -16.62, closed='right'), Interval(-16.62, -2.46, closed='right'), Interval(-2.46, 15.04, closed='right'), Interval(15.04, 124.04, closed='right')]


In [23]:
# Separating data from training vs testing data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [24]:
# Training to get accuracy of guess
text_clf.fit(X_train, y_train)
text_clf.score(X_test, y_test)

0.20238095238095238

## Setting up Clusters for Analysis - kmeans

In [25]:
# Setting up imports for kmeans clusters


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [26]:
#Setting up variables for clusters
X = tweets_price.text.tolist()
km = KMeans(n_clusters=3)
km.fit(tweets_price['close_price_diff'].values.reshape(-1,1))
y = km.predict(tweets_price['close_price_diff'].values.reshape(-1,1))

In [27]:
# Separating data from training vs testing data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [28]:
# Training to get accuracy of guess
text_clf.fit(X_train, y_train)
text_clf.score(X_test, y_test)

0.5714285714285714

In [29]:
# Checking value counts for labels (0: neutral 1: positive 2: negative effect on stocks)
pd.Series(y).value_counts()

2    200
1     82
0     53
dtype: int64

In [30]:
# Viewing Cluster centers
km.cluster_centers_

array([[-55.89599667],
       [ 39.12980764],
       [ -5.32737984]])