In [1]:
import re
from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression, SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
import nltk
from nltk import wordpunct_tokenize, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
import gensim.downloader as api
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
df = pd.read_csv("Desktop\stock_data.csv")
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [3]:
df.shape

(5791, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5791 entries, 0 to 5790
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       5791 non-null   object
 1   Sentiment  5791 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.6+ KB


In [5]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [6]:
train_df.head()

Unnamed: 0,Text,Sentiment
4201,ove GNW!! 9-13 Calls are making me feel better...,1
387,"CSN option trader buys 1,500 of the Jan 11-16 ...",1
4385,people slag AAP for cannibalization but samsun...,1
5773,"Sensex opens 166 points lower at 35,469, Nifty...",-1
2348,CSOD Conf Call: CEO: feeling good about our po...,1


In [7]:
test_df.head()

Unnamed: 0,Text,Sentiment
1891,MCP take-over chatter... (I know don't laugh...),-1
1550,"AMZN 1,200 lot bid in the Feb weekly 255P. 27 ...",-1
1049,KO made with sugar is sold at local COST. Cons...,1
2523,HNZ Another American Institution sold to forei...,-1
156,Will watch the close carefully then decide whe...,-1


In [8]:
stem = SnowballStemmer("english")
lemma = WordNetLemmatizer()

In [9]:
stopwords_english = stopwords.words("english")

In [10]:
stopwords_english = set(stopwords_english)

In [11]:
stopwords_english.remove("not")

In [12]:
IS_ALPHA = re.compile("[\w!?]+")


def preprocess(text):
    text = text.replace("&", " and ")
    text = text.replace("n't", " not ")
    text = text.lower()
    text_tokens = wordpunct_tokenize(text)
    text_tokens = [lemma.lemmatize(token) for token in text_tokens if IS_ALPHA.match(token)]
    text_tokens = [token for token in text_tokens if token not in stopwords_english]
    text = " ".join(text_tokens)
    return text.strip()

In [13]:
train_df["Text_proceed"] = train_df["Text"].apply(preprocess)
test_df["Text_proceed"] = test_df["Text"].apply(preprocess)

In [14]:
train_df["Text_proceed"]

4201    ove gnw !! 9 13 call making feel better not du...
387     csn option trader buy 1 500 jan 11 16 call spr...
4385    people slag aap cannibalization samsung ha 80 ...
5773    sensex open 166 point lower 35 469 nifty start...
2348    csod conf call ceo feeling good position good ...
                              ...                        
3772    aap break close support 360 target im calling ...
5191    new industry data provide first hard look many...
5226    rt jchengwsj hindsight wall street probably no...
5390    global stock fall president trump issue new wa...
860     hpq many upgrade tomorrow assuming autonomy wo...
Name: Text_proceed, Length: 4632, dtype: object

In [15]:
COLS_TO_FIT = "Text"
TARGET_COL = "Sentiment"

In [16]:
class Word2VecModel(BaseEstimator, TransformerMixin):
    
    def __init__(self, model):
        self.model = model
        
    def get_mean_vector(self, text):
        v = np.zeros(300)
        c = 0
        for word in text.split(" "):
            if word in self.model:
                v += self.model.get_vector(word)
                c += 1
        c = max(43, c)
        return v / c
        
    def fit(self, X, y):
        return self
    
    def transform(self, X):
        return np.array([self.get_mean_vector(x) for x in X])

In [17]:
test_df.head()

Unnamed: 0,Text,Sentiment,Text_proceed
1891,MCP take-over chatter... (I know don't laugh...),-1,mcp take chatter know not laugh
1550,"AMZN 1,200 lot bid in the Feb weekly 255P. 27 ...",-1,amzn 1 200 lot bid feb weekly 255p 27 delta co...
1049,KO made with sugar is sold at local COST. Cons...,1,ko made sugar sold local cost consumer prefer ...
2523,HNZ Another American Institution sold to forei...,-1,hnz another american institution sold foreign ...
156,Will watch the close carefully then decide whe...,-1,watch close carefully decide whether ong shot ...


In [18]:
def train_and_validate(model, train_df, test_df):
    model.fit(train_df[COLS_TO_FIT], train_df[TARGET_COL])
    znachpred = model.predict(test_df[COLS_TO_FIT])
    accuracy = accuracy_score(test_df[TARGET_COL], znachpred)
    print(f"accuracy: {accuracy_score(test_df[TARGET_COL], znachpred):.3f}")
    return accuracy

In [19]:
model = api.load("word2vec-google-news-300")

In [60]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 3))),
    ("model", LogisticRegression(C=5.27, penalty="l1", solver="saga")),
])

In [61]:
train_and_validate(pipe, train_df, test_df)

accuracy: 0.811




0.811044003451251

In [40]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 2))),
    ("model", LogisticRegression(C=3.72, penalty="elasticnet", solver="saga", l1_ratio=1)),
])

In [41]:
train_and_validate(pipe, train_df, test_df)

accuracy: 0.808




0.808455565142364

In [38]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(token_pattern=r"\S+", ngram_range=(1, 1), min_df = 1, max_df=0.1)),
    ("model", LogisticRegression(C=5, penalty="l2", solver="saga")),
])

In [39]:
train_and_validate(pipe, train_df, test_df)

accuracy: 0.784




0.7842968075927523

In [34]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 2))),
    ("model", LogisticRegression(C=3.74, penalty="l1", solver="saga")),
])

In [35]:
train_and_validate(pipe, train_df, test_df)

accuracy: 0.808




0.808455565142364